def filterSeq(seq_file, filter_func, filter_args={}, out_args=default_out_args, 
              nproc=None, queue_size=None):
    """
    Filters sequences using the provided filter function
    
    Arguments: 
    seq_file = the sequence file to filter
    filter_func = the function to use for filtering sequences
    filter_args = a dictionary of arguments to pass to filter_func
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                 
    Returns:
    a list of successful output file names
    """
    # Define output file label dictionary
    cmd_dict = {filterLength:'length', filterMissing:'missing', 
                filterRepeats:'repeats', filterQuality:'quality', 
                maskQuality:'maskqual', trimQuality:'trimqual'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'FilterSeq'
    log['COMMAND'] = cmd_dict.get(filter_func, filter_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(filter_args):  log[k.upper()] = filter_args[k]
    log['NPROC'] = nproc
    printLog(log)
    
    # Check input type
    in_type = getFileType(seq_file)
    if in_type != 'fastq' and filter_func in (filterQuality, maskQuality, trimQuality):
        sys.exit('ERROR:  Input file must be FASTQ for %s mode' % cmd_dict[filter_func])
    
    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processSeqQueue
    work_args = {'process_func': filter_func, 
                 'process_args': filter_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file,
                    'task_label': cmd_dict[filter_func],
                    'out_args': out_args}
    
    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    result['log']['END'] = 'FilterSeq'
    printLog(result['log'])
        
    return result['out_files']
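A minimal usage sketch (the input path, threshold, and filter choice are illustrative; the call assumes filterSeq and filterMissing are in scope from this module):

# Hypothetical invocation: drop reads with more than 10 ambiguous
# characters, using 4 worker processes.
pass_files = filterSeq('reads.fastq',
                       filter_func=filterMissing,
                       filter_args={'max_missing': 10},  # keyword assumed from filterMissing
                       nproc=4)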
def clusterSets(seq_file, barcode_field=default_barcode_field,
                cluster_field=default_cluster_field,
                ident=default_ident, seq_start=None, seq_end=None,
                usearch_exec=default_usearch_exec,
                out_args=default_out_args, nproc=None,
                queue_size=None):
    """
    Performs clustering on sets of sequences

    Arguments:
    seq_file = the sample sequence file name
    barcode_field = the annotation containing set IDs
    cluster_field = the name of the output cluster field
    ident = the identity threshold for clustering sequences
    seq_start = the start position to trim sequences at before clustering
    seq_end = the end position to trim sequences at before clustering
    usearch_exec = the path to the usearch executable
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc

    Returns:
    a list of successful output file names
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ClusterSets'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    log['CLUSTER_FIELD'] = cluster_field
    log['IDENTITY'] = ident
    log['SEQUENCE_START'] = seq_start
    log['SEQUENCE_END'] = seq_end
    log['NPROC'] = nproc
    printLog(log)

    # Define cluster function parameters
    cluster_args = {'usearch_exec':usearch_exec,
                    'ident':ident,
                    'seq_start':seq_start,
                    'seq_end':seq_end}

    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file,
                 'index_func': indexSeqSets,
                 'index_args': index_args}
    # Define worker function and arguments
    work_func = processCSQueue
    work_args = {'cluster_field': cluster_field,
                 'cluster_args': cluster_args,
                 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file,
                    'task_label': 'cluster',
                    'out_args': out_args,
                    'index_field': barcode_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():  log[k] = v
    log['END'] = 'ClusterSets'
    printLog(log)

    return result['out_files']
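A usage sketch for this version (file name and threshold are illustrative; BARCODE and CLUSTER are the conventional default field names):

# Hypothetical invocation: cluster the sequences within each BARCODE set
# at 90% identity and record assignments in the CLUSTER annotation.
out_files = clusterSets('barcoded.fastq',
                        barcode_field='BARCODE',
                        cluster_field='CLUSTER',
                        ident=0.90,
                        nproc=2)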
Example 3
def filterSeq(seq_file,
              filter_func,
              filter_args={},
              out_file=None,
              out_args=default_out_args,
              nproc=None,
              queue_size=None):
    """
    Filters sequences using the provided filter function
    
    Arguments: 
      seq_file : the sequence file to filter.
      filter_func : the function to use for filtering sequences.
      filter_args : a dictionary of arguments to pass to filter_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.
                 
    Returns:
      list: a list of successful output file names
    """
    # Define output file label dictionary
    cmd_dict = {
        filterLength: 'length',
        filterMissing: 'missing',
        filterRepeats: 'repeats',
        filterQuality: 'quality',
        maskQuality: 'maskqual',
        trimQuality: 'trimqual'
    }

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'FilterSeq'
    log['COMMAND'] = cmd_dict.get(filter_func, filter_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(filter_args):
        log[k.upper()] = filter_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Check input type
    in_type = getFileType(seq_file)
    if in_type != 'fastq' and filter_func in (filterQuality, maskQuality,
                                              trimQuality):
        printError('Input file must be FASTQ for %s mode.' %
                   cmd_dict[filter_func])

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processSeqQueue
    work_args = {'process_func': filter_func, 'process_args': filter_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {
        'seq_file': seq_file,
        'label': cmd_dict[filter_func],
        'out_file': out_file,
        'out_args': out_args
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'FilterSeq'
    printLog(result['log'])

    return result['out_files']
Example 4
def alignRecords(db_file,
                 seq_fields,
                 group_func,
                 align_func,
                 group_args={},
                 align_args={},
                 out_args=default_out_args,
                 nproc=None,
                 queue_size=None):
    """
    Performs a multiple alignment on sets of sequences

    Arguments: 
      db_file : filename of the input database.
      seq_fields : the sequence fields to multiple align.
      group_func : function to use to group records.
      align_func : function to use to multiple align sequence groups.
      group_args : dictionary of arguments to pass to group_func.
      align_args : dictionary of arguments to pass to align_func.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes.
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue.
                   if None defaults to 2*nproc.
                      
    Returns: 
      tuple : a tuple of (align-pass, align-fail) filenames.
    """
    # Define subcommand label dictionary
    cmd_dict = {
        alignAcross: 'across',
        alignWithin: 'within',
        alignBlocks: 'block'
    }

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AlignRecords'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELDS'] = ','.join(seq_fields)
    if 'group_fields' in group_args:
        log['GROUP_FIELDS'] = ','.join(group_args['group_fields'])
    if 'mode' in group_args: log['MODE'] = group_args['mode']
    if 'action' in group_args: log['ACTION'] = group_args['action']
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    feed_func = feedDbQueue
    feed_args = {
        'db_file': db_file,
        'group_func': group_func,
        'group_args': group_args
    }
    # Define worker function and arguments
    align_args['seq_fields'] = seq_fields
    work_func = processDbQueue
    work_args = {'process_func': align_func, 'process_args': align_args}
    # Define collector function and arguments
    collect_func = collectDbQueue
    collect_args = {
        'db_file': db_file,
        'task_label': 'align',
        'out_args': out_args,
        'add_fields': ['%s_ALIGN' % f for f in seq_fields]
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'AlignRecords'
    printLog(result['log'])

    return result['out_files']
Example 5
def alignSets(seq_file, align_func, align_args, barcode_field=default_barcode_field,
              calc_div=False, out_file=None, out_args=default_out_args,
              nproc=None, queue_size=None):
    """
    Performs a multiple alignment on sets of sequences

    Arguments:
      seq_file : the sample sequence file name.
      align_func : the function to use to align sequence sets.
      align_args : a dictionary of arguments to pass to align_func.
      barcode_field : the annotation containing set IDs.
      calc_div : if True calculate average pairwise error for each sequence set.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.

    Returns: 
      tuple: a tuple of (passing, failing) filenames.
    """
    # Define subcommand label dictionary
    cmd_dict = {runMuscle:'muscle', offsetSeqSet:'offset'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AlignSets'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    if 'mode' in align_args: log['MODE'] = align_args['mode']
    log['BARCODE_FIELD'] = barcode_field
    if 'field' in align_args: log['OFFSET_FIELD'] = align_args['field']
    log['CALC_DIV'] = calc_div
    log['NPROC'] = nproc
    printLog(log)
 
    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file,
                 'index_func': indexSeqSets, 
                 'index_args': index_args}
    # Define worker function and arguments
    work_func = processQueue
    work_args = {'align_func': align_func, 
                 'align_args': align_args,
                 'calc_div': calc_div,
                 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file,
                    'label': 'align',
                    'out_file': out_file,
                    'out_args': out_args,
                    'index_field': barcode_field}
    
    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    result['log']['END'] = 'AlignSets'
    printLog(result['log'])
        
    return result['out_files']
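A usage sketch (the path is illustrative; whether align_args needs entries, e.g. a MUSCLE executable path for runMuscle, depends on the chosen align_func):

# Hypothetical invocation: align each BARCODE set with MUSCLE and
# compute the average pairwise error of each set.
out_files = alignSets('barcoded.fastq',
                      align_func=runMuscle,
                      align_args={},  # runMuscle-specific options would go here
                      barcode_field='BARCODE',
                      calc_div=True)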
Example 6
def buildConsensus(seq_file,
                   barcode_field=default_barcode_field,
                   min_count=default_consensus_min_count,
                   min_freq=default_consensus_min_freq,
                   min_qual=default_consensus_min_qual,
                   primer_field=None,
                   primer_freq=None,
                   max_gap=None,
                   max_error=None,
                   max_diversity=None,
                   copy_fields=None,
                   copy_actions=None,
                   dependent=False,
                   out_file=None,
                   out_args=default_out_args,
                   nproc=None,
                   queue_size=None):
    """
    Generates consensus sequences

    Arguments: 
      seq_file : the sample sequence file name
      barcode_field : the annotation field containing set IDs
      min_count : threshold number of sequences to define a consensus
      min_freq : the frequency cutoff to assign a base
      min_qual : the quality cutoff to assign a base
      primer_field : the annotation field containing primer tags;
                     if None do not annotate with primer tags
      primer_freq : the minimum primer tag frequency that must be met to build a consensus;
                    if None do not filter by primer frequency
      max_gap : the maximum frequency of (., -) characters allowed before
                deleting a position; if None do not delete positions
      max_error : a threshold defining the maximum allowed error rate to retain a read group;
                  if None do not calculate error rate
      max_diversity : a threshold defining the average pairwise error rate required to retain a read group;
                      if None do not calculate diversity
      dependent : if False treat barcode group sequences as independent data
      copy_fields : a list of annotations to copy into consensus sequence annotations;
                    if None no additional annotations will be copied
      copy_actions : the list of actions to take for each copy_fields;
                     one of ['set', 'majority', 'min', 'max', 'sum']
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
            if None defaults to the number of CPUs
      queue_size : maximum size of the argument queue;
                 if None defaults to 2*nproc
                    
    Returns: 
      list : a list of successful output file names.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'BuildConsensus'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    log['MIN_COUNT'] = min_count
    log['MIN_FREQUENCY'] = min_freq
    log['MIN_QUALITY'] = min_qual
    log['MAX_GAP'] = max_gap
    log['PRIMER_FIELD'] = primer_field
    log['PRIMER_FREQUENCY'] = primer_freq
    log['MAX_ERROR'] = max_error
    log['MAX_DIVERSITY'] = max_diversity
    log['DEPENDENT'] = dependent
    log['COPY_FIELDS'] = ','.join(
        copy_fields) if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join(
        copy_actions) if copy_actions is not None else None
    log['NPROC'] = nproc
    printLog(log)

    # Set consensus building function
    in_type = getFileType(seq_file)
    if in_type == 'fastq':
        cons_func = qualityConsensus
        cons_args = {
            'min_qual': min_qual,
            'min_freq': min_freq,
            'dependent': dependent
        }
    elif in_type == 'fasta':
        cons_func = frequencyConsensus
        cons_args = {'min_freq': min_freq}
    else:
        printError('Input file must be FASTA or FASTQ.')

    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {
        'seq_file': seq_file,
        'index_func': indexSeqSets,
        'index_args': index_args
    }
    # Define worker function and arguments
    work_func = processQueue
    work_args = {
        'cons_func': cons_func,
        'cons_args': cons_args,
        'min_count': min_count,
        'primer_field': primer_field,
        'primer_freq': primer_freq,
        'max_gap': max_gap,
        'max_error': max_error,
        'max_diversity': max_diversity,
        'copy_fields': copy_fields,
        'copy_actions': copy_actions,
        'delimiter': out_args['delimiter']
    }
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {
        'seq_file': seq_file,
        'label': 'consensus',
        'out_file': out_file,
        'out_args': out_args,
        'index_field': barcode_field
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'BuildConsensus'
    printLog(result['log'])

    return result['out_files']
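A usage sketch (file name and thresholds are illustrative):

# Hypothetical invocation: build one consensus per BARCODE set, requiring
# at least two reads per set and discarding sets above a 10% error rate.
out_files = buildConsensus('aligned.fastq',
                           barcode_field='BARCODE',
                           min_count=2,
                           max_error=0.1,
                           nproc=4)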
Example 7
def defineClones(db_file,
                 feed_func,
                 work_func,
                 collect_func,
                 clone_func,
                 cluster_func=None,
                 group_func=None,
                 group_args={},
                 clone_args={},
                 cluster_args={},
                 max_missing=default_max_missing,
                 out_args=default_out_args,
                 nproc=None,
                 queue_size=None):
    """
    Define clonally related sequences
    
    Arguments:
    db_file = filename of input database
    feed_func = the function that feeds the queue
    work_func = the worker function that will run on each CPU
    collect_func = the function that collects results from the workers
    clone_func = the function to use for determining clones within preclonal groups
    cluster_func = the optional clustering function passed to the collector;
                   if None no clustering is performed
    group_func = the function to use for assigning preclones
    group_args = a dictionary of arguments to pass to group_func
    clone_args = a dictionary of arguments to pass to clone_func
    cluster_args = a dictionary of arguments to pass to cluster_func
    max_missing = maximum number of non-ACGT characters to allow in the junction sequence
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
    
    Returns:
    a list of successful output file names
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['DB_FILE'] = os.path.basename(db_file)
    log['MAX_MISSING'] = max_missing
    if group_func is not None:
        log['GROUP_FUNC'] = group_func.__name__
        log['GROUP_ARGS'] = group_args
    log['CLONE_FUNC'] = clone_func.__name__

    # TODO:  this is yucky, but can be fixed by using a model class
    clone_log = clone_args.copy()
    if 'dist_mat' in clone_log: del clone_log['dist_mat']
    log['CLONE_ARGS'] = clone_log

    if cluster_func is not None:
        log['CLUSTER_FUNC'] = cluster_func.__name__
        log['CLUSTER_ARGS'] = cluster_args
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    feed_args = {
        'db_file': db_file,
        'group_func': group_func,
        'group_args': group_args
    }
    # Define worker function and arguments
    work_args = {
        'max_missing': max_missing,
        'clone_func': clone_func,
        'clone_args': clone_args
    }
    # Define collector function and arguments
    collect_args = {
        'db_file': db_file,
        'out_args': out_args,
        'cluster_func': cluster_func,
        'cluster_args': cluster_args
    }

    # Call process manager
    result = manageProcesses(feed_func=feed_func,
                             work_func=work_func,
                             collect_func=collect_func,
                             feed_args=feed_args,
                             work_args=work_args,
                             collect_args=collect_args,
                             nproc=nproc,
                             queue_size=queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])

    return result['out_files']
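Because this version takes the queue helpers as parameters, the caller wires them explicitly. A hedged sketch, assuming the feedDbQueue/processDbQueue/collectDbQueue helpers used elsewhere in these examples and a distance-based clone_func named distanceClones, which is not defined here:

# Hypothetical wiring; distanceClones and the 0.16 threshold are
# illustrative assumptions, not definitions from this module.
out_files = defineClones('db.tab',
                         feed_func=feedDbQueue,
                         work_func=processDbQueue,
                         collect_func=collectDbQueue,
                         clone_func=distanceClones,
                         clone_args={'distance': 0.16},
                         nproc=4)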
def alignRecords(db_file,
                 seq_fields,
                 group_func,
                 align_func,
                 group_args={},
                 align_args={},
                 format='changeo',
                 out_file=None,
                 out_args=default_out_args,
                 nproc=None,
                 queue_size=None):
    """
    Performs a multiple alignment on sets of sequences

    Arguments: 
      db_file : filename of the input database.
      seq_fields : the sequence fields to multiple align.
      group_func : function to use to group records.
      align_func : function to use to multiple align sequence groups.
      group_args : dictionary of arguments to pass to group_func.
      align_args : dictionary of arguments to pass to align_func.
      format : output format. One of 'changeo' or 'airr'.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes.
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue.
                   if None defaults to 2*nproc.
                      
    Returns: 
      dict : names of the 'pass' and 'fail' output files.
    """
    # Define subcommand label dictionary
    cmd_dict = {
        alignAcross: 'across',
        alignWithin: 'within',
        alignBlocks: 'block'
    }

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AlignRecords'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELDS'] = ','.join(seq_fields)
    if 'group_fields' in group_args:
        log['GROUP_FIELDS'] = ','.join(group_args['group_fields'])
    if 'mode' in group_args: log['MODE'] = group_args['mode']
    if 'action' in group_args: log['ACTION'] = group_args['action']
    log['NPROC'] = nproc
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Define feeder function and arguments
    if 'group_fields' in group_args and group_args['group_fields'] is not None:
        group_args['group_fields'] = [
            schema.toReceptor(f) for f in group_args['group_fields']
        ]
    feed_func = feedDbQueue
    feed_args = {
        'db_file': db_file,
        'reader': reader,
        'group_func': group_func,
        'group_args': group_args
    }
    # Define worker function and arguments
    field_map = OrderedDict([(schema.toReceptor(f), '%s_align' % f)
                             for f in seq_fields])
    align_args['field_map'] = field_map
    work_func = processDbQueue
    work_args = {'process_func': align_func, 'process_args': align_args}
    # Define collector function and arguments
    out_fields = getDbFields(db_file,
                             add=list(field_map.values()),
                             reader=reader)
    out_args['out_type'] = schema.out_type
    collect_func = collectDbQueue
    collect_args = {
        'db_file': db_file,
        'label': 'align',
        'fields': out_fields,
        'writer': writer,
        'out_file': out_file,
        'out_args': out_args
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'AlignRecords'
    printLog(result['log'])
    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}

    return output
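A usage sketch for this format-aware version (the database path and field names are illustrative; the group_args keys mirror the 'group_fields'/'mode'/'action' checks above, and groupRecords is an assumed grouping function, not defined here):

# Hypothetical invocation: align sequence_alignment across records
# grouped by v_call, writing AIRR-format output.
out = alignRecords('db.tsv',
                   seq_fields=['sequence_alignment'],
                   group_func=groupRecords,  # assumed name
                   align_func=alignAcross,
                   group_args={'group_fields': ['v_call'],
                               'mode': 'gene',
                               'action': 'first'},
                   format='airr',
                   nproc=2)
# out maps 'pass' and 'fail' to the corresponding output file names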
def assemblePairs(head_file, tail_file, assemble_func, assemble_args={}, 
                  coord_type=default_coord_type, rc=None, 
                  head_fields=None, tail_fields=None,  
                  out_args=default_out_args, nproc=None, queue_size=None):
    """
    Assembles paired-end reads into a single sequence

    Arguments: 
    head_file = the head sequence file name
    tail_file = the tail sequence file name
    assemble_func = the function to use to assemble paired ends
    assemble_args = a dictionary of arguments to pass to the assembly function
    coord_type = the sequence header format
    rc = Defines which sequences ('head','tail','both') to reverse complement before assembly;
         if None do not reverse complement sequences
    head_fields = list of annotations in head_file records to copy to assembled record;
                  if None do not copy an annotation
    tail_fields = list of annotations in tail_file records to copy to assembled record;
                  if None do not copy an annotation
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                 
    Returns: 
    a list of successful output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {alignAssembly:'align', joinSeqPair:'join', referenceAssembly:'reference'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AssemblePairs'
    log['COMMAND'] = cmd_dict.get(assemble_func, assemble_func.__name__)
    log['FILE1'] = os.path.basename(head_file) 
    log['FILE2'] = os.path.basename(tail_file)
    log['COORD_TYPE'] = coord_type
    if 'ref_file' in assemble_args:  log['REFFILE'] = assemble_args['ref_file']
    if 'alpha' in assemble_args:  log['ALPHA'] = assemble_args['alpha']
    if 'max_error' in assemble_args:  log['MAX_ERROR'] = assemble_args['max_error']
    if 'min_len' in assemble_args:  log['MIN_LEN'] = assemble_args['min_len']
    if 'max_len' in assemble_args:  log['MAX_LEN'] = assemble_args['max_len']
    if 'scan_reverse' in assemble_args:  log['SCAN_REVERSE'] = assemble_args['scan_reverse']
    if 'gap' in assemble_args:  log['GAP'] = assemble_args['gap']
    if 'min_ident' in assemble_args:  log['MIN_IDENT'] = assemble_args['min_ident']
    if 'evalue' in assemble_args:  log['EVALUE'] = assemble_args['evalue']
    if 'max_hits' in assemble_args:  log['MAX_HITS'] = assemble_args['max_hits']
    if 'fill' in assemble_args:  log['FILL'] = assemble_args['fill']
    log['NPROC'] = nproc
    printLog(log)

    # Count input files
    head_count = countSeqFile(head_file)
    tail_count = countSeqFile(tail_file)
    if head_count != tail_count:
        sys.exit('Error: FILE1 (n=%i) and FILE2 (n=%i) must have the same number of records' \
                 % (head_count, tail_count))

    # Define feeder function and arguments
    feed_func = feedPairQueue
    feed_args = {'seq_file_1': head_file,
                 'seq_file_2': tail_file,
                 'coord_type': coord_type,
                 'delimiter': out_args['delimiter']}
    # Define worker function and arguments
    process_args = {'assemble_func': assemble_func,
                    'assemble_args': assemble_args,
                    'rc': rc,
                    'fields_1': head_fields,
                    'fields_2': tail_fields,
                    'delimiter': out_args['delimiter']}
    work_func = processSeqQueue
    work_args = {'process_func': processAssembly,
                 'process_args': process_args}
    # Define collector function and arguments
    collect_func = collectPairQueue
    collect_args = {'result_count': head_count,
                    'seq_file_1': head_file,
                    'seq_file_2': tail_file,
                    'out_args': out_args}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():  log[k] = v
    log['END'] = 'AssemblePairs'
    printLog(log)
    
    return result['out_files']
Example 10
def unifyHeaders(seq_file,
                 collapse_func,
                 set_field=default_barcode_field,
                 unify_field=default_unify_field,
                 out_file=None,
                 out_args=default_out_args,
                 nproc=None,
                 queue_size=None):
    """
    Merges and filters annotation fields within groups

    Arguments:
      seq_file : the sample sequence file name.
      collapse_func : the function to use for collapsing annotations.
      set_field : the annotation containing set IDs.
      unify_field : the field for collection criteria.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.

    Returns:
      list: a list of output file names.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'UnifyHeaders'
    log['FILE'] = os.path.basename(seq_file)
    log['SET_FIELD'] = set_field
    log['UNIFY_FIELD'] = unify_field
    log['NPROC'] = nproc
    printLog(log)

    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {
        'seq_file': seq_file,
        'index_func': indexSeqSets,
        'index_args': index_args
    }
    # Define worker function and arguments
    collapse_args = {'field': unify_field, 'delimiter': out_args['delimiter']}
    work_func = processSeqQueue
    work_args = {'process_func': collapse_func, 'process_args': collapse_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {
        'seq_file': seq_file,
        'label': 'unify',
        'out_file': out_file,
        'out_args': out_args,
        'index_field': set_field
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():
        log[k] = v
    log['END'] = 'UnifyHeaders'
    printLog(log)

    return result['out_files']
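A usage sketch (path and field names are illustrative; consensusUnify stands in for one of the module's collapse functions and is an assumption here):

# Hypothetical invocation: within each BARCODE set, unify the SAMPLE
# annotation using the supplied collapse function.
out_files = unifyHeaders('barcoded.fastq',
                         collapse_func=consensusUnify,  # assumed name
                         set_field='BARCODE',
                         unify_field='SAMPLE')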
def estimateError(seq_file, cons_func=frequencyConsensus, cons_args={}, 
                  set_field=default_barcode_field, min_count=default_min_count, max_diversity=None, 
                  out_args=default_out_args, nproc=None, queue_size=None):
    """
    Calculates error rates of sequence sets

    Arguments: 
    seq_file = the sample sequence file name
    cons_func = the function to use for consensus generation 
    cons_args = a dictionary of arguments for the consensus function
    set_field = the annotation field containing set IDs
    min_count = threshold number of sequences to consider a set
    max_diversity = a threshold defining the average pairwise error rate required to retain a read group;
                    if None do not calculate diversity
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                    
    Returns: 
    a list of (position error, quality error, nucleotide pairwise error) output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {frequencyConsensus:'freq', qualityConsensus:'qual'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'EstimateError'
    log['FILE'] = os.path.basename(seq_file)
    log['MODE'] = cmd_dict.get(cons_func, cons_func.__name__)
    log['SET_FIELD'] = set_field
    log['MIN_COUNT'] = min_count
    log['MAX_DIVERSITY'] = max_diversity
    log['NPROC'] = nproc
    printLog(log)
    
    # Check input file type
    in_type = getFileType(seq_file)
    if in_type != 'fastq':  sys.exit('ERROR:  Input file must be FASTQ')
    
    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file,
                 'index_func': indexSeqSets, 
                 'index_args': index_args}
    # Define worker function and arguments
    work_func = processEEQueue
    work_args = {'cons_func': cons_func, 
                 'cons_args': cons_args,
                 'min_count': min_count,
                 'max_diversity': max_diversity}
    # Define collector function and arguments
    collect_func = collectEEQueue
    collect_args = {'seq_file': seq_file,
                    'out_args': out_args,
                    'set_field': set_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    result['log']['END'] = 'EstimateError'
    printLog(result['log'])
        
    return result['out_files']
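A usage sketch (path and threshold are illustrative; the input must be FASTQ, per the check above):

# Hypothetical invocation: estimate error rates from BARCODE sets with
# at least 10 reads, using the default frequency consensus.
out_files = estimateError('barcoded.fastq',
                          set_field='BARCODE',
                          min_count=10,
                          nproc=4)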
Example 12
def assemblePairs(head_file,
                  tail_file,
                  assemble_func,
                  assemble_args={},
                  coord_type=default_coord,
                  rc='tail',
                  head_fields=None,
                  tail_fields=None,
                  out_file=None,
                  out_args=default_out_args,
                  nproc=None,
                  queue_size=None):
    """
    Assembles paired-end reads into a single sequence

    Arguments: 
      head_file : the head sequence file name
      tail_file : the tail sequence file name
      assemble_func : the function to use to assemble paired ends
      assemble_args : a dictionary of arguments to pass to the assembly function
      coord_type : the sequence header format
      rc : Defines which sequences ('head', 'tail', 'both', 'none') to reverse complement before assembly;
           if 'none' do not reverse complement sequences
      head_fields : list of annotations in head_file records to copy to assembled record;
                    if None do not copy an annotation
      tail_fields : list of annotations in tail_file records to copy to assembled record;
                    if None do not copy an annotation
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.
                 
    Returns: 
      list: a list of successful output file names.
    """
    # Define subcommand label dictionary
    cmd_dict = {
        alignAssembly: 'align',
        joinAssembly: 'join',
        referenceAssembly: 'reference',
        sequentialAssembly: 'sequential'
    }
    cmd_name = cmd_dict.get(assemble_func, assemble_func.__name__)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AssemblePairs'
    log['COMMAND'] = cmd_name
    log['FILE1'] = os.path.basename(head_file)
    log['FILE2'] = os.path.basename(tail_file)
    log['COORD_TYPE'] = coord_type
    if 'ref_file' in assemble_args: log['REFFILE'] = assemble_args['ref_file']
    if 'alpha' in assemble_args: log['ALPHA'] = assemble_args['alpha']
    if 'max_error' in assemble_args:
        log['MAX_ERROR'] = assemble_args['max_error']
    if 'min_len' in assemble_args: log['MIN_LEN'] = assemble_args['min_len']
    if 'max_len' in assemble_args: log['MAX_LEN'] = assemble_args['max_len']
    if 'scan_reverse' in assemble_args:
        log['SCAN_REVERSE'] = assemble_args['scan_reverse']
    if 'gap' in assemble_args: log['GAP'] = assemble_args['gap']
    if 'min_ident' in assemble_args:
        log['MIN_IDENT'] = assemble_args['min_ident']
    if 'evalue' in assemble_args: log['EVALUE'] = assemble_args['evalue']
    if 'max_hits' in assemble_args: log['MAX_HITS'] = assemble_args['max_hits']
    if 'fill' in assemble_args: log['FILL'] = assemble_args['fill']
    if 'aligner' in assemble_args: log['ALIGNER'] = assemble_args['aligner']
    log['NPROC'] = nproc
    printLog(log)

    # Count input files
    head_count = countSeqFile(head_file)
    tail_count = countSeqFile(tail_file)
    if head_count != tail_count:
        printError('FILE1 (n=%i) and FILE2 (n=%i) must have the same number of records.' \
                 % (head_count, tail_count))

    # Setup for reference alignment
    if cmd_name in ('reference', 'sequential'):
        ref_file = assemble_args.pop('ref_file')
        db_exec = assemble_args.pop('db_exec')

        # Build reference sequence dictionary
        assemble_args['ref_dict'] = readReferenceFile(ref_file)

        # Build reference database files
        try:
            db_func = {
                'blastn': makeBlastnDb,
                'usearch': makeUBlastDb
            }[assemble_args['aligner']]
            ref_db, db_handle = db_func(ref_file, db_exec)
            assemble_args['ref_db'] = ref_db
        except:
            printError('Error building reference database for aligner %s with executable %s.' \
                       % (assemble_args['aligner'], db_exec))

    # Define feeder function and arguments
    feed_func = feedPairQueue
    feed_args = {
        'seq_file_1': head_file,
        'seq_file_2': tail_file,
        'coord_type': coord_type,
        'delimiter': out_args['delimiter']
    }
    # Define worker function and arguments
    process_args = {
        'assemble_func': assemble_func,
        'assemble_args': assemble_args,
        'rc': rc,
        'fields_1': head_fields,
        'fields_2': tail_fields,
        'delimiter': out_args['delimiter']
    }
    work_func = processSeqQueue
    work_args = {'process_func': assemblyWorker, 'process_args': process_args}
    # Define collector function and arguments
    collect_func = collectPairQueue
    collect_args = {
        'seq_file_1': head_file,
        'seq_file_2': tail_file,
        'label': 'assemble',
        'out_file': out_file,
        'out_args': out_args
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Close reference database handle
    if cmd_name in ('reference', 'sequential'):
        try:
            db_handle.close()
        except AttributeError:
            db_handle.cleanup()
        except:
            printError('Cannot close reference database file.')

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():
        log[k] = v
    log['END'] = 'AssemblePairs'
    printLog(log)

    return result['out_files']
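A usage sketch for the align subcommand (file names and parameter values are illustrative; the alpha and max_error keys match the log fields handled above):

# Hypothetical invocation: assemble Illumina read pairs, reverse
# complementing the tail reads before alignment.
out_files = assemblePairs('reads_R1.fastq', 'reads_R2.fastq',
                          assemble_func=alignAssembly,
                          assemble_args={'alpha': 1e-5, 'max_error': 0.3},
                          coord_type='illumina',
                          rc='tail',
                          nproc=4)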
def maskPrimers(seq_file, primer_file, mode, align_func, align_args={}, 
                max_error=default_max_error, barcode=False,
                out_args=default_out_args, nproc=None, queue_size=None):
    """
    Masks or cuts primers from sample sequences using local alignment

    Arguments: 
    seq_file = name of file containing sample sequences
    primer_file = name of the file containing primer sequences
    mode = defines the action taken; one of 'cut', 'mask', 'tag'
    align_func = the function to call for alignment
    align_args = a dictionary of arguments to pass to align_func
    max_error = maximum acceptable error rate for a valid alignment
    barcode = if True add sequence preceding primer to description
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                 
    Returns:
    a list of successful output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {alignPrimers:'align', scorePrimers:'score'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MaskPrimers'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['PRIMER_FILE'] = os.path.basename(primer_file)
    log['MODE'] = mode
    log['BARCODE'] = barcode
    log['MAX_ERROR'] = max_error
    if 'start' in align_args: log['START_POS'] = align_args['start']
    if 'max_len' in align_args: log['MAX_LEN'] = align_args['max_len']
    if 'rev_primer' in align_args: log['REV_PRIMER'] = align_args['rev_primer']
    if 'skip_rc' in align_args: log['SKIP_RC'] = align_args['skip_rc']
    if 'gap_penalty' in align_args:
        log['GAP_PENALTY'] = ', '.join([str(x) for x in align_args['gap_penalty']])
    log['NPROC'] = nproc
    printLog(log)

    # Create dictionary of primer sequences to pass to maskPrimers
    primers = readPrimerFile(primer_file)
    if 'rev_primer' in align_args and align_args['rev_primer']:
        primers = {k: reverseComplement(v) for k, v in primers.items()}

    # Define alignment arguments and compile primers for align mode
    align_args['primers'] = primers 
    align_args['score_dict'] = getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 0))
    if align_func is alignPrimers:
        align_args['max_error'] = max_error
        align_args['primers_regex'] = compilePrimers(primers)
    
    # Define sequence masking arguments
    mask_args = {'mode': mode, 
                 'barcode': barcode, 
                 'delimiter': out_args['delimiter']}

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processMPQueue
    work_args = {'align_func': align_func, 
                 'align_args': align_args,
                 'mask_args': mask_args,
                 'max_error': max_error}
    
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file,
                    'task_label': 'primers',
                    'out_args': out_args}
    
    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'MaskPrimers'
    printLog(result['log'])
        
    return result['out_files']
Example 14
def alignSets(seq_file, align_func, align_args, barcode_field=default_barcode_field,
              calc_div=False, out_args=default_out_args, nproc=None, queue_size=None):
    """
    Performs a multiple alignment on sets of sequences

    Arguments: 
    seq_file = the sample sequence file name
    align_func = the function to use to align sequence sets
    align_args = a dictionary of arguments to pass to align_func
    barcode_field = the annotation containing set IDs
    calc_div = if True calculate average pairwise error for each sequence set
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                      
    Returns: 
    a tuple of (valid_file, invalid_file) names
    """
    # Define subcommand label dictionary
    cmd_dict = {runMuscle:'align', offsetSeqSet:'offset'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AlignSets'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    if 'mode' in align_args: log['MODE'] = align_args['mode']
    log['BARCODE_FIELD'] = barcode_field
    if 'field' in align_args: log['OFFSET_FIELD'] = align_args['field']
    log['CALC_DIV'] = calc_div
    log['NPROC'] = nproc
    printLog(log)
 
    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file,
                 'index_func': indexSeqSets, 
                 'index_args': index_args}
    # Define worker function and arguments
    work_func = processASQueue
    work_args = {'align_func': align_func, 
                 'align_args': align_args,
                 'calc_div': calc_div,
                 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file,
                    'task_label': 'align',
                    'out_args': out_args,
                    'index_field': barcode_field}
    
    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    result['log']['END'] = 'AlignSets'
    printLog(result['log'])
        
    return result['out_files']
Example 15
def clusterSets(seq_file,
                ident=default_cluster_ident,
                length_ratio=default_length_ratio,
                seq_start=0,
                seq_end=None,
                set_field=default_barcode_field,
                cluster_field=default_cluster_field,
                cluster_prefix=default_cluster_prefix,
                cluster_tool=default_cluster_tool,
                cluster_exec=default_cluster_exec,
                out_file=None,
                out_args=default_out_args,
                nproc=None,
                queue_size=None):
    """
    Performs clustering on sets of sequences

    Arguments:
      seq_file : the sample sequence file name.
      ident : the identity threshold for clustering sequences.
      length_ratio : minimum short/long length ratio allowed within a cluster.
      seq_start : the start position to trim sequences at before clustering.
      seq_end : the end position to trim sequences at before clustering.
      set_field : the annotation containing set IDs.
      cluster_field : the name of the output cluster field.
      cluster_prefix : string defining a prefix for the cluster identifier.
      cluster_exec : the path to the clustering executable.
      cluster_tool : the clustering tool to use; one of cd-hit or usearch.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.

    Returns:
      list: a list of clustered output file names.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ClusterSets'
    log['COMMAND'] = 'set'
    log['FILE'] = os.path.basename(seq_file)
    log['IDENTITY'] = ident
    log['SEQUENCE_START'] = seq_start
    log['SEQUENCE_END'] = seq_end
    log['SET_FIELD'] = set_field
    log['CLUSTER_FIELD'] = cluster_field
    log['CLUSTER_PREFIX'] = cluster_prefix
    log['CLUSTER_TOOL'] = cluster_tool
    log['NPROC'] = nproc
    printLog(log)

    # Set cluster tool
    try:
        cluster_func = map_cluster_tool[cluster_tool]
    except KeyError:
        printError('Invalid clustering tool %s.' % cluster_tool)

    # Check the minimum identity
    if ident < min_cluster_ident[cluster_tool]:
        printError('Minimum identity %s too low for clustering tool %s.' %
                   (str(ident), cluster_tool))

    # Define cluster function parameters
    cluster_args = {
        'cluster_exec': cluster_exec,
        'ident': ident,
        'length_ratio': length_ratio,
        'seq_start': seq_start,
        'seq_end': seq_end
    }

    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {
        'seq_file': seq_file,
        'index_func': indexSeqSets,
        'index_args': index_args
    }
    # Define worker function and arguments
    work_func = processQueue
    work_args = {
        'cluster_func': cluster_func,
        'cluster_args': cluster_args,
        'cluster_field': cluster_field,
        'cluster_prefix': cluster_prefix,
        'delimiter': out_args['delimiter']
    }
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {
        'seq_file': seq_file,
        'label': 'cluster',
        'out_file': out_file,
        'out_args': out_args,
        'index_field': set_field
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():
        log[k] = v
    log['END'] = 'ClusterSets'
    printLog(log)

    return result['out_files']
def buildConsensus(seq_file, barcode_field=default_barcode_field, 
                   min_count=default_min_count, min_freq=default_min_freq,
                   min_qual=default_min_qual, primer_field=None, primer_freq=None,
                   max_gap=None, max_error=None, max_diversity=None,
                   copy_fields=None, copy_actions=None, dependent=False,
                   out_args=default_out_args, nproc=None, queue_size=None):
    """
    Generates consensus sequences

    Arguments: 
    seq_file = the sample sequence file name
    barcode_field = the annotation field containing set IDs
    min_count = threshold number of sequences to define a consensus 
    min_freq = the frequency cutoff to assign a base 
    min_qual = the quality cutoff to assign a base
    primer_field = the annotation field containing primer tags;
                   if None do not annotate with primer tags
    primer_freq = the minimum primer tag frequency that must be met to build a consensus;
                  if None do not filter by primer frequency
    max_gap = the maximum frequency of (., -) characters allowed before
               deleting a position; if None do not delete positions
    max_error = a threshold defining the maximum allowed error rate to retain a read group;
                if None do not calculate error rate
    max_diversity = a threshold defining the average pairwise error rate required to retain a read group;
                    if None do not calculate diversity
    dependent = if False treat barcode group sequences as independent data
    copy_fields = a list of annotations to copy into consensus sequence annotations;
                  if None no additional annotations will be copied
    copy_actions = the list of actions to take for each copy_fields;
                   one of ['set', 'majority', 'min', 'max', 'sum']
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                    
    Returns: 
    a list of successful output file names
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'BuildConsensus'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    log['MIN_COUNT'] = min_count
    log['MIN_FREQUENCY'] = min_freq
    log['MIN_QUALITY'] = min_qual
    log['MAX_GAP'] = max_gap
    log['PRIMER_FIELD'] = primer_field
    log['PRIMER_FREQUENCY'] = primer_freq
    log['MAX_ERROR'] = max_error
    log['MAX_DIVERSITY'] = max_diversity
    log['DEPENDENT'] = dependent
    log['COPY_FIELDS'] = ','.join(copy_fields) if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join(copy_actions) if copy_actions is not None else None
    log['NPROC'] = nproc
    printLog(log)
    
    # Set consensus building function
    in_type = getFileType(seq_file)
    if in_type == 'fastq':
        cons_func = qualityConsensus
        cons_args = {'min_qual': min_qual, 
                     'min_freq': min_freq,
                     'dependent': dependent}
    elif in_type == 'fasta':  
        cons_func = frequencyConsensus
        cons_args = {'min_freq': min_freq}
    else:
        sys.exit('ERROR:  Input file must be FASTA or FASTQ')
    
    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file,
                 'index_func': indexSeqSets, 
                 'index_args': index_args}
    # Define worker function and arguments
    work_func = processBCQueue
    work_args = {'cons_func': cons_func, 
                 'cons_args': cons_args,
                 'min_count': min_count,
                 'primer_field': primer_field,
                 'primer_freq': primer_freq,
                 'max_gap': max_gap,
                 'max_error': max_error,
                 'max_diversity': max_diversity,
                 'copy_fields': copy_fields,
                 'copy_actions': copy_actions,
                 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file,
                    'task_label': 'consensus',
                    'out_args': out_args,
                    'index_field': barcode_field}
    
    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    result['log']['END'] = 'BuildConsensus'
    printLog(result['log'])
        
    return result['out_files']
Example 17
def maskPrimers(seq_file,
                primer_file,
                align_func,
                align_args={},
                out_file=None,
                out_args=default_out_args,
                nproc=None,
                queue_size=None):
    """
    Masks or cuts primers from sample sequences using local alignment

    Arguments: 
      seq_file : name of file containing sample sequences.
      primer_file : name of the file containing primer sequences.
      align_func : the function to call for alignment.
      align_args : a dictionary of arguments to pass to align_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.
                 
    Returns:
      list: a list of successful output file names.
    """
    # Define subcommand label dictionary
    cmd_dict = {
        alignPrimers: 'align',
        scorePrimers: 'score',
        extractPrimers: 'extract'
    }

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MaskPrimers'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    if primer_file is not None:
        log['PRIMER_FILE'] = os.path.basename(primer_file)
    if 'mode' in align_args: log['MODE'] = align_args['mode']
    if 'max_error' in align_args: log['MAX_ERROR'] = align_args['max_error']
    if 'start' in align_args: log['START_POS'] = align_args['start']
    if 'length' in align_args: log['LENGTH'] = align_args['length']
    if 'max_len' in align_args: log['MAX_LEN'] = align_args['max_len']
    if 'rev_primer' in align_args: log['REV_PRIMER'] = align_args['rev_primer']
    if 'skip_rc' in align_args: log['SKIP_RC'] = align_args['skip_rc']
    if 'gap_penalty' in align_args:
        log['GAP_PENALTY'] = ', '.join(
            [str(x) for x in align_args['gap_penalty']])
    if 'barcode' in align_args:
        log['BARCODE'] = align_args['barcode']
        if align_args['barcode']:
            log['BARCODE_FIELD'] = align_args['barcode_field']
    if 'primer_field' in align_args:
        log['PRIMER_FIELD'] = align_args['primer_field']
    log['NPROC'] = nproc
    printLog(log)

    # Define alignment arguments and compile primers for align mode
    if primer_file is not None:
        primers = readPrimerFile(primer_file)
        if 'rev_primer' in align_args and align_args['rev_primer']:
            primers = {k: reverseComplement(v) for k, v in primers.items()}
        align_args['primers'] = primers
        align_args['score_dict'] = getDNAScoreDict(mask_score=(0, 1),
                                                   gap_score=(0, 0))
        # Compile regex primers only for the align subcommand; this must stay
        # inside the primer_file guard so that primers is always defined here
        if align_func is alignPrimers:
            align_args['primers_regex'] = compilePrimers(primers)
    align_args['delimiter'] = out_args['delimiter']

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processSeqQueue
    work_args = {'process_func': align_func, 'process_args': align_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {
        'seq_file': seq_file,
        'label': 'primers',
        'out_file': out_file,
        'out_args': out_args
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'MaskPrimers'
    printLog(result['log'])

    return result['out_files']
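
# A usage sketch for maskPrimers (file names and settings are assumptions):
# mask primers by scoring them at a fixed position with scorePrimers, one of
# the three alignment functions named in cmd_dict above.
def _example_mask_primers():
    out_files = maskPrimers('reads.fastq',                 # hypothetical input
                            primer_file='vprimers.fasta',  # hypothetical primer FASTA
                            align_func=scorePrimers,
                            align_args={'mode': 'mask',    # mask rather than cut
                                        'start': 0,        # primer expected at read start
                                        'max_error': 0.2}, # assumed error tolerance
                            nproc=4)
    return out_files
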
def estimateSets(seq_file,
                 cons_func=frequencyConsensus,
                 cons_args={},
                 set_field=default_barcode_field,
                 min_count=default_min_count,
                 max_diversity=None,
                 out_args=default_out_args,
                 nproc=None,
                 queue_size=None):
    """
    Calculates error rates of sequence sets

    Arguments: 
      seq_file : the sample sequence file name
      cons_func : the function to use for consensus generation 
      cons_args : a dictionary of arguments for the consensus function
      set_field : the annotation field containing set IDs
      min_count : threshold number of sequences to consider a set
      max_diversity : a threshold defining the average pairwise error rate required to retain a read group;
                      if None do not calculate diversity
      out_args : common output argument dictionary from parseCommonArgs
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc
                    
    Returns: 
      tuple : (position error, quality error, nucleotide pairwise error) output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {frequencyConsensus: 'freq', qualityConsensus: 'qual'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'EstimateError'
    log['FILE'] = os.path.basename(seq_file)
    log['MODE'] = cmd_dict.get(cons_func, cons_func.__name__)
    log['SET_FIELD'] = set_field
    log['MIN_COUNT'] = min_count
    log['MAX_DIVERSITY'] = max_diversity
    log['NPROC'] = nproc
    printLog(log)

    # Check input file type
    in_type = getFileType(seq_file)
    if in_type != 'fastq':
        printError('Input file must be FASTQ.')

    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {
        'seq_file': seq_file,
        'index_func': indexSeqSets,
        'index_args': index_args
    }
    # Define worker function and arguments
    work_func = processEEQueue
    work_args = {
        'cons_func': cons_func,
        'cons_args': cons_args,
        'min_count': min_count,
        'max_diversity': max_diversity
    }
    # Define collector function and arguments
    collect_func = collectEEQueue
    collect_args = {
        'seq_file': seq_file,
        'out_args': out_args,
        'set_field': set_field
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'EstimateError'
    printLog(result['log'])

    return result['out_files']
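
# A usage sketch for estimateSets (input path and thresholds are assumptions):
# estimate error rates within UMI read groups of at least 10 reads, using the
# quality-weighted consensus. The input must be FASTQ, as checked above.
def _example_estimate_sets():
    out_files = estimateSets('reads_barcoded.fastq',  # hypothetical input
                             cons_func=qualityConsensus,
                             set_field='BARCODE',     # assumed set annotation
                             min_count=10,
                             nproc=4)
    return out_files
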
def defineClones(db_file, feed_func, work_func, collect_func, clone_func, cluster_func=None,
                 group_func=None, group_args={}, clone_args={}, cluster_args={}, 
                 out_args=default_out_args, nproc=None, queue_size=None):
    """
    Define clonally related sequences
    
    Arguments:
    db_file = filename of input database
    feed_func = the function that feeds the queue
    work_func = the worker function that will run on each CPU
    collect_func = the function that collects results from the workers
    group_func = the function to use for assigning preclones
    clone_func = the function to use for determining clones within preclonal groups
    cluster_func = the optional function to use for clustering preclonal groups;
                   if None no clustering is performed
    group_args = a dictionary of arguments to pass to group_func
    clone_args = a dictionary of arguments to pass to clone_func
    cluster_args = a dictionary of arguments to pass to cluster_func
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc    
    
    Returns:
    a list of successful output file names
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['DB_FILE'] = os.path.basename(db_file)
    if group_func is not None:
        log['GROUP_FUNC'] = group_func.__name__
        log['GROUP_ARGS'] = group_args
    log['CLONE_FUNC'] = clone_func.__name__

    # TODO:  this is yucky, but can be fixed by using a model class
    clone_log = clone_args.copy()
    if 'dist_mat' in clone_log:  del clone_log['dist_mat']
    log['CLONE_ARGS'] = clone_log

    if cluster_func is not None:
        log['CLUSTER_FUNC'] = cluster_func.__name__
        log['CLUSTER_ARGS'] = cluster_args
    log['NPROC'] = nproc
    printLog(log)
    
    # Define feeder function and arguments
    feed_args = {'db_file': db_file,
                 'group_func': group_func, 
                 'group_args': group_args}
    # Define worker function and arguments
    work_args = {'clone_func': clone_func, 
                 'clone_args': clone_args}
    # Define collector function and arguments
    collect_args = {'db_file': db_file,
                    'out_args': out_args,
                    'cluster_func': cluster_func,
                    'cluster_args': cluster_args}
    
    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])
    
    return result['out_files']
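
# A usage sketch for the generic driver above (all arguments hypothetical).
# The feed/work/collect trio must accept the argument dictionaries the driver
# builds: feed_func receives db_file/group_func/group_args, work_func receives
# clone_func/clone_args, and collect_func receives db_file/out_args plus the
# cluster settings. The names below assume queue helpers compatible with those
# used by the other defineClones variant in this module.
def _example_define_clones_driver():
    out_files = defineClones('db.tab',                      # hypothetical database
                             feed_func=feedDbQueue,
                             work_func=processDbQueue,
                             collect_func=collectQueue,
                             clone_func=distanceClones,
                             group_func=groupByGene,
                             clone_args={'distance': 0.15}, # assumed threshold
                             nproc=4)
    return out_files
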
def defineClones(db_file,
                 seq_field=default_junction_field,
                 v_field=default_v_field,
                 j_field=default_j_field,
                 max_missing=default_max_missing,
                 group_fields=None,
                 group_func=groupByGene,
                 group_args={},
                 clone_func=distanceClones,
                 clone_args={},
                 format=default_format,
                 out_file=None,
                 out_args=default_out_args,
                 nproc=None,
                 queue_size=None):
    """
    Define clonally related sequences
    
    Arguments:
      db_file : filename of input database.
      seq_field : sequence field used to determine clones.
      v_field : field containing the V call.
      j_field : field containing the J call.
      max_missing : maximum number of non-ACGT characters to allow in the junction sequence.
      group_fields : additional annotation fields to use to group preclones;
                     if None use only V and J.
      group_func : the function to use for assigning preclones.
      group_args : a dictionary of arguments to pass to group_func.
      clone_func : the function to use for determining clones within preclonal groups.
      clone_args : a dictionary of arguments to pass to clone_func.
      format : input and output format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.
    
    Returns:
      dict: dictionary of output pass and fail files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'DefineClones'
    log['FILE'] = os.path.basename(db_file)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['J_FIELD'] = j_field
    log['MAX_MISSING'] = max_missing
    log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None
    for k in sorted(group_args):
        log[k.upper()] = group_args[k]
    for k in sorted(clone_args):
        if k != 'dist_mat': log[k.upper()] = clone_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Translate to Receptor attribute names
    seq_field = schema.toReceptor(seq_field)
    v_field = schema.toReceptor(v_field)
    j_field = schema.toReceptor(j_field)
    if group_fields is not None:
        group_fields = [schema.toReceptor(f) for f in group_fields]

    # Define feeder function and arguments
    group_args['group_fields'] = group_fields
    group_args['v_field'] = v_field
    group_args['j_field'] = j_field
    feed_args = {
        'db_file': db_file,
        'reader': reader,
        'group_func': group_func,
        'group_args': group_args
    }

    # Define worker function and arguments
    filter_args = {
        'seq_field': seq_field,
        'v_field': v_field,
        'j_field': j_field,
        'max_missing': max_missing
    }
    clone_args['seq_field'] = seq_field
    work_args = {
        'process_func': clone_func,
        'process_args': clone_args,
        'filter_func': filterMissing,
        'filter_args': filter_args
    }

    # Define collector function and arguments
    out_fields = getDbFields(db_file,
                             add=schema.fromReceptor('clone'),
                             reader=reader)
    out_args['out_type'] = schema.out_type
    collect_args = {
        'db_file': db_file,
        'fields': out_fields,
        'writer': writer,
        'out_file': out_file,
        'out_args': out_args
    }

    # Check for required columns
    try:
        required = ['junction']
        checkFields(required, out_fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Call process manager
    result = manageProcesses(feed_func=feedDbQueue,
                             work_func=processDbQueue,
                             collect_func=collectQueue,
                             feed_args=feed_args,
                             work_args=work_args,
                             collect_args=collect_args,
                             nproc=nproc,
                             queue_size=queue_size)

    # Print log
    result['log']['END'] = 'DefineClones'
    printLog(result['log'])
    output = {k: v for k, v in result.items() if k in ('pass', 'fail')}

    return output
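
# A usage sketch for this defineClones variant (file name, distance threshold,
# and format value are assumptions): group records by V/J gene, assign clones
# by junction distance, and read the pass file from the returned dictionary.
def _example_define_clones():
    results = defineClones('repertoire.tsv',               # hypothetical database
                           group_func=groupByGene,
                           clone_func=distanceClones,
                           clone_args={'distance': 0.16},  # assumed threshold
                           format='airr',                  # assumed format value
                           nproc=4)
    return results.get('pass')
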