def getRepo(repo):
    """
    Parses germline repositories

    Arguments:
      repo : String list of directories and/or files
             from which to read germline records

    Returns:
      dictionary : Dictionary of {allele: sequence} germlines
    """
    repo_files = []
    # Iterate over items passed to commandline
    for r in repo:
        # If directory, get fasta files from within
        if os.path.isdir(r):
            repo_files.extend([os.path.join(r, f) for f in os.listdir(r) \
                          if getFileType(f) == 'fasta'])
        # If file, make sure file is fasta
        if os.path.isfile(r) and getFileType(r) == 'fasta':
            repo_files.extend([r])

    # Catch instances where no valid fasta files were passed in
    if len(repo_files) < 1:
        sys.exit('ERROR: No valid germline fasta files were found in %s' % ','.join(repo))

    repo_dict = {}
    for file_name in repo_files:
        with open(file_name, "rU") as file_handle:
            germlines = SeqIO.parse(file_handle, "fasta")
            for g in germlines:
                germ_key = parseAllele(g.description, allele_regex, 'list')
                repo_dict[germ_key] = str(g.seq).upper() # @UndefinedVariable
    return repo_dict
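# Hypothetical usage sketch (the directory path is illustrative): build an
# {allele: sequence} dictionary from a folder of germline FASTA files, assuming
# getRepo's dependencies (os, sys, SeqIO, getFileType, parseAllele, allele_regex)
# are imported elsewhere in this module.
germline_dict = getRepo(['germlines/imgt/human/vdj'])
print('Loaded %i germline alleles' % len(germline_dict))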
Example #2
    def test_getFileType(self):
        result = getFileType(self.germ_file)
        print(result)
        self.assertEqual('fasta', result)

        result = getFileType(self.missing_file)
        print(result)
        self.assertEqual('fasta', result)
def filterSeq(seq_file, filter_func, filter_args={}, out_args=default_out_args, 
              nproc=None, queue_size=None):
    """
    Filters sequences by fraction of ambiguous nucleotides
    
    Arguments: 
    seq_file = the sequence file to filter
    filter_func = the function to use for filtering sequences
    filter_args = a dictionary of arguments to pass to filter_func
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                 
    Returns:
    a list of successful output file names
    """
    # Define output file label dictionary
    cmd_dict = {filterLength:'length', filterMissing:'missing', 
                filterRepeats:'repeats', filterQuality:'quality', 
                maskQuality:'maskqual', trimQuality:'trimqual'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'FilterSeq'
    log['COMMAND'] = cmd_dict.get(filter_func, filter_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(filter_args):  log[k.upper()] = filter_args[k]
    log['NPROC'] = nproc
    printLog(log)
    
    # Check input type
    in_type = getFileType(seq_file)
    if in_type != 'fastq' and filter_func in (filterQuality, maskQuality, trimQuality):
        sys.exit('ERROR:  Input file must be FASTQ for %s mode' % cmd_dict[filter_func])
    
    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processSeqQueue
    work_args = {'process_func': filter_func, 
                 'process_args': filter_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file,
                    'task_label': cmd_dict[filter_func],
                    'out_args': out_args}
    
    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    result['log']['END'] = 'FilterSeq'
    printLog(result['log'])
        
    return result['out_files']
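# Hypothetical usage sketch: apply the 'missing' filter to a FASTA file, discarding
# sequences with more than 10 ambiguous characters. The file name is illustrative and
# filterMissing (with its max_missing keyword) is assumed from the surrounding module.
passed_files = filterSeq('reads.fasta', filter_func=filterMissing,
                         filter_args={'max_missing': 10}, nproc=2)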
Example #4
def readRepo(repo):
    """
    Parses germline repositories

    Arguments:
      repo : String list of directories and/or files
             from which to read germline records

    Returns:
      dict : Dictionary of {allele: sequence} germlines
    """
    repo_files = []
    # Iterate over items passed to commandline
    for r in repo:
        # If directory, get fasta files from within
        if os.path.isdir(r):
            repo_files.extend([os.path.join(r, f) for f in os.listdir(r) \
                          if getFileType(f) == 'fasta'])
        # If file, make sure file is fasta
        if os.path.isfile(r) and getFileType(r) == 'fasta':
            repo_files.extend([r])

    # Catch instances where no valid fasta files were passed in
    if len(repo_files) < 1:
        sys.exit('\nERROR: No valid germline fasta files (.fasta, .fna, .fa) were found in %s' \
                 % ','.join(repo))

    repo_dict = {}
    for file_name in repo_files:
        with open(file_name, 'r') as file_handle:
            germlines = SeqIO.parse(file_handle, 'fasta')
            for g in germlines:
                germ_key = parseAllele(g.description, allele_regex, 'first')
                repo_dict[germ_key] = str(g.seq).upper()

    return repo_dict
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None, 
                inner=False, keep_missing=False, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments: 
    seq_file = filename of the sequence file to sample from
    max_missing = number of ambiguous characters to allow in a unique sequence
    uniq_fields = a list of annotations that define a sequence as unique if they differ
    copy_fields = a list of annotations to copy into unique sequence annotations
    copy_actions = the list of collapseAnnotation actions to take on copy_fields 
    max_field = a numeric field whose maximum value determines the retained sequence
    min_field = a numeric field whose minimum value determines the retained sequence
    inner = if True exclude consecutive outer ambiguous characters from iterations and matching
    keep_missing = if True retain sequences with more ambiguous characters than max_missing as unique
    out_args = common output argument dictionary from parseCommonArgs
              
    Returns: 
    the collapsed output file name
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)
    
    # TODO:  storing all sequences in memory is faster
    # Read input file
    in_type = getFileType(seq_file)
    #seq_dict = readSeqFile(seq_file, index=True)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Added list typing for compatibility issue with Python 2.7.5 on OS X
    # TypeError: object of type 'dictionary-keyiterator' has no len()
    search_keys = list(seq_dict.keys())
    dup_keys = []
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, 
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner, 
                                                         out_args['delimiter'])
        # Update list of duplicates
        dup_keys.extend(dup_list)

        # Update log
        log = OrderedDict()
        log['ITERATION'] = n + 1
        log['MISSING'] = n 
        log['UNIQUE'] = len(uniq_dict) 
        log['DUPLICATE'] = len(dup_keys) 
        log['UNDETERMINED'] = len(search_keys)
        printLog(log, handle=log_handle)
                
        # Break if no keys to search remain
        if len(search_keys) == 0:  break
    
    # Write unique sequences
    with getOutputHandle(seq_file, 'collapse-unique', out_dir=out_args['out_dir'], 
                         out_name=out_args['out_name'], out_type=out_args['out_type']) \
            as uniq_handle:
        for val in uniq_dict.values():
            # Define output sequence
            out_seq = val[0]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_app = OrderedDict()
            if copy_fields  is not None and copy_actions is not None:
                for f, a, s in zip(copy_fields, copy_actions, val[3:]):
                    out_app[f] = s
                    out_app = collapseAnnotation(out_app, a, f, delimiter=out_args['delimiter'])
                    out_ann.pop(f, None)
            out_app['DUPCOUNT'] = val[1]
            out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            # Write unique sequence
            SeqIO.write(out_seq, uniq_handle, out_args['out_type'])
    
        # Write sequences with more missing characters than max_missing as unique records
        if keep_missing:
            for k in search_keys:
                out_seq = seq_dict[k]
                out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
                out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter'])
                out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
                out_seq.description = ''
                SeqIO.write(out_seq, uniq_handle, out_args['out_type'])

    # Write sequences with high missing character counts to the undetermined output
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    if out_args['failed']:
        # Write duplicate sequences 
        with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], 
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(uniq_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)
        
    # Close file handles
    if log_handle is not None:  log_handle.close()
    
    return uniq_handle.name
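# Hypothetical usage sketch: collapse duplicate reads while tolerating up to 20
# ambiguous characters, and keep the over-threshold sequences as unique records.
# The file name is illustrative.
unique_file = collapseSeq('reads.fastq', max_missing=20, keep_missing=True)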
Example #6
def parseCommonArgs(args, in_arg=None, in_types=None):
    """
    Checks common arguments from getCommonArgParser and transforms output options to a dictionary

    Arguments: 
      args : Argument Namespace defined by ArgumentParser.parse_args
      in_arg : String defining a non-standard input file argument to verify;
               by default ['db_files', 'seq_files', 'seq_files_1', 'seq_files_2', 'primer_file']
               are supported in that order
      in_types : List of types (file extensions as strings) to allow for files in in_arg;
                 if None do not check type
                    
    Returns:
      dict : Dictionary copy of args with output arguments embedded in the dictionary out_args
    """ 
    db_types = ['tab']
    seq_types = ['fasta', 'fastq']
    primer_types = ['fasta']
    if in_types is not None:  in_types = [f.lower() for f in in_types]
    args_dict = args.__dict__.copy()
    
    # Count input files
    if 'seq_files' in args_dict:
        input_count = len(args_dict['seq_files']  or [])
        input_files = args_dict['seq_files']
    elif all([k in args_dict for k in ('seq_files_1', 'seq_files_2')]):
        input_count = len(args_dict['seq_files_1']  or [])
        input_files = args_dict['seq_files_1'] + args_dict['seq_files_2']
    elif 'db_files' in args_dict:
        input_count = len(args_dict['db_files'] or [])
        input_files = args_dict['db_files']
    elif 'primer_file' in args_dict:
        input_count = 1
        input_files = args_dict['primer_file']
    elif in_arg is not None and in_arg in args_dict: 
        input_count = len(args_dict[in_arg] or [])
        input_files = args_dict[in_arg]
    else:
        printError('Cannot determine input file argument.')

    # Exit if output names or log files are specified with multiple input files    
    if args_dict.get('out_name', None) is not None and input_count > 1:
        printError('The --outname argument may not be specified with multiple input files.')
    if args_dict.get('log_file', None) is not None and input_count > 1:
        printError('The --log argument may not be specified with multiple input files.')

    # Verify single-end sequence files
    if 'seq_files' in args_dict and args_dict['seq_files']:
        for f in args_dict['seq_files']:
            if not os.path.isfile(f):
                printError('Sequence file %s does not exist.' % f)
            if getFileType(f) not in seq_types:
                printError('Sequence file %s is not a supported type. Must be one of: %s.' \
                           % (f, ', '.join(seq_types)))
    
    # Verify paired-end sequence files
    if all([k in args_dict and args_dict[k] for k in ('seq_files_1', 'seq_files_2')]):
        if len(args_dict['seq_files_1']) != len(args_dict['seq_files_2']):
            printError('The -1 and -2 arguments must contain the same number of files.')
        for f1, f2 in zip(args_dict['seq_files_1'], args_dict['seq_files_2']):
            if getFileType(f1) != getFileType(f2):
                printError('Each pair of files in the -1 and -2 arguments must be the same file type.')
        for f in (args_dict['seq_files_1'] + args_dict['seq_files_2']):
            if not os.path.isfile(f):
                printError('Sequence file %s does not exist.' % f)
            if getFileType(f) not in seq_types:
                printError('Sequence file %s is not a supported type. Must be one of: %s.' \
                           % (f, ', '.join(seq_types)))

    # Verify database files
    if 'db_files' in args_dict and args_dict['db_files']:
        for f in args_dict['db_files']:
            if not os.path.isfile(f):
                printError('Database file %s does not exist.' % f)
            if getFileType(f) not in db_types:
                printError('Database file %s is not a supported type. Must be one of: %s.' \
                           % (f, ', '.join(db_types)))

    # Verify primer file
    if 'primer_file' in args_dict and args_dict['primer_file']:
        primer_file = args_dict['primer_file']
        if not os.path.isfile(primer_file):
            printError('Primer file %s does not exist.' % primer_file)
        if getFileType(primer_file) not in primer_types:
            printError('Primer file %s is not a supported type. Must be one of: %s.' \
                       % (primer_file, ', '.join(primer_types)))

    # Verify non-standard input files
    if in_arg is not None and in_arg in args_dict and args_dict[in_arg]:
        files = args_dict[in_arg] if isinstance(args_dict[in_arg], list) \
                else [args_dict[in_arg]]
        for f in files:
            if not os.path.exists(f):
                printError('Input %s does not exist.' % f)
            if in_types is not None and getFileType(f) not in in_types:
                printError('Input %s is not a supported type. Must be one of: %s.' \
                           % (f, ', '.join(in_types)))
    
    # Verify output file arguments and exit if anything is hinky
    if args_dict.get('out_files', None) is not None \
            or args_dict.get('out_file', None) is not None:
        if args_dict.get('out_dir', None) is not None:
            printError('The -o argument may not be specified with the --outdir argument.')
        if args_dict.get('out_name', None) is not None:
            printError('The -o argument may not be specified with the --outname argument.')
        if args_dict.get('failed', False):
            printError('The -o argument may not be specified with the --failed argument.')
    if args_dict.get('out_files', None) is not None:
        if len(args_dict['out_files']) != input_count:
            printError('The -o argument requires one output file name per input file.')
        for f in args_dict['out_files']:
            if f in input_files:
                printError('Output files and input files cannot have the same names.')
        for f in args_dict['out_files']:
            if os.path.isfile(f):
                printWarning('Output file %s already exists and will be overwritten.' % f)
    if args_dict.get('out_file', None) is not None:
        if args_dict['out_file'] in input_files:
            printError('Output files and input files cannot have the same names.')
        if os.path.isfile(args_dict['out_file']):
            printWarning('Output file %s already exists and will be overwritten.' % args_dict['out_file'])

    # Verify output directory
    if 'out_dir' in args_dict and args_dict['out_dir']:
        if os.path.exists(args_dict['out_dir']) and not os.path.isdir(args_dict['out_dir']):
            printError('Directory %s exists but it is not a directory.' % args_dict['out_dir'])

    # Redefine common output options as out_args dictionary
    out_args = ['log_file', 'delimiter', 'separator', 
                'out_dir', 'out_name', 'out_type', 'failed']
    args_dict['out_args'] = {k:args_dict.setdefault(k, None) for k in out_args}
    for k in out_args: del args_dict[k]
    
    return args_dict
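# Hypothetical usage sketch: parseCommonArgs normally receives the Namespace produced
# by an ArgumentParser built with the module's common parser helper; a minimal
# Namespace is enough to illustrate how the output options are folded into the
# out_args dictionary. The sequence file name is illustrative and must exist,
# otherwise the verification step exits.
from argparse import Namespace

args = Namespace(seq_files=['reads.fastq'], out_files=None, out_file=None,
                 out_dir=None, out_name=None, out_type=None, failed=False,
                 log_file=None, delimiter=None, separator=None)
args_dict = parseCommonArgs(args)
print(args_dict['out_args'])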
def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args, cluster_func, cluster_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func
    
    Returns: 
    None
    (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
               
        # Iterate over Ig records to count and order by junction length
        result_count = 0
        records = {}
        # print 'Reading file...'
        db_iter = readDbFile(db_file)
        for rec in db_iter:
            records[rec.id] = rec
            result_count += 1
        records = OrderedDict(sorted(list(records.items()), key=lambda i: i[1].junction_length))
                
        # Define empty matrix to store assembled results
        dist_mat = np.zeros((result_count,result_count))
        
        # Count records and define output format 
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
                   
        # Defined successful output handle
        pass_handle = getOutputHandle(db_file, 
                                      out_label='clone-pass', 
                                      out_dir=out_args['out_dir'], 
                                      out_name=out_args['out_name'], 
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')
        
        # Defined failed cloning output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail', 
                                          out_dir=out_args['out_dir'], 
                                          out_name=out_args['out_name'], 
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Open log file
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise
    
    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        row_count = rec_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break

            # Print progress for previous iteration
            if row_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(row_count, result_count, 0.05, start_time)
            
            # Update counts for iteration
            row_count += 1
            rec_count += len(result)
            
            # Add result row to distance matrix
            if result:
                dist_mat[list(range(result_count-len(result),result_count)),result_count-len(result)] = result.results
                
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None    
        
        # Calculate linkage and carry out clustering
        # print dist_mat
        clusters = cluster_func(dist_mat, **cluster_args) if dist_mat is not None else None
        clones = {}
        # print clusters
        for i, c in enumerate(clusters):
            clones.setdefault(c, []).append(records[list(records.keys())[i]])
        
        # Write passed and failed records
        clone_count = pass_count = fail_count = 0
        if clones:
            for clone in clones.values():
                clone_count += 1
                for i, rec in enumerate(clone):
                    rec.annotations['CLONE'] = clone_count
                    pass_writer.writerow(rec.toDict())
                    pass_count += 1
                    #result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)

        else:
            for i, rec in enumerate(result.data):
                fail_writer.writerow(rec.toDict())
                fail_count += 1
                #result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)
        
        # Print final progress
        printProgress(row_count, result_count, 0.05, start_time)
    
        # Close file handles
        pass_handle.close()
        if fail_handle is not None:  fail_handle.close()
        if log_handle is not None:  log_handle.close()
                
        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        alive.value = False
        raise
    
    return None
Example #8
def selectSeqFile(seq_file,
                  field,
                  value_list=None,
                  value_file=None,
                  negate=False,
                  out_file=None,
                  out_args=default_out_args):
    """
    Select from a sequence file

    Arguments:
      seq_file : filename of the sequence file to sample from.
      field : the annotation field to check for required values.
      value_list : a list of annotation values that a sample must contain one of.
      value_file : a tab delimited file containing values to select.
      negate : if True select entries that do not contain the specified values.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: output file name.
    """

    # Reads value_file
    def _read_file(value_file, field):
        field_list = []
        try:
            with open(value_file, 'rt') as handle:
                reader_dict = csv.DictReader(handle, dialect='excel-tab')
                for row in reader_dict:
                    field_list.append(row[field])
        except IOError:
            printError('File %s cannot be read.' % value_file)
        except:
            printError('File %s is invalid.' % value_file)

        return field_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    if value_list is not None:
        log['VALUE_LIST'] = ','.join([str(x) for x in value_list])
    if value_file is not None:
        log['VALUE_FILE'] = os.path.basename(value_file)
    log['NOT'] = negate
    printLog(log)

    # Read value_file
    if value_list is not None and value_file is not None:
        printError('Specify only one of value_list and value_file.')
    elif value_file is not None:
        value_list = _read_file(value_file, field)

    # Read sequence file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Open output handle
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'selected',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])

    # Generate subset of records
    start_time = time()
    pass_count, fail_count, rec_count = 0, 0, 0
    value_set = set(value_list)
    for rec in seq_iter:
        printCount(rec_count, 1e5, start_time=start_time)
        rec_count += 1

        # Parse annotations into a list of values
        ann = parseAnnotation(rec.description,
                              delimiter=out_args['delimiter'])[field]
        ann = ann.split(out_args['delimiter'][2])

        # Write records passing the selection
        if xor(negate, not value_set.isdisjoint(ann)):
            SeqIO.write(rec, out_handle, out_args['out_type'])
            pass_count += 1
        else:
            fail_count += 1

    printCount(rec_count, 1e5, start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'SplitSeq'
    printLog(log)

    return out_handle.name
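# Hypothetical usage sketch: keep only records whose SAMPLE annotation matches one of
# the given values. The field name, values, and file name are illustrative.
selected_file = selectSeqFile('reads.fastq', field='SAMPLE', value_list=['S1', 'S2'])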
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None,
            coord_type=default_coord_type,
            out_args=default_out_args):
    """
    Pairs and synchronizes matching sequences from two files

    Arguments: 
    seq_file_1 = the file containing the grouped sequences and annotations
    seq_file_2 = the file to assign annotations to from seq_file_1
    fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records;
               if None do not copy any annotations
    fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records;
               if None do not copy any annotations
    coord_type = the sequence header format
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2)
    """
    # Define private functions
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else: 
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else: 
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], 
                                    out_name=out_name_1, out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], 
                                    out_name=out_name_2, out_type=out_type_2)

    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_1, out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_2, out_type=out_type_2)
        pass_keys = list()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id, coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
                                                delimiter=out_args['delimiter'])
                    seq_2.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False,
                                                delimiter=out_args['delimiter'])
                    seq_1.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and update the paired key list for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None:  pass_keys.append(coord_2)
            else:  SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        pass_keys = set(pass_keys)
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:  SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)
   
    # Close file handles
    pass_handle_1.close()
    pass_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
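# Hypothetical usage sketch: pair mates by read coordinate and copy the BARCODE
# annotation from file 1 onto the matched file 2 records. The coordinate format,
# field name, and file names are illustrative.
paired_files = pairSeq('reads_R1.fastq', 'reads_R2.fastq',
                       fields_1=['BARCODE'], coord_type='illumina')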
Example #10
def sampleSeqFile(seq_file,
                  max_count,
                  field=None,
                  values=None,
                  out_args=default_out_args):
    """
    Samples from a sequence file

    Arguments: 
      seq_file : filename of the sequence file to sample from
      max_count : a list of the maximum number of sequences to sample
      field : the annotation field to check for required values
      values : a list of annotation values that a sample must contain one of
      out_args : common output argument dictionary from parseCommonArgs
              
    Returns: 
      str: output file name
    """

    # Function to sample from a list of sequence indices
    def _sample_list(n, index_list):
        max_n = len(index_list)
        r = random.sample(range(max_n), n) if n < max_n else range(max_n)
        return [index_list[x] for x in r]

    # Function to sample from a dictionary of grouped sequence indices
    def _sample_dict(n, index_dict):
        sample_list = []
        for v in index_dict.values():
            max_n = len(v)
            r = random.sample(range(max_n), n) if n < max_n else range(max_n)
            sample_list.extend([v[x] for x in r])
        return sample_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'sample'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_COUNTS'] = ','.join([str(x) for x in max_count])
    log['FIELD'] = field
    log['VALUES'] = ','.join(values) if values else None
    printLog(log)

    # Read input files and open output files
    start_time = time()
    printMessage('Reading files', start_time=start_time, width=25)

    in_type = getFileType(seq_file)
    seq_dict = readSeqFile(seq_file, index=True)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Generate subset of records
    if field is not None and values is not None:
        _sample = _sample_list
        printMessage('Subsetting by annotation',
                     start_time=start_time,
                     width=25)
        seq_index = subsetSeqIndex(seq_dict,
                                   field,
                                   values,
                                   delimiter=out_args['delimiter'])
    elif field is not None and values is None:
        _sample = _sample_dict
        printMessage('Indexing by annotation', start_time=start_time, width=25)
        seq_index = indexSeqSets(seq_dict,
                                 field,
                                 delimiter=out_args['delimiter'])
    else:
        _sample = _sample_list
        seq_index = [k for k in seq_dict]

    printMessage('Done', start_time=start_time, end=True, width=25)

    # Generate sample set for each value in max_count
    out_files = []
    for i, n in enumerate(max_count):
        start_time = time()
        printMessage('Sampling n=%i' % n, start_time=start_time, width=25)
        # Sample from records
        sample_keys = _sample(n, seq_index)
        sample_count = len(sample_keys)

        # Write sampled sequences to files
        with getOutputHandle(seq_file,
                             'sample%i-n%i' % (i + 1, sample_count),
                             out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'],
                             out_type=out_args['out_type']) as out_handle:
            for k in sample_keys:
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            out_files.append(out_handle.name)

        printMessage('Done', start_time=start_time, end=True, width=25)

        # Print log for iteration
        log = OrderedDict()
        log['MAX_COUNT'] = n
        log['SAMPLED'] = sample_count
        log['OUTPUT'] = os.path.basename(out_files[i])
        printLog(log)

    # Print log
    log = OrderedDict()
    log['END'] = 'SplitSeq'
    printLog(log)

    return out_files
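# Hypothetical usage sketch: draw random subsets of 1,000 and 10,000 records from an
# illustrative input file, producing one output file per requested sample size.
sample_files = sampleSeqFile('reads.fastq', max_count=[1000, 10000])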
Example #11
def samplePairSeqFile(seq_file_1,
                      seq_file_2,
                      max_count,
                      field=None,
                      values=None,
                      coord_type=default_coord,
                      out_args=default_out_args):
    """
    Samples from paired-end sequence files

    Arguments: 
      seq_file_1 : filename of the first paired-end sequence file
      seq_file_2 : filename of the second paired-end sequence file
      max_count : a list of the maximum number of sequences to sample
      field : the annotation field to check for required values
      values : a list of annotation values that a sample must contain one of
      coord_type : the sequence header format
      out_args : common output argument dictionary from parseCommonArgs
              
    Returns: 
      list: seq_file_1 and seq_file_2 output file names
    """

    # Sequence index key function
    def _key_func(x):
        return getCoordKey(x,
                           coord_type=coord_type,
                           delimiter=out_args['delimiter'])

    # Function to sample from two lists of sequence indices
    def _sample_list(n, index_1, index_2):
        key_set = set(index_1).intersection(index_2)
        max_n = len(key_set)
        return random.sample(key_set, min(n, max_n))

    # Function to sample from two dictionaries of grouped sequence indices
    def _sample_dict(n, index_1, index_2):
        group_set = set(index_1.keys()).intersection(index_2.keys())
        sample_list = []
        for k in group_set:
            key_set = set(index_1[k]).intersection(index_2[k])
            max_n = len(key_set)
            sample_list.extend(random.sample(key_set, min(n, max_n)))
        return sample_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'samplepair'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['MAX_COUNTS'] = ','.join([str(x) for x in max_count])
    log['FIELD'] = field
    log['VALUES'] = ','.join(values) if values else None
    printLog(log)

    # Define output type
    in_type_1 = getFileType(seq_file_1)
    in_type_2 = getFileType(seq_file_2)
    if out_args['out_type'] is None:
        out_type_1 = in_type_1
        out_type_2 = in_type_2
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Index input files
    start_time = time()
    printMessage('Reading files', start_time=start_time, width=25)

    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    seq_dict_2 = readSeqFile(seq_file_2, index=True, key_func=_key_func)

    # Subset keys to those meeting field/value criteria
    if field is not None and values is not None:
        _sample = _sample_list
        printMessage('Subsetting by annotation',
                     start_time=start_time,
                     width=25)
        seq_index_1 = subsetSeqIndex(seq_dict_1,
                                     field,
                                     values,
                                     delimiter=out_args['delimiter'])
        seq_index_2 = subsetSeqIndex(seq_dict_2,
                                     field,
                                     values,
                                     delimiter=out_args['delimiter'])
    elif field is not None and values is None:
        _sample = _sample_dict
        printMessage('Indexing by annotation', start_time=start_time, width=25)
        seq_index_1 = indexSeqSets(seq_dict_1,
                                   field,
                                   delimiter=out_args['delimiter'])
        seq_index_2 = indexSeqSets(seq_dict_2,
                                   field,
                                   delimiter=out_args['delimiter'])
    else:
        _sample = _sample_list
        seq_index_1 = list(seq_dict_1.keys())
        seq_index_2 = list(seq_dict_2.keys())

    printMessage('Done', start_time=start_time, end=True, width=25)

    # Generate sample set for each value in max_count
    out_files = []
    for i, n in enumerate(max_count):
        start_time = time()
        printMessage('Sampling n=%i' % n, start_time=start_time, width=25)
        # Sample
        sample_keys = _sample(n, seq_index_1, seq_index_2)
        sample_count = len(sample_keys)

        # Open file handles
        out_handle_1 = getOutputHandle(seq_file_1,
                                       'sample%i-n%i' % (i + 1, sample_count),
                                       out_dir=out_args['out_dir'],
                                       out_name=out_name_1,
                                       out_type=out_type_1)
        out_handle_2 = getOutputHandle(seq_file_2,
                                       'sample%i-n%i' % (i + 1, sample_count),
                                       out_dir=out_args['out_dir'],
                                       out_name=out_name_2,
                                       out_type=out_type_2)
        out_files.append((out_handle_1.name, out_handle_2.name))

        for k in sample_keys:
            SeqIO.write(seq_dict_1[k], out_handle_1, out_type_1)
            SeqIO.write(seq_dict_2[k], out_handle_2, out_type_2)

        printMessage('Done', start_time=start_time, end=True, width=25)

        # Print log for iteration
        log = OrderedDict()
        log['MAX_COUNT'] = n
        log['SAMPLED'] = sample_count
        log['OUTPUT1'] = os.path.basename(out_files[i][0])
        log['OUTPUT2'] = os.path.basename(out_files[i][1])
        printLog(log)

        # Close file handles
        out_handle_1.close()
        out_handle_2.close()

    # Print log
    log = OrderedDict()
    log['END'] = 'SplitSeq'
    printLog(log)

    return out_files
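# Hypothetical usage sketch: draw a matched subset of 1,000 read pairs from both mate
# files, keeping the two outputs synchronized by coordinate. File names are illustrative.
pair_files = samplePairSeqFile('reads_R1.fastq', 'reads_R2.fastq', max_count=[1000])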
Example #12
def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments: 
      seq_file : the sequence file name.
      modify_func : the function defining the modification operation.
      modify_args : a dictionary of arguments to pass to modify_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      str: output file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):  
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)
    
    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'reheader',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
    # Count records
    result_count = countSeqFile(seq_file)

    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        
        # Update counts
        seq_count += 1
        
        # Modify header
        header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)
        
        # Write new sequence
        seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])
        
    # print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'               
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
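# Hypothetical usage sketch: rename the SAMPLE annotation to DONOR in every header.
# renameHeader and its 'fields'/'names' arguments are assumed from the surrounding
# module, mirroring the 'rename' entry in cmd_dict above; the file name is illustrative.
reheader_file = modifyHeaders('reads.fastq', modify_func=renameHeader,
                              modify_args={'fields': ['SAMPLE'], 'names': ['DONOR']})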
Example #13
def collectPairQueue(alive,
                     result_queue,
                     collect_queue,
                     seq_file_1,
                     seq_file_2,
                     label,
                     out_file=None,
                     out_args=default_out_args):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns.
      result_queue : a multiprocessing.Queue holding worker results.
      collect_queue : a multiprocessing.Queue holding collector return values.
      seq_file_1 : the first sequence file name.
      seq_file_2 : the second sequence file name.
      label : task label used to tag the output files.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      None: adds a dictionary of {log: log object, out_files: output file names} to collect_queue.
    """
    # Define output format
    out_type = getFileType(seq_file_1) if out_args['out_type'] is None \
        else out_args['out_type']

    # Define output names
    if out_args['out_name'] is None:
        out_name_1, out_name_2 = None, None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Wrapper for opening handles and writers
    def _open(x, in_file, out_name, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(in_file,
                                     out_label='%s-%s' % (label, x),
                                     out_dir=out_args['out_dir'],
                                     out_name=out_name,
                                     out_type=out_type)
        return handle

    try:
        # Count input size
        result_count = countSeqFile(seq_file_1)

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Initialize file handles
        pass_handle, fail_handle_1, fail_handle_2 = None, None, None

        # Iterator over results queue until sentinel object reached
        start_time = time()
        iter_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break

            # Print progress for previous iteration
            printProgress(iter_count,
                          result_count,
                          0.05,
                          start_time=start_time)

            # Update counts for iteration
            iter_count += 1

            # Write log
            printLog(result.log, handle=log_handle)

            # Write assembled sequences
            if result:
                pass_count += 1
                try:
                    SeqIO.write(result.results, pass_handle, out_type)
                except AttributeError:
                    # Open pass file
                    pass_handle = _open('pass', seq_file_1,
                                        out_args['out_name'])
                    SeqIO.write(result.results, pass_handle, out_type)
            else:
                fail_count += 1
                if out_args['failed']:
                    try:
                        SeqIO.write(result.data[0], fail_handle_1, out_type)
                        SeqIO.write(result.data[1], fail_handle_2, out_type)
                    except AttributeError:
                        # Open fail file
                        fail_handle_1 = _open('fail', seq_file_1, out_name_1)
                        fail_handle_2 = _open('fail', seq_file_2, out_name_2)
                        SeqIO.write(result.data[0], fail_handle_1, out_type)
                        SeqIO.write(result.data[1], fail_handle_2, out_type)

        else:
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(iter_count, result_count, 0.05, start_time=start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(
            pass_handle.name) if pass_handle is not None else None
        log['PAIRS'] = iter_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count

        # Close file handles and generate return data
        # collect_dict = {'log': log, 'pass': None, 'fail': None}
        collect_dict = {'log': log, 'out_files': []}
        if pass_handle is not None:
            # collect_dict['pass'] = pass_handle.name
            collect_dict['out_files'].append(pass_handle.name)
            pass_handle.close()
        if fail_handle_1 is not None:
            # collect_dict['fail'] = fail_handle.name
            collect_dict['out_files'].append(fail_handle_1.name)
            fail_handle_1.close()
        if fail_handle_2 is not None:
            # collect_dict['fail'] = fail_handle.name
            collect_dict['out_files'].append(fail_handle_2.name)
            fail_handle_2.close()
        if log_handle is not None:
            log_handle.close()
        collect_queue.put(collect_dict)
    except:
        alive.value = False
        raise

    return None
def collectPairQueue(alive, result_queue, collect_queue, result_count,
                     seq_file_1, seq_file_2, out_args):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing 
            continues; when False function returns
    result_queue = a multiprocessing.Queue holding worker results
    collect_queue = a multiprocessing.Queue holding collector return values
    result_count = the number of expected assembled sequences
    seq_file_1 = the first sequence file name
    seq_file_2 = the second sequence file name
    out_args = common output argument dictionary from parseCommonArgs
    
    Returns: 
    None
    (adds a dictionary of {log: log object, out_files: output file names} to collect_queue)
    """
    try:
        # Count records and define output format 
        out_type = getFileType(seq_file_1) if out_args['out_type'] is None \
                   else out_args['out_type']
        
        # Defined valid assembly output handle
        pass_handle = getOutputHandle(seq_file_1, 
                                      'assemble-pass', 
                                      out_dir=out_args['out_dir'], 
                                      out_name=out_args['out_name'], 
                                      out_type=out_type)
        # Defined failed assembly output handles
        if out_args['failed']:
            # Define output name
            if out_args['out_name'] is None:
                out_name_1 = out_name_2 = None
            else:
                out_name_1 = '%s-1' % out_args['out_name']
                out_name_2 = '%s-2' % out_args['out_name']
            fail_handle_1 = getOutputHandle(seq_file_1,
                                            'assemble-fail',
                                            out_dir=out_args['out_dir'],
                                            out_name=out_name_1,
                                            out_type=out_type)
            fail_handle_2 = getOutputHandle(seq_file_2,
                                            'assemble-fail',
                                            out_dir=out_args['out_dir'],
                                            out_name=out_name_2,
                                            out_type=out_type)
        else:
            fail_handle_1 = None
            fail_handle_2 = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise
    
    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        iter_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break

            # Print progress for previous iteration
            printProgress(iter_count, result_count, 0.05, start_time)
    
            # Update counts for iteration
            iter_count += 1
    
            # Write log
            printLog(result.log, handle=log_handle)

            # Write assembled sequences
            if result:
                pass_count += 1
                SeqIO.write(result.results, pass_handle, out_type)
            else:
                fail_count += 1
                if fail_handle_1 is not None and fail_handle_2 is not None:
                    SeqIO.write(result.data[0], fail_handle_1, out_type)
                    SeqIO.write(result.data[1], fail_handle_2, out_type)
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
        
        # Print total counts
        printProgress(iter_count, result_count, 0.05, start_time)
    
        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['PAIRS'] = iter_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
        
        # Close file handles
        pass_handle.close()
        if fail_handle_1 is not None:  fail_handle_1.close()
        if fail_handle_2 is not None:  fail_handle_2.close()
        if log_handle is not None:  log_handle.close()
    except:
        alive.value = False
        raise
    
    return None
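
# The collector above follows the standard queue protocol used throughout this
# module: it polls result_queue while the shared `alive` flag stays True, stops
# at a None sentinel, and reports through collect_queue. The wiring sketch below
# is an illustration only (not from the source); `pair_collector` stands in for
# the collector defined above, and the record count and file names are
# hypothetical placeholders.
def exampleRunPairCollector(pair_collector):
    import multiprocessing as mp
    alive = mp.Value('b', True)
    result_queue, collect_queue = mp.Queue(), mp.Queue()
    collector = mp.Process(target=pair_collector,
                           args=(alive, result_queue, collect_queue,
                                 1000, 'reads-1.fastq', 'reads-2.fastq',
                                 default_out_args))
    collector.start()
    # ... feeder/worker processes would put result objects on result_queue ...
    result_queue.put(None)      # sentinel telling the collector to finish
    collector.join()
    return collect_queue.get()  # {'log': ..., 'out_files': [...]}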
def estimateError(seq_file, cons_func=frequencyConsensus, cons_args={}, 
                  set_field=default_barcode_field, min_count=default_min_count, max_diversity=None, 
                  out_args=default_out_args, nproc=None, queue_size=None):
    """
    Calculates error rates of sequence sets

    Arguments: 
    seq_file = the sample sequence file name
    cons_func = the function to use for consensus generation 
    cons_args = a dictionary of arguments for the consensus function
    set_field = the annotation field containing set IDs
    min_count = threshold number of sequences to consider a set
    max_diversity = a threshold defining the average pairwise error rate required to retain a read group;
                    if None do not calculate diversity
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                    
    Returns: 
    a list of tuples of (position error, quality error, nucleotide pairwise error) output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {frequencyConsensus:'freq', qualityConsensus:'qual'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'EstimateError'
    log['FILE'] = os.path.basename(seq_file)
    log['MODE'] = cmd_dict.get(cons_func, cons_func.__name__)
    log['SET_FIELD'] = set_field
    log['MIN_COUNT'] = min_count
    log['MAX_DIVERSITY'] = max_diversity
    log['NPROC'] = nproc
    printLog(log)
    
    # Check input file type
    in_type = getFileType(seq_file)
    if in_type != 'fastq':  sys.exit('ERROR:  Input file must be FASTQ')
    
    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file,
                 'index_func': indexSeqSets, 
                 'index_args': index_args}
    # Define worker function and arguments
    work_func = processEEQueue
    work_args = {'cons_func': cons_func, 
                 'cons_args': cons_args,
                 'min_count': min_count,
                 'max_diversity': max_diversity}
    # Define collector function and arguments
    collect_func = collectEEQueue
    collect_args = {'seq_file': seq_file,
                    'out_args': out_args,
                    'set_field': set_field}

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    result['log']['END'] = 'EstimateError'
    printLog(result['log'])
        
    return result['out_files']
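
# Usage sketch (not from the source): a minimal call to estimateError on a
# hypothetical FASTQ file whose headers carry a BARCODE annotation. The file
# name, field name, and thresholds are placeholder assumptions.
def exampleEstimateError():
    out_files = estimateError('sample_barcoded.fastq',
                              cons_func=qualityConsensus,
                              set_field='BARCODE',
                              min_count=10,
                              nproc=2)
    return out_files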
Example #16
def collectQueue(alive,
                 result_queue,
                 collect_queue,
                 db_file,
                 out_args,
                 cluster_func=None,
                 cluster_args={}):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func
    
    Returns: 
    None
    (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
        # Count records and define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
        result_count = countDbFile(db_file)

        # Define successful output handle
        pass_handle = getOutputHandle(db_file,
                                      out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Define failed cloning output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        #sys.stderr.write('Exception in collector file opening step\n')
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        #print 'START COLLECT', alive.value
        # Iterate over the results queue until the sentinel object is reached
        start_time = time()
        rec_count = clone_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty(): continue
            else: result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break
            #print "COLLECT", alive.value, result['id']

            # Print progress for previous iteration and update record count
            if rec_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += len(result.data)

            # Write passed and failed records
            if result:
                # Write passing sequences
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone, start=1):
                        rec.annotations['CLONE'] = clone_count
                        pass_writer.writerow(rec.toDict())
                        pass_count += 1
                        result.log['CLONE%i-%i' % (clone_count, i)] = str(
                            rec.junction)
                # Write failed sequences from passing sets
                if result.failed:
                    for i, rec in enumerate(result.failed, start=1):
                        fail_count += 1
                        if fail_writer is not None:
                            fail_writer.writerow(rec.toDict())
                        result.log['FAIL%i-%i' % (clone_count, i)] = str(
                            rec.junction)
            else:
                # Write failed sets
                for i, rec in enumerate(result.data, start=1):
                    fail_count += 1
                    if fail_writer is not None:
                        fail_writer.writerow(rec.toDict())
                    result.log['CLONE0-%i' % (i)] = str(rec.junction)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None: fail_handle.close()
        if log_handle is not None: log_handle.close()

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        #sys.stderr.write('Exception in collector result processing step\n')
        alive.value = False
        raise

    return None
Example #17
def clusterBarcodes(seq_file,
                    ident=default_cluster_ident,
                    length_ratio=default_length_ratio,
                    barcode_field=default_barcode_field,
                    cluster_field=default_cluster_field,
                    cluster_prefix=default_cluster_prefix,
                    cluster_tool=default_cluster_tool,
                    cluster_exec=default_cluster_exec,
                    out_file=None,
                    out_args=default_out_args,
                    nproc=None):
    """
    Performs clustering on sets of sequences

    Arguments:
      seq_file : the sample sequence file name.
      ident : the identity threshold for clustering sequences.
      length_ratio : minimum short/long length ratio allowed within a cluster.
      barcode_field : the annotation field containing barcode sequences.
      cluster_field : the name of the output cluster field.
      cluster_prefix : string defining a prefix for the cluster identifier.
      cluster_tool : the clustering tool to use; one of cd-hit or usearch.
      cluster_exec : the path to the executable for the clustering tool.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : output arguments.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.

    Returns:
      str: the clustered output file name
    """

    # Function to modify SeqRecord header with cluster identifier
    def _header(seq,
                cluster,
                field=cluster_field,
                prefix=cluster_prefix,
                delimiter=out_args['delimiter']):
        label = '%s%i' % (prefix, cluster)
        header = parseAnnotation(seq.description, delimiter=delimiter)
        header = mergeAnnotation(header, {field: label}, delimiter=delimiter)
        seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
        seq.description = ''
        return seq

    # Function to make a SeqRecord object from a barcode annotation
    def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
        header = parseAnnotation(seq.description, delimiter=delimiter)
        return SeqRecord(Seq(header[field]), id=seq.id)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ClusterSets'
    log['COMMAND'] = 'barcode'
    log['FILE'] = os.path.basename(seq_file)
    log['IDENTITY'] = ident
    log['BARCODE_FIELD'] = barcode_field
    log['CLUSTER_FIELD'] = cluster_field
    log['CLUSTER_PREFIX'] = cluster_prefix
    log['CLUSTER_TOOL'] = cluster_tool
    log['NPROC'] = nproc
    printLog(log)

    # Set cluster tool
    try:
        cluster_func = map_cluster_tool.get(cluster_tool)
    except:
        printError('Invalid clustering tool %s.' % cluster_tool)

    # Check the minimum identity
    if ident < min_cluster_ident[cluster_tool]:
        printError('Minimum identity %s too low for clustering tool %s.' %
                   (str(ident), cluster_tool))

    # Count sequence file and parse into a list of SeqRecords
    result_count = countSeqFile(seq_file)
    barcode_iter = (_barcode(x) for x in readSeqFile(seq_file))

    # Perform clustering
    start_time = time()
    printMessage('Running %s' % cluster_tool, start_time=start_time, width=25)
    cluster_dict = cluster_func(barcode_iter,
                                ident=ident,
                                length_ratio=length_ratio,
                                seq_start=0,
                                seq_end=None,
                                threads=nproc,
                                cluster_exec=cluster_exec)
    printMessage('Done', start_time=start_time, end=True, width=25)

    # Determine file type
    if out_args['out_type'] is None:
        out_args['out_type'] = getFileType(seq_file)

    # Open output file handles
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file,
                                      'cluster-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])

    # Open indexed sequence file
    seq_dict = readSeqFile(seq_file, index=True)

    # Iterate over sequence records and update header with cluster annotation
    start_time = time()
    rec_count = pass_count = 0
    for cluster, id_list in cluster_dict.items():
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += len(id_list)

        # TODO:  make a generator. Figure out how to get pass_count updated
        # Define output sequences
        seq_output = [_header(seq_dict[x], cluster) for x in id_list]

        # Write output
        pass_count += len(seq_output)
        SeqIO.write(seq_output, pass_handle, out_args['out_type'])

    # Update progress
    printProgress(rec_count, result_count, 0.05, start_time=start_time)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['CLUSTERS'] = len(cluster_dict)
    log['SEQUENCES'] = result_count
    log['PASS'] = pass_count
    log['FAIL'] = rec_count - pass_count
    log['END'] = 'ClusterSets'
    printLog(log)

    # Close handles
    pass_handle.close()

    return pass_handle.name
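
# Usage sketch (not from the source): clustering UMI barcodes with the default
# clustering tool. The input file name and identity threshold are hypothetical;
# the BARCODE field is assumed to exist in the sequence headers.
def exampleClusterBarcodes():
    out_name = clusterBarcodes('umi_annotated.fastq',
                               ident=0.9,
                               barcode_field='BARCODE',
                               cluster_field='CLUSTER',
                               nproc=4)
    return out_name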
Example #18
def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args,
                      cluster_func, cluster_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func
    
    Returns: 
    None
    (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:

        # Iterate over Ig records to count and order by junction length
        result_count = 0
        records = {}
        # print 'Reading file...'
        db_iter = readDbFile(db_file)
        for rec in db_iter:
            records[rec.id] = rec
            result_count += 1
        records = OrderedDict(
            sorted(list(records.items()), key=lambda i: i[1].junction_length))

        # Define empty matrix to store assembled results
        dist_mat = np.zeros((result_count, result_count))

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Define successful output handle
        pass_handle = getOutputHandle(db_file,
                                      out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Define failed cloning output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Open log file
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Iterate over the results queue until the sentinel object is reached
        start_time = time()
        row_count = rec_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty(): continue
            else: result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break

            # Print progress for previous iteration
            if row_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(row_count, result_count, 0.05, start_time)

            # Update counts for iteration
            row_count += 1
            rec_count += len(result)

            # Add result row to distance matrix
            if result:
                dist_mat[list(range(result_count - len(result), result_count)),
                         result_count - len(result)] = result.results

        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Calculate linkage and carry out clustering
        # print dist_mat
        clusters = cluster_func(dist_mat, **cluster_args) \
                   if dist_mat is not None else None
        clones = {}
        # print clusters
        for i, c in enumerate(clusters):
            clones.setdefault(c, []).append(records[list(records.keys())[i]])

        # Write passed and failed records
        clone_count = pass_count = fail_count = 0
        if clones:
            for clone in clones.values():
                clone_count += 1
                for i, rec in enumerate(clone):
                    rec.annotations['CLONE'] = clone_count
                    pass_writer.writerow(rec.toDict())
                    pass_count += 1
                    #result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)

        else:
            for i, rec in enumerate(result.data):
                fail_writer.writerow(rec.toDict())
                fail_count += 1
                #result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)

        # Print final progress
        printProgress(row_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None: fail_handle.close()
        if log_handle is not None: log_handle.close()

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        alive.value = False
        raise

    return None
Example #19
def collectSeqQueue(alive,
                    result_queue,
                    collect_queue,
                    seq_file,
                    label,
                    index_field=None,
                    out_file=None,
                    out_args=default_out_args):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns.
      result_queue : Multiprocessing.Queue holding worker results.
      collect_queue : Multiprocessing.Queue to store collector return values.
      seq_file : sample sequence file name.
      label : task label used to tag the output files.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : Common output argument dictionary from parseCommonArgs.
      index_field : Field defining set membership for sequence sets
                    if None data queue contained individual records.

    Returns:
      None: Adds a dictionary with key value pairs to collect_queue containing
           'log' defining a log object,
           'out_files' defining the output file names
    """
    # Define output format
    out_type = getFileType(seq_file) if out_args['out_type'] is None \
               else out_args['out_type']

    # Wrapper for opening handles and writers
    def _open(x, label=label, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(seq_file,
                                     out_label='%s-%s' % (label, x),
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_type)
        return handle

    try:
        # Count records
        if index_field is None:
            result_count = countSeqFile(seq_file)
        else:
            result_count = countSeqSets(seq_file, index_field,
                                        out_args['delimiter'])

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Initialize output handles
        pass_handle, fail_handle = None, None

        # Iterate over the results queue until the sentinel object is reached
        start_time = time()
        set_count = seq_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty(): continue
            else: result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break

            # Print progress for previous iteration
            printProgress(set_count, result_count, 0.05, start_time=start_time)

            # Update counts for current iteration
            set_count += 1
            seq_count += result.data_count

            # Write log
            printLog(result.log, handle=log_handle)

            # Write records
            if result:
                pass_count += 1
                try:
                    SeqIO.write(result.results, pass_handle, out_type)
                except AttributeError:
                    # Open pass file
                    pass_handle = _open('pass')
                    SeqIO.write(result.results, pass_handle, out_type)
            else:
                fail_count += 1
                if out_args['failed']:
                    try:
                        SeqIO.write(result.data, fail_handle, out_type)
                    except AttributeError:
                        # Open fail file
                        fail_handle = _open('fail')
                        SeqIO.write(result.data, fail_handle, out_type)
        else:
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(set_count, result_count, 0.05, start_time=start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(
            pass_handle.name) if pass_handle is not None else None
        log['SEQUENCES'] = seq_count
        if index_field is not None:
            log['SETS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count

        # Close file handles and generate return data
        #collect_dict = {'log': log, 'pass': None, 'fail': None}
        collect_dict = {'log': log, 'out_files': []}
        if pass_handle is not None:
            #collect_dict['pass'] = pass_handle.name
            collect_dict['out_files'].append(pass_handle.name)
            pass_handle.close()
        if fail_handle is not None:
            #collect_dict['fail'] = fail_handle.name
            collect_dict['out_files'].append(fail_handle.name)
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()
        collect_queue.put(collect_dict)
    except:
        alive.value = False
        raise

    return None
Example #20
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None, 
                inner=False, keep_missing=False, out_file=None, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments: 
      seq_file : filename of the sequence file to sample from.
      max_missing : number of ambiguous characters to allow in a unique sequence.
      uniq_fields : a list of annotations that define a sequence as unique if they differ.
      copy_fields : a list of annotations to copy into unique sequence annotations.
      copy_actions : the list of collapseAnnotation actions to take on copy_fields.
      max_field : a numeric field whose maximum value determines the retained sequence.
      min_field : a numeric field whose minimum value determines the retained sequence.
      inner : if True exclude consecutive outer ambiguous characters from iterations and matching.
      keep_missing : if True retain sequences with more ambiguous characters than max_missing as unique.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
              
    Returns: 
      str: the collapsed output file name.
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)
    
    # Read input file
    in_type = getFileType(seq_file)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Open unique record output handle
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file,
                                      'collapse-unique',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Added list typing for compatibility issue with Python 2.7.5 on OS X
    # TypeError: object of type 'dictionary-keyiterator' has no len()
    search_keys = list(seq_dict.keys())
    dup_keys = []
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, 
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner, 
                                                         out_args['delimiter'])

        # Update list of duplicates
        dup_keys.extend(dup_list)
                
        # Break if no keys to search remain
        if len(search_keys) == 0:  break

    # Write unique sequences
    for val in uniq_dict.values():
        # Define output sequence
        out_seq = val.seq
        out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
        out_app = OrderedDict()
        if copy_fields is not None and copy_actions is not None:
            for f, a in zip(copy_fields, copy_actions):
                x = collapseAnnotation(val.annotations, a, f, delimiter=out_args['delimiter'])
                out_app[f] = x[f]
                out_ann.pop(f, None)
        out_app['DUPCOUNT'] = val.count
        out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
        out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
        out_seq.description = ''
        # Write unique sequence
        SeqIO.write(out_seq, pass_handle, out_args['out_type'])

        # Update log
        log = OrderedDict()
        log['HEADER'] = out_seq.id
        log['DUPCOUNT'] = val.count
        for i, k in enumerate(val.keys, start=1):
            log['ID%i' % i] = k
        for i, k in enumerate(val.keys, start=1):
            log['SEQ%i' % i] = str(seq_dict[k].seq)
        printLog(log, handle=log_handle)

    # Write sequences with high missing character counts as unique records
    if keep_missing:
        for k in search_keys:
            out_seq = seq_dict[k]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            SeqIO.write(out_seq, pass_handle, out_args['out_type'])

    # Write sequences with high missing character counts to the undetermined file
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    if out_args['failed']:
        # Write duplicate sequences 
        with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], 
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)
        
    # Close file handles
    pass_file = pass_handle.name
    if pass_handle is not None:  pass_handle.close()
    if log_handle is not None:  log_handle.close()
    
    return pass_file
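
# Usage sketch (not from the source): collapsing duplicate sequences while
# summing a copy field. The file name, field names, and copy action are
# placeholder assumptions about the annotations present in the input.
def exampleCollapseSeq():
    out_name = collapseSeq('consensus_reads.fastq',
                           max_missing=5,
                           uniq_fields=['CREGION'],
                           copy_fields=['CONSCOUNT'],
                           copy_actions=['sum'])
    return out_name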
Example #21
def groupSeqFile(seq_file, field, threshold=None, out_args=default_out_args):
    """
    Divides a sequence file into segments by description tags

    Arguments: 
      seq_file : filename of the sequence file to split
      field : The annotation field to split seq_file by
      threshold : The numerical threshold to group sequences by;
                  if None treat field as textual
      out_args : common output argument dictionary from parseCommonArgs

    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'group'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['THRESHOLD'] = threshold
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Determine total numbers of records
    rec_count = countSeqFile(seq_file)

    # Process sequences
    start_time = time()
    seq_count = 0
    if threshold is None:
        # Sort records into files based on textual field
        # Create set of unique field tags
        temp_iter = readSeqFile(seq_file)
        tag_list = getAnnotationValues(temp_iter,
                                       field,
                                       unique=True,
                                       delimiter=out_args['delimiter'])

        if sys.platform != 'win32':
            import resource
            # Increase open file handle limit if needed
            file_limit = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
            file_count = len(tag_list) + 256
            if file_limit < file_count and file_count <= 8192:
                #print file_limit, file_count
                resource.setrlimit(resource.RLIMIT_NOFILE,
                                   (file_count, file_count))
            elif file_count > 8192:
                e = '''OS file limit would need to be set to %i.
                    If you are sure you want to do this, then increase the 
                    file limit in the OS (via ulimit) and rerun this tool.
                    ''' % file_count
                printError(dedent(e))

        # Create output handles
        # out_label = '%s=%s' % (field, tag)
        handles_dict = {
            tag: getOutputHandle(seq_file,
                                 '%s-%s' % (field, tag),
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
            for tag in tag_list
        }

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequences
            tag = parseAnnotation(seq.description,
                                  delimiter=out_args['delimiter'])[field]
            SeqIO.write(seq, handles_dict[tag], out_args['out_type'])
    else:
        # Sort records into files based on numeric threshold
        threshold = float(threshold)
        # Create output handles
        handles_dict = {
            'under':
            getOutputHandle(seq_file,
                            'under-%.1g' % threshold,
                            out_dir=out_args['out_dir'],
                            out_name=out_args['out_name'],
                            out_type=out_args['out_type']),
            'atleast':
            getOutputHandle(seq_file,
                            'atleast-%.1g' % threshold,
                            out_dir=out_args['out_dir'],
                            out_name=out_args['out_name'],
                            out_type=out_args['out_type'])
        }

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequences
            tag = parseAnnotation(seq.description,
                                  delimiter=out_args['delimiter'])[field]
            tag = 'under' if float(tag) < threshold else 'atleast'
            SeqIO.write(seq, handles_dict[tag], out_args['out_type'])

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['SEQUENCES'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close output file handles
    for k in handles_dict:
        handles_dict[k].close()

    return [handles_dict[k].name for k in handles_dict]
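
# Usage sketch (not from the source): splitting a file by a textual annotation
# and by a numeric threshold. The file and field names are hypothetical.
def exampleGroupSeqFile():
    # One output file per unique SAMPLE value
    by_sample = groupSeqFile('annotated_reads.fastq', field='SAMPLE')
    # Two output files: DUPCOUNT < 2 and DUPCOUNT >= 2
    by_count = groupSeqFile('annotated_reads.fastq', field='DUPCOUNT', threshold=2)
    return by_sample, by_count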
def buildConsensus(seq_file, barcode_field=default_barcode_field, 
                   min_count=default_min_count, min_freq=default_min_freq,
                   min_qual=default_min_qual, primer_field=None, primer_freq=None,
                   max_gap=None, max_error=None, max_diversity=None,
                   copy_fields=None, copy_actions=None, dependent=False,
                   out_args=default_out_args, nproc=None, queue_size=None):
    """
    Generates consensus sequences

    Arguments: 
    seq_file = the sample sequence file name
    barcode_field = the annotation field containing set IDs
    min_count = threshold number of sequences to define a consensus 
    min_freq = the frequency cutoff to assign a base 
    min_qual = the quality cutoff to assign a base
    primer_field = the annotation field containing primer tags;
                   if None do not annotate with primer tags
    primer_freq = the minimum primer tag frequency that must be met to build a consensus;
                  if None do not filter by primer frequency
    max_gap = the maximum frequency of (., -) characters allowed before
               deleting a position; if None do not delete positions
    max_error = a threshold defining the maximum allowed error rate to retain a read group;
                if None do not calculate error rate
    max_diversity = a threshold defining the average pairwise error rate required to retain a read group;
                    if None do not calculate diversity
    dependent = if False treat barcode group sequences as independent data
    copy_fields = a list of annotations to copy into consensus sequence annotations;
                  if None no additional annotations will be copied
    copy_actions = the list of actions to take for each copy_fields;
                   one of ['set', 'majority', 'min', 'max', 'sum']
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                    
    Returns: 
    a list of successful output file names
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'BuildConsensus'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    log['MIN_COUNT'] = min_count
    log['MIN_FREQUENCY'] = min_freq
    log['MIN_QUALITY'] = min_qual
    log['MAX_GAP'] = max_gap
    log['PRIMER_FIELD'] = primer_field
    log['PRIMER_FREQUENCY'] = primer_freq
    log['MAX_ERROR'] = max_error
    log['MAX_DIVERSITY'] = max_diversity
    log['DEPENDENT'] = dependent
    log['COPY_FIELDS'] = ','.join(copy_fields) if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join(copy_actions) if copy_actions is not None else None
    log['NPROC'] = nproc
    printLog(log)
    
    # Set consensus building function
    in_type = getFileType(seq_file)
    if in_type == 'fastq':
        cons_func = qualityConsensus
        cons_args = {'min_qual': min_qual, 
                     'min_freq': min_freq,
                     'dependent': dependent}
    elif in_type == 'fasta':  
        cons_func = frequencyConsensus
        cons_args = {'min_freq': min_freq}
    else:
        sys.exit('ERROR:  Input file must be FASTA or FASTQ')
    
    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file,
                 'index_func': indexSeqSets, 
                 'index_args': index_args}
    # Define worker function and arguments
    work_func = processBCQueue
    work_args = {'cons_func': cons_func, 
                 'cons_args': cons_args,
                 'min_count': min_count,
                 'primer_field': primer_field,
                 'primer_freq': primer_freq,
                 'max_gap': max_gap,
                 'max_error': max_error,
                 'max_diversity': max_diversity,
                 'copy_fields': copy_fields,
                 'copy_actions': copy_actions,
                 'delimiter': out_args['delimiter']}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file,
                    'task_label': 'consensus',
                    'out_args': out_args,
                    'index_field': barcode_field}
    
    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)
        
    # Print log
    result['log']['END'] = 'BuildConsensus'
    printLog(result['log'])
        
    return result['out_files']
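
# Usage sketch (not from the source): building UMI consensus sequences from a
# hypothetical barcoded FASTQ file. The thresholds shown are placeholder
# assumptions, not recommended settings.
def exampleBuildConsensus():
    out_files = buildConsensus('umi_annotated.fastq',
                               barcode_field='BARCODE',
                               min_count=2,
                               min_freq=0.6,
                               max_error=0.1,
                               nproc=4)
    return out_files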
Example #23
def downsizeSeqFile(seq_file, max_count, out_args=default_out_args):
    """
    Splits a FASTA/FASTQ file into segments with a limited number of records

    Arguments: 
      seq_file : filename of the FASTA file to split
      max_count : number of records in each output file
      out_args : common output argument dictionary from parseCommonArgs

    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'count'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_COUNT'] = max_count
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None: out_args['out_type'] = in_type
    # Determine total numbers of records
    rec_count = countSeqFile(seq_file)

    # Loop through iterator writing each record and opening new output handle as needed
    start_time = time()
    seq_count, part_num = 0, 1
    out_handle = getOutputHandle(seq_file,
                                 'part%06i' % part_num,
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
    out_files = [out_handle.name]
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)

        # Update count
        seq_count += 1

        # Write records
        SeqIO.write(seq, out_handle, out_args['out_type'])
        # Break if total records reached to avoid extra empty file
        if seq_count == rec_count:
            break

        # Open new file if needed
        if seq_count % max_count == 0:
            out_handle.close()
            part_num += 1
            out_handle = getOutputHandle(seq_file,
                                         'part%06i' % part_num,
                                         out_dir=out_args['out_dir'],
                                         out_name=out_args['out_name'],
                                         out_type=out_args['out_type'])
            out_files.append(out_handle.name)

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, f in enumerate(out_files):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(f)
    log['SEQUENCES'] = rec_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_files
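
# Usage sketch (not from the source): splitting a large file into chunks of at
# most 10,000 records each. The input file name is hypothetical.
def exampleDownsizeSeqFile():
    out_files = downsizeSeqFile('big_run.fastq', max_count=10000)
    return out_files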
Example #24
def estimateSets(seq_file,
                 cons_func=frequencyConsensus,
                 cons_args={},
                 set_field=default_barcode_field,
                 min_count=default_min_count,
                 max_diversity=None,
                 out_args=default_out_args,
                 nproc=None,
                 queue_size=None):
    """
    Calculates error rates of sequence sets

    Arguments: 
      seq_file : the sample sequence file name
      cons_func : the function to use for consensus generation 
      cons_args : a dictionary of arguments for the consensus function
      set_field : the annotation field containing set IDs
      min_count : threshold number of sequences to consider a set
      max_diversity : a threshold defining the average pairwise error rate required to retain a read group;
                    if None do not calculate diversity
      out_args : common output argument dictionary from parseCommonArgs
      nproc : the number of processQueue processes;
            if None defaults to the number of CPUs
      queue_size : maximum size of the argument queue;
                 if None defaults to 2*nproc
                    
    Returns: 
      tuple : (position error, quality error, nucleotide pairwise error) output file names
    """
    # Define subcommand label dictionary
    cmd_dict = {frequencyConsensus: 'freq', qualityConsensus: 'qual'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'EstimateError'
    log['FILE'] = os.path.basename(seq_file)
    log['MODE'] = cmd_dict.get(cons_func, cons_func.__name__)
    log['SET_FIELD'] = set_field
    log['MIN_COUNT'] = min_count
    log['MAX_DIVERSITY'] = max_diversity
    log['NPROC'] = nproc
    printLog(log)

    # Check input file type
    in_type = getFileType(seq_file)
    if in_type != 'fastq':
        printError('Input file must be FASTQ.')

    # Define feeder function and arguments
    index_args = {'field': set_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {
        'seq_file': seq_file,
        'index_func': indexSeqSets,
        'index_args': index_args
    }
    # Define worker function and arguments
    work_func = processEEQueue
    work_args = {
        'cons_func': cons_func,
        'cons_args': cons_args,
        'min_count': min_count,
        'max_diversity': max_diversity
    }
    # Define collector function and arguments
    collect_func = collectEEQueue
    collect_args = {
        'seq_file': seq_file,
        'out_args': out_args,
        'set_field': set_field
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'EstimateError'
    printLog(result['log'])

    return result['out_files']
Example #25
def sortSeqFile(seq_file,
                field,
                numeric=False,
                max_count=None,
                out_args=default_out_args):
    """
    Sorts a sequence file by annotation fields

    Arguments: 
      seq_file : filename of the sequence file to split
      field : the annotation field to sort sequences by
      numeric : if True sort field numerically;
                if False sort field alphabetically
      max_count : maximum number of records in each output file
                  if None do not create multiple files
      out_args : common output argument dictionary from parseCommonArgs
    
    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    log['MAX_COUNT'] = max_count
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_dict = readSeqFile(seq_file, index=True)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Get annotations and sort seq_dict by annotation values
    tag_dict = {
        k: parseAnnotation(seq_dict[k].description,
                           delimiter=out_args['delimiter'])[field]
        for k in seq_dict
    }
    if numeric: tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get)

    # Determine total numbers of records
    rec_count = len(seq_dict)
    if max_count is not None and max_count >= rec_count: max_count = None

    # Open initial output file handles
    file_count = 1
    if max_count is None: out_label = 'sorted'
    else: out_label = 'sorted-part%06i' % file_count
    out_handle = getOutputHandle(seq_file,
                                 out_label,
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
    out_files = [out_handle.name]

    # Loop through sorted sequence dictionary keys
    start_time = time()
    last_tag = None
    saved_keys = []
    seq_count = chunk_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration and update count
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)
        seq_count += 1

        # Write saved group of sequences when tag changes
        if last_tag is not None and tag_dict[key] != last_tag:
            # Open new output file if needed
            if max_count is not None and chunk_count + len(
                    saved_keys) > max_count:
                # Update partition counts
                file_count += 1
                chunk_count = 0
                # Open new file handle
                out_handle.close()
                out_handle = getOutputHandle(seq_file,
                                             'sorted-part%06i' % file_count,
                                             out_dir=out_args['out_dir'],
                                             out_name=out_args['out_name'],
                                             out_type=out_args['out_type'])
                # Append output file name to out_files
                out_files.append(out_handle.name)

            # Write saved sequences
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            # Reset saved keys to current key only
            saved_keys = [key]
        else:
            # Update list of saved keys if tag is unchanged
            saved_keys.append(key)

        # Check if total records reached, write all saved keys, and exit loop
        if seq_count == rec_count:
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            out_handle.close()
            break

        # Update tag tracker
        last_tag = tag_dict[key]

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, f in enumerate(out_files):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(f)
    log['SEQUENCES'] = seq_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_files
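
# Usage sketch (not from the source): sorting by a numeric annotation and
# splitting the output into parts of roughly 50,000 records. File and field
# names are hypothetical.
def exampleSortSeqFile():
    out_files = sortSeqFile('annotated_reads.fastq',
                            field='DUPCOUNT',
                            numeric=True,
                            max_count=50000)
    return out_files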
Example #26
def buildConsensus(seq_file,
                   barcode_field=default_barcode_field,
                   min_count=default_consensus_min_count,
                   min_freq=default_consensus_min_freq,
                   min_qual=default_consensus_min_qual,
                   primer_field=None,
                   primer_freq=None,
                   max_gap=None,
                   max_error=None,
                   max_diversity=None,
                   copy_fields=None,
                   copy_actions=None,
                   dependent=False,
                   out_file=None,
                   out_args=default_out_args,
                   nproc=None,
                   queue_size=None):
    """
    Generates consensus sequences

    Arguments: 
      seq_file : the sample sequence file name
      barcode_field : the annotation field containing set IDs
      min_count : threshold number of sequences to define a consensus
      min_freq : the frequency cutoff to assign a base
      min_qual : the quality cutoff to assign a base
      primer_field : the annotation field containing primer tags;
                     if None do not annotate with primer tags
      primer_freq : the minimum primer tag frequency that must be met to build a consensus;
                    if None do not filter by primer frequency
      max_gap : the maximum frequency of (., -) characters allowed before
                deleting a position; if None do not delete positions
      max_error : a threshold defining the maximum allowed error rate to retain a read group;
                  if None do not calculate error rate
      max_diversity : a threshold defining the average pairwise error rate required to retain a read group;
                      if None do not calculate diversity
      dependent : if False treat barcode group sequences as independent data
      copy_fields : a list of annotations to copy into consensus sequence annotations;
                    if None no additional annotations will be copied
      copy_actions : the list of actions to take for each copy_fields;
                     one of ['set', 'majority', 'min', 'max', 'sum']
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
            if None defaults to the number of CPUs
      queue_size : maximum size of the argument queue;
                 if None defaults to 2*nproc
                    
    Returns: 
      list : a list of successful output file names.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'BuildConsensus'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    log['MIN_COUNT'] = min_count
    log['MIN_FREQUENCY'] = min_freq
    log['MIN_QUALITY'] = min_qual
    log['MAX_GAP'] = max_gap
    log['PRIMER_FIELD'] = primer_field
    log['PRIMER_FREQUENCY'] = primer_freq
    log['MAX_ERROR'] = max_error
    log['MAX_DIVERSITY'] = max_diversity
    log['DEPENDENT'] = dependent
    log['COPY_FIELDS'] = ','.join(
        copy_fields) if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join(
        copy_actions) if copy_actions is not None else None
    log['NPROC'] = nproc
    printLog(log)

    # Set consensus building function
    in_type = getFileType(seq_file)
    if in_type == 'fastq':
        cons_func = qualityConsensus
        cons_args = {
            'min_qual': min_qual,
            'min_freq': min_freq,
            'dependent': dependent
        }
    elif in_type == 'fasta':
        cons_func = frequencyConsensus
        cons_args = {'min_freq': min_freq}
    else:
        printError('Input file must be FASTA or FASTQ.')

    # Define feeder function and arguments
    index_args = {'field': barcode_field, 'delimiter': out_args['delimiter']}
    feed_func = feedSeqQueue
    feed_args = {
        'seq_file': seq_file,
        'index_func': indexSeqSets,
        'index_args': index_args
    }
    # Define worker function and arguments
    work_func = processQueue
    work_args = {
        'cons_func': cons_func,
        'cons_args': cons_args,
        'min_count': min_count,
        'primer_field': primer_field,
        'primer_freq': primer_freq,
        'max_gap': max_gap,
        'max_error': max_error,
        'max_diversity': max_diversity,
        'copy_fields': copy_fields,
        'copy_actions': copy_actions,
        'delimiter': out_args['delimiter']
    }
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {
        'seq_file': seq_file,
        'label': 'consensus',
        'out_file': out_file,
        'out_args': out_args,
        'index_field': barcode_field
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'BuildConsensus'
    printLog(result['log'])

    return result['out_files']
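# Usage sketch (not from the original source): build UMI consensus sequences
# from a barcoded FASTQ file. The file name, barcode field, and thresholds
# below are illustrative assumptions.
consensus_files = buildConsensus(seq_file='reads_barcoded.fastq',
                                 barcode_field='BARCODE',
                                 min_count=2,
                                 min_freq=0.6,
                                 min_qual=20,
                                 nproc=2)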
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments: 
    seq_file = the sequence file name
    modify_func = the function defining the modification operation
    modify_args = a dictionary of arguments to pass to modify_func
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    the output file name
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):  
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)
    
    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type
    out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type=out_args['out_type'])

    # Count records
    result_count = countSeqFile(seq_file)
    
    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time)
        
        # Update counts
        seq_count += 1
        
        # Modify header
        header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)
        
        # Write new sequence
        seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)    
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'               
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
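# Usage sketch (not from the original source): add a fixed SAMPLE annotation to
# every header of a FASTA file. The file name is illustrative, and the
# 'fields'/'values' keywords are assumptions about the addHeader signature.
reheader_file = modifyHeaders(seq_file='sequences.fasta',
                              modify_func=addHeader,
                              modify_args={'fields': ['SAMPLE'],
                                           'values': ['S1']})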
Exemple #28
0
def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
      seq_file : the sequence file name.
      convert_func : the function used to convert sequence headers.
      convert_args : a dictionary of arguments to pass to convert_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the output sequence file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader: 'generic',
                convert454Header: '454',
                convertGenbankHeader: 'genbank',
                convertIlluminaHeader: 'illumina',
                convertIMGTHeader: 'imgt',
                convertMIGECHeader: 'migec',
                convertSRAHeader: 'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Wrapper for opening handles and writers
    def _open(x, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(seq_file,
                                     'convert-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle

    # Count records
    result_count = countSeqFile(seq_file)

    # Set additional conversion arguments
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter': out_args['delimiter']})

    # Initialize file handles
    pass_handle, fail_handle = None, None

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            try:
                SeqIO.write(seq, pass_handle, out_args['out_type'])
            except AttributeError:
                # Open output file
                pass_handle = _open('pass')
                SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if out_args['failed']:
                # Write unconverted sequences
                try:
                    SeqIO.write(seq, fail_handle, out_args['out_type'])
                except AttributeError:
                    # Open output file
                    fail_handle = _open('fail')
                    SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles
    if pass_handle is not None:  pass_handle.close()
    if fail_handle is not None:  fail_handle.close()

    return pass_handle.name
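# Usage sketch (not from the original source): convert raw Illumina headers to
# the pRESTO annotation format. The input and output file names are
# illustrative assumptions; convertIlluminaHeader is taken from cmd_dict above.
pass_file = convertHeaders(seq_file='run1_R1.fastq',
                           convert_func=convertIlluminaHeader,
                           out_file='run1_R1_convert-pass.fastq')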
Exemple #29
0
def filterSeq(seq_file,
              filter_func,
              filter_args={},
              out_file=None,
              out_args=default_out_args,
              nproc=None,
              queue_size=None):
    """
    Filters sequences by fraction of ambiguous nucleotides
    
    Arguments: 
      seq_file : the sequence file to filter.
      filter_func : the function to use for filtering sequences.
      filter_args : a dictionary of arguments to pass to filter_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.
                 
    Returns:
      list: a list of successful output file names
    """
    # Define output file label dictionary
    cmd_dict = {
        filterLength: 'length',
        filterMissing: 'missing',
        filterRepeats: 'repeats',
        filterQuality: 'quality',
        maskQuality: 'maskqual',
        trimQuality: 'trimqual'
    }

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'FilterSeq'
    log['COMMAND'] = cmd_dict.get(filter_func, filter_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(filter_args):
        log[k.upper()] = filter_args[k]
    log['NPROC'] = nproc
    printLog(log)

    # Check input type
    in_type = getFileType(seq_file)
    if in_type != 'fastq' and filter_func in (filterQuality, maskQuality,
                                              trimQuality):
        printError('Input file must be FASTQ for %s mode.' %
                   cmd_dict[filter_func])

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processSeqQueue
    work_args = {'process_func': filter_func, 'process_args': filter_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {
        'seq_file': seq_file,
        'label': cmd_dict[filter_func],
        'out_file': out_file,
        'out_args': out_args
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'FilterSeq'
    printLog(result['log'])

    return result['out_files']
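# Usage sketch (not from the original source): drop reads below a length
# cutoff. The file name is illustrative, and the 'min_length' keyword is an
# assumption about the filterLength signature.
filtered_files = filterSeq(seq_file='reads.fastq',
                           filter_func=filterLength,
                           filter_args={'min_length': 250},
                           nproc=2)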
Exemple #30
0
def collectDbQueue(alive,
                   result_queue,
                   collect_queue,
                   db_file,
                   task_label,
                   out_args,
                   add_fields=None):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      result_queue : multiprocessing.Queue holding worker results
      collect_queue : multiprocessing.Queue to store collector return values
      db_file : Database file name
      task_label : Task label used to tag the output files
      out_args : Common output argument dictionary from parseCommonArgs
      add_fields : List of fields added to the writer not present in the in_file;
                 if None do not add fields

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
            'log' defining a log object,
            'out_files' defining the output file names
    """
    try:
        result_count = countDbFile(db_file)

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Define valid alignment output handle
        pass_handle = getOutputHandle(db_file,
                                      '%s-pass' % task_label,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)
        # Define failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          '%s-fail' % task_label,
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Iterate over results queue until sentinel object reached
        start_time = time()
        set_count = rec_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty(): continue
            else: result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break

            # Print progress for previous iteration
            printProgress(pass_count, result_count, 0.05, start_time)

            # Update counts for current iteration
            set_count += 1
            rec_count += result.data_count

            # Write log
            if result.log is not None:
                printLog(result.log, handle=log_handle)

            # Write alignments
            if result:
                pass_count += result.data_count
                if isinstance(result.results, IgRecord):
                    pass_writer.writerow(result.results.toDict())
                else:
                    for rec in result.results:
                        pass_writer.writerow(rec.toDict())
            else:
                fail_count += result.data_count
                if fail_handle is not None:
                    if isinstance(result.data, IgRecord):
                        fail_writer.writerow(result.data.toDict())
                    else:
                        for rec in result.data:
                            fail_writer.writerow(rec.toDict())
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(pass_count, result_count, 0.05, start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['GROUPS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None: fail_handle.close()
        if log_handle is not None: log_handle.close()
    except:
        alive.value = False
        raise

    return None
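# Wiring sketch (not from the original source): collectDbQueue is intended to
# be handed to manageProcesses as the collector, mirroring the feeder/worker/
# collector pattern used above. The database file name and added field are
# illustrative assumptions; feed_func/work_func and their argument
# dictionaries must be defined elsewhere.
collect_func = collectDbQueue
collect_args = {'db_file': 'db.tab',
                'task_label': 'assign',
                'out_args': default_out_args,
                'add_fields': ['CLONE']}
# result = manageProcesses(feed_func, work_func, collect_func,
#                          feed_args, work_args, collect_args, nproc, queue_size)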
def collectQueue(alive, result_queue, collect_queue, db_file, out_args, cluster_func=None, cluster_args={}):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func
    
    Returns: 
    None
    (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
        # Count records and define output format 
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
        result_count = countDbFile(db_file)
        
        # Define successful output handle
        pass_handle = getOutputHandle(db_file, 
                                      out_label='clone-pass', 
                                      out_dir=out_args['out_dir'], 
                                      out_name=out_args['out_name'], 
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')
        
        # Define failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail', 
                                          out_dir=out_args['out_dir'], 
                                          out_name=out_args['out_name'], 
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:  
            log_handle = None
        else:  
            log_handle = open(out_args['log_file'], 'w')
    except:
        #sys.stderr.write('Exception in collector file opening step\n')
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        #print 'START COLLECT', alive.value
        # Iterate over results queue until sentinel object reached
        start_time = time()
        rec_count = clone_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break
            #print "COLLECT", alive.value, result['id']
            
            # Print progress for previous iteration and update record count
            if rec_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(rec_count, result_count, 0.05, start_time) 
            rec_count += len(result.data)
            
            # Write passed and failed records
            if result:
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone):
                        rec.annotations['CLONE'] = clone_count
                        pass_writer.writerow(rec.toDict())
                        pass_count += 1
                        result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
    
            else:
                for i, rec in enumerate(result.data):
                    if fail_writer is not None: fail_writer.writerow(rec.toDict())
                    fail_count += 1
                    result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)
                    
            # Write log
            printLog(result.log, handle=log_handle)
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
        
        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:  fail_handle.close()
        if log_handle is not None:  log_handle.close()
                
        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        #sys.stderr.write('Exception in collector result processing step\n')
        alive.value = False
        raise

    return None
def collectDbQueue(alive, result_queue, collect_queue, db_file, task_label, out_args,
                   add_fields=None):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      result_queue : multiprocessing.Queue holding worker results
      collect_queue : multiprocessing.Queue to store collector return values
      db_file : Database file name
      task_label : Task label used to tag the output files
      out_args : Common output argument dictionary from parseCommonArgs
      add_fields : List of fields added to the writer not present in the in_file;
                 if None do not add fields

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
            'log' defining a log object,
            'out_files' defining the output file names
    """
    try:
        result_count = countDbFile(db_file)

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Define valid alignment output handle
        pass_handle = getOutputHandle(db_file,
                                      '%s-pass' % task_label,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)
        # Define failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          '%s-fail'  % task_label,
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Iterate over results queue until sentinel object reached
        start_time = time()
        set_count = rec_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break

            # Print progress for previous iteration
            printProgress(pass_count, result_count, 0.05, start_time)

            # Update counts for current iteration
            set_count += 1
            rec_count += result.data_count

            # Write log
            printLog(result.log, handle=log_handle)

            # Write alignments
            if result:
                pass_count += result.data_count
                for rec in result.results:
                    pass_writer.writerow(rec.toDict())
            else:
                fail_count += result.data_count
                if fail_handle is not None:
                    for rec in result.data:
                        fail_writer.writerow(rec.toDict())
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(pass_count, result_count, 0.05, start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['GROUPS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:  fail_handle.close()
        if log_handle is not None:  log_handle.close()
    except:
        alive.value = False
        raise

    return None
def convertHeaders(seq_file, convert_func, convert_args={}, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
    seq_file = the sequence file name
    convert_func = the function used to convert sequence headers
    convert_args = a dictionary of arguments to pass to convert_func
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output sequence file name
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader:'generic',
                convert454Header:'454',
                convertGenbankHeader:'genbank',
                convertIlluminaHeader:'illumina',
                convertIMGTHeader:'imgt',
                convertSRAHeader:'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count records
    result_count = countSeqFile(seq_file)

    # Open output file handles
    pass_handle = getOutputHandle(seq_file,
                                  'convert-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    if out_args['failed']:
        fail_handle = getOutputHandle(seq_file,
                                      'convert-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    else:
        fail_handle = None

    # Set additional conversion arguments
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter':out_args['delimiter']})

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if fail_handle is not None:
                # Write unconverted sequences
                SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:  fail_handle.close()

    return pass_handle.name
Exemple #34
0
def pairSeq(seq_file_1,
            seq_file_2,
            fields_1=None,
            fields_2=None,
            action=None,
            coord_type=default_coord,
            out_args=default_out_args):
    """
    Synchronizes paired-end files and copies annotations between them

    Arguments: 
      seq_file_1 : the file containing the grouped sequences and annotations.
      seq_file_2 : the file to assign annotations to from seq_file_1.
      fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records;
                 if None do not copy any annotations.
      fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records;
                 if None do not copy any annotations.
      action : the collapse action to take on all copied annotation if they already exist in the
               target header.
      coord_type : the sequence header format.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      list: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2).
    """

    # Define private functions
    def _key_func(x):
        return getCoordKey(x,
                           coord_type=coord_type,
                           delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_1,
                                    out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_2,
                                    out_type=out_type_2)

    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_1,
                                        out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_2,
                                        out_type=out_type_2)
        pass_keys = list()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id,
                              coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(
                        ann_2,
                        copy_ann,
                        prepend=True,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_1,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_2.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(
                        ann_1,
                        copy_ann,
                        prepend=False,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_2,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_1.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and update the paired key list used to find unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None: pass_keys.append(coord_2)
            else: SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        pass_keys = set(pass_keys)
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:
            SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)

    # Close file handles
    pass_handle_1.close()
    pass_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
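# Usage sketch (not from the original source): pair the mates of two FASTQ
# files and copy the BARCODE annotation from file 1 onto file 2. The file
# names, field name, and 'illumina' coordinate type are illustrative
# assumptions.
paired_files = pairSeq(seq_file_1='run1_R1.fastq',
                       seq_file_2='run1_R2.fastq',
                       fields_1=['BARCODE'],
                       coord_type='illumina')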