Example #1
    def setUp(self):
        print('-> %s()' % self._testMethodName)

        # Set pandas options
        pd.set_option('display.width', 200)

        # Load data
        self.head_list = [s.upper() for s in readSeqFile(head_file)]
        self.tail_list = [s.upper() for s in readSeqFile(tail_file)]
        self.ref_dict = {s.id: s.upper() for s in readSeqFile(ref_file)}

        self.start = time.time()
Example #2
def _read_pairs(seq_file_1, seq_file_2):
    # Note: coord_type and delimiter are captured from the enclosing scope
    iter_1 = readSeqFile(seq_file_1, index=False)
    iter_2 = readSeqFile(seq_file_2, index=False)
    for seq_1, seq_2 in zip(iter_1, iter_2):
        key_1 = getCoordKey(seq_1.description, coord_type=coord_type,
                            delimiter=delimiter)
        key_2 = getCoordKey(seq_2.description, coord_type=coord_type,
                            delimiter=delimiter)
        if key_1 == key_2:
            yield (key_1, [seq_1, seq_2])
        else:
            raise Exception('Coordinates for sequences %s and %s do not match'
                            % (key_1, key_2))
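A minimal standalone sketch of the same lock-step pairing idea, using Bio.SeqIO directly in place of pRESTO's readSeqFile and getCoordKey (the file names and the use of the record id as the coordinate key are assumptions for illustration):

from Bio import SeqIO

def read_pairs_sketch(fasta_1, fasta_2):
    # Iterate both files in lock step; records are assumed to arrive in the
    # same order, so each position must share a coordinate key (here simply
    # the record id)
    iter_1 = SeqIO.parse(fasta_1, 'fasta')
    iter_2 = SeqIO.parse(fasta_2, 'fasta')
    for seq_1, seq_2 in zip(iter_1, iter_2):
        if seq_1.id != seq_2.id:
            raise ValueError('Coordinates for sequences %s and %s do not match'
                             % (seq_1.id, seq_2.id))
        yield (seq_1.id, [seq_1, seq_2])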
Example #3
def feedSeqQueue(alive, data_queue, seq_file, index_func=None, index_args={}):
    """
    Feeds the data queue with SeqRecord objects

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      data_queue : multiprocessing.Queue to hold data for processing
      seq_file : Sequence file to read input from
      index_func : Function to use to define sequence sets
                   if None do not index sets and feed individual records
      index_args : Dictionary of arguments to pass to index_func

    Returns:
      None
    """
    try:
        # Read input file and index sequence sets if required
        if index_func is None:
            seq_iter = readSeqFile(seq_file)
            data_iter = ((s.id, s) for s in seq_iter)
        else:
            seq_dict = readSeqFile(seq_file, index=True)
            index_dict = index_func(seq_dict, **index_args)
            data_iter = ((k, [seq_dict[i] for i in v]) \
                         for k, v in index_dict.items())
    except:
        alive.value = False
        raise

    try:
        # Iterate over data_iter and feed data queue
        while alive.value:
            # Get data from the iterator when the queue has capacity
            if data_queue.full(): continue
            else: data = next(data_iter, None)
            # Exit upon reaching end of iterator
            if data is None: break

            # Feed queue
            data_queue.put(SeqData(*data))
        else:
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        raise

    return None
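A minimal sketch of how a feeder like this is typically wired to a worker process, using only the standard library; SeqData and the pRESTO plumbing are replaced by plain items, and all names are illustrative:

import multiprocessing as mp

def feed(alive, data_queue, items):
    # Same control pattern as feedSeqQueue: stop early if a sibling
    # process has flipped the alive flag
    for item in items:
        if not alive.value:
            return
        data_queue.put(item)
    data_queue.put(None)  # sentinel marking the end of input

def work(alive, data_queue):
    while alive.value:
        item = data_queue.get()
        if item is None:
            break
        print('processing', item)

if __name__ == '__main__':
    alive = mp.Value('b', True)
    queue = mp.Queue()
    feeder = mp.Process(target=feed, args=(alive, queue, range(5)))
    worker = mp.Process(target=work, args=(alive, queue))
    feeder.start(); worker.start()
    feeder.join(); worker.join()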
Example #5
def _read_pairs(seq_file_1, seq_file_2):
    # Note: coord_type and delimiter are captured from the enclosing scope
    iter_1 = readSeqFile(seq_file_1, index=False)
    iter_2 = readSeqFile(seq_file_2, index=False)
    for seq_1, seq_2 in zip(iter_1, iter_2):
        key_1 = getCoordKey(seq_1.description, coord_type=coord_type,
                            delimiter=delimiter)
        key_2 = getCoordKey(seq_2.description, coord_type=coord_type,
                            delimiter=delimiter)
        if key_1 == key_2 or coord_type == 'r1':
            yield (key_1, [seq_1, seq_2])
        else:
            raise Exception('Coordinates for sequences %s and %s do not match'
                            % (key_1, key_2))
Example #6
def getSeqDict(seq_file):
    """
    Create a dictionary from a sequence file.

    Arguments:
      seq_file : sequence file.

    Returns:
        dict : sequence descriptions as keys with Bio.SeqRecord objects as values.
    """
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file), key_function=lambda x: x.description)

    return seq_dict
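The same description-keyed indexing can be reproduced with Bio.SeqIO alone; a sketch, where the FASTA path is hypothetical:

from Bio import SeqIO

# Key records by the full description line rather than the default id,
# mirroring getSeqDict above
seq_dict = SeqIO.to_dict(SeqIO.parse('input.fasta', 'fasta'),
                         key_function=lambda rec: rec.description)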
Example #7
def getIDforIMGT(seq_file):
    """
    Create a sequence ID translation using IMGT truncation.

    Arguments:
      seq_file : a fasta file of sequences input to IMGT.

    Returns:
      dict : a dictionary with the IMGT-truncated ID as the key and the full sequence description as the value.
    """

    # Create a sequence ID translation by truncating IDs to 50 characters
    # and replacing special characters
    ids = {}
    for rec in readSeqFile(seq_file):
        if len(rec.description) <= 50:
            id_key = rec.description
        else:
            id_key = re.sub(r'\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
        ids.update({id_key: rec.description})

    return ids
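A worked example of the truncation rule on a plain string, independent of any sequence file (the header is invented):

import re

desc = 'SEQ12345|CONSCOUNT=10|PRCONS=Human-IGHG extra text pushing past fifty characters'
id_key = desc if len(desc) <= 50 else re.sub(r'\||\s|!|&|\*|<|>|\?', '_', desc[:50])
print(id_key)  # first 50 characters, with '|' and whitespace replaced by '_'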
Example #8
def groupSeqFile(seq_file, field, threshold=None, out_args=default_out_args):
    """
    Divides a sequence file into segments by description tags

    Arguments: 
      seq_file : filename of the sequence file to split
      field : The annotation field to split seq_file by
      threshold : The numerical threshold for grouping sequences by;
                  if None treat field as textual
      out_args : common output argument dictionary from parseCommonArgs

    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'group'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['THRESHOLD'] = threshold
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Determine total numbers of records
    rec_count = countSeqFile(seq_file)

    # Process sequences
    start_time = time()
    seq_count = 0
    if threshold is None:
        # Sort records into files based on textual field
        # Create set of unique field tags
        temp_iter = readSeqFile(seq_file)
        tag_list = getAnnotationValues(temp_iter,
                                       field,
                                       unique=True,
                                       delimiter=out_args['delimiter'])

        if sys.platform != 'win32':
            import resource
            # Increase open file handle limit if needed
            file_limit = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
            file_count = len(tag_list) + 256
            if file_limit < file_count and file_count <= 8192:
                #print file_limit, file_count
                resource.setrlimit(resource.RLIMIT_NOFILE,
                                   (file_count, file_count))
            elif file_count > 8192:
                e = '''OS file limit would need to be set to %i.
                    If you are sure you want to do this, then increase the 
                    file limit in the OS (via ulimit) and rerun this tool.
                    ''' % file_count
                printError(dedent(e))

        # Create output handles
        # out_label = '%s=%s' % (field, tag)
        handles_dict = {
            tag: getOutputHandle(seq_file,
                                 '%s-%s' % (field, tag),
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
            for tag in tag_list
        }

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequences
            tag = parseAnnotation(seq.description,
                                  delimiter=out_args['delimiter'])[field]
            SeqIO.write(seq, handles_dict[tag], out_args['out_type'])
    else:
        # Sort records into files based on numeric threshold
        threshold = float(threshold)
        # Create output handles
        handles_dict = {
            'under':
            getOutputHandle(seq_file,
                            'under-%.1g' % threshold,
                            out_dir=out_args['out_dir'],
                            out_name=out_args['out_name'],
                            out_type=out_args['out_type']),
            'atleast':
            getOutputHandle(seq_file,
                            'atleast-%.1g' % threshold,
                            out_dir=out_args['out_dir'],
                            out_name=out_args['out_name'],
                            out_type=out_args['out_type'])
        }

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequences
            tag = parseAnnotation(seq.description,
                                  delimiter=out_args['delimiter'])[field]
            tag = 'under' if float(tag) < threshold else 'atleast'
            SeqIO.write(seq, handles_dict[tag], out_args['out_type'])

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['SEQUENCES'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close output file handles
    for k in handles_dict:
        handles_dict[k].close()

    return [handles_dict[k].name for k in handles_dict]
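A compact sketch of the core grouping idea, writing records to one file per annotation value with plain Bio.SeqIO; the SAMPLE field, the '|'-delimited header convention, and the paths are assumptions for illustration:

from Bio import SeqIO

handles = {}
for rec in SeqIO.parse('input.fasta', 'fasta'):
    # Pull a SAMPLE=<tag> annotation out of the description; records
    # without the annotation fall into an 'unknown' group
    tags = dict(f.split('=', 1) for f in rec.description.split('|')[1:] if '=' in f)
    tag = tags.get('SAMPLE', 'unknown')
    if tag not in handles:
        handles[tag] = open('input_SAMPLE-%s.fasta' % tag, 'w')
    SeqIO.write(rec, handles[tag], 'fasta')
for h in handles.values():
    h.close()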
Example #9
def tableHeaders(seq_file, fields, out_args=default_out_args):
    """
    Builds a table of sequence header annotations

    Arguments:
      seq_file : the sequence file name
      fields : the list of fields to output
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      str: the output table file name
    """
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = 'table'
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)
    
    # Open file handles
    seq_iter = readSeqFile(seq_file)
    out_handle = getOutputHandle(seq_file, out_label='headers', out_dir=out_args['out_dir'], 
                                 out_name=out_args['out_name'], out_type='tab')
    # Count records
    result_count = countSeqFile(seq_file)
    
    # Open csv writer and write header
    out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='', 
                                delimiter='\t', fieldnames=fields)
    out_writer.writeheader()
    
    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time)
        
        # Get annotations
        seq_count += 1
        ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter'])

        # Write records
        if ann:
            pass_count += 1
            out_writer.writerow(ann)
        else:
            fail_count += 1
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseHeaders'
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
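The tab-delimited output above reduces to csv.DictWriter with extrasaction='ignore' and an empty restval; a standalone sketch with invented annotation dicts:

import csv
import sys

fields = ['ID', 'SAMPLE', 'COUNT']
rows = [{'ID': 'seq1', 'SAMPLE': 'A', 'COUNT': '3', 'EXTRA': 'ignored'},
        {'ID': 'seq2', 'COUNT': '1'}]  # missing SAMPLE becomes restval ''

writer = csv.DictWriter(sys.stdout, fieldnames=fields, delimiter='\t',
                        extrasaction='ignore', restval='')
writer.writeheader()
writer.writerows(rows)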
Example #10
def selectSeqFile(seq_file,
                  field,
                  value_list=None,
                  value_file=None,
                  negate=False,
                  out_file=None,
                  out_args=default_out_args):
    """
    Select from a sequence file

    Arguments:
      seq_file : filename of the sequence file to sample from.
      field : the annotation field to check for required values.
      value_list : a list of annotation values that a sample must contain one of.
      value_file : a tab delimited file containing values to select.
      negate : if True select entries that do not contain the specified values.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: output file name.
    """

    # Reads value_file
    def _read_file(value_file, field):
        field_list = []
        try:
            with open(value_file, 'rt') as handle:
                reader_dict = csv.DictReader(handle, dialect='excel-tab')
                for row in reader_dict:
                    field_list.append(row[field])
        except IOError:
            printError('File %s cannot be read.' % value_file)
        except:
            printError('File %s is invalid.' % value_file)

        return field_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    if value_list is not None:
        log['VALUE_LIST'] = ','.join([str(x) for x in value_list])
    if value_file is not None:
        log['VALUE_FILE'] = os.path.basename(value_file)
    log['NOT'] = negate
    printLog(log)

    # Read value_file
    if value_list is not None and value_file is not None:
        printError('Specify only one of value_list and value_file.')
    elif value_file is not None:
        value_list = _read_file(value_file, field)

    # Read sequence file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Open output handle
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'selected',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])

    # Generate subset of records
    start_time = time()
    pass_count, fail_count, rec_count = 0, 0, 0
    value_set = set(value_list)
    for rec in seq_iter:
        printCount(rec_count, 1e5, start_time=start_time)
        rec_count += 1

        # Parse annotations into a list of values
        ann = parseAnnotation(rec.description,
                              delimiter=out_args['delimiter'])[field]
        ann = ann.split(out_args['delimiter'][2])

        # Write
        if xor(negate, not value_set.isdisjoint(ann)):
            # Write
            SeqIO.write(rec, out_handle, out_args['out_type'])
            pass_count += 1
        else:
            fail_count += 1

    printCount(rec_count, 1e5, start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'SplitSeq'
    printLog(log)

    return out_handle.name
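The pass/fail test above is an exclusive-or between the negate flag and set membership; a small demonstration of the four cases:

from operator import xor

value_set = {'A', 'B'}
for ann, negate in [(['A'], False), (['C'], False), (['A'], True), (['C'], True)]:
    # Keep the record when it matches and negate is False,
    # or when it does not match and negate is True
    keep = xor(negate, not value_set.isdisjoint(ann))
    print(ann, 'negate=%s' % negate, '->', 'pass' if keep else 'fail')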
Example #11
def samplePairSeqFile(seq_file_1,
                      seq_file_2,
                      max_count,
                      field=None,
                      values=None,
                      coord_type=default_coord,
                      out_args=default_out_args):
    """
    Samples from paired-end sequence files

    Arguments: 
      seq_file_1 : filename of the first paired-end sequence file
      seq_file_2 : filename of the second paired-end sequence file
      max_count : a list of the maximum number of sequences to sample
      field : the annotation field to check for required values
      values : a list of annotation values that a sample must contain one of
      coord_type : the sequence header format
      out_args : common output argument dictionary from parseCommonArgs
              
    Returns: 
      list: seq_file_1 and seq_file_2 output file names
    """

    # Sequence index key function
    def _key_func(x):
        return getCoordKey(x,
                           coord_type=coord_type,
                           delimiter=out_args['delimiter'])

    # Function to sample from two lists of sequence indices
    def _sample_list(n, index_1, index_2):
        key_set = set(index_1).intersection(index_2)
        max_n = len(key_set)
        return random.sample(key_set, min(n, max_n))

    # Function to sample from two dictionaries of grouped sequence indices
    def _sample_dict(n, index_1, index_2):
        group_set = set(index_1.keys()).intersection(index_2.keys())
        sample_list = []
        for k in group_set:
            key_set = set(index_1[k]).intersection(index_2[k])
            max_n = len(key_set)
            sample_list.extend(random.sample(key_set, min(n, max_n)))
        return sample_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'samplepair'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['MAX_COUNTS'] = ','.join([str(x) for x in max_count])
    log['FIELD'] = field
    log['VALUES'] = ','.join(values) if values else None
    printLog(log)

    # Define output type
    in_type_1 = getFileType(seq_file_1)
    in_type_2 = getFileType(seq_file_2)
    if out_args['out_type'] is None:
        out_type_1 = in_type_1
        out_type_2 = in_type_2
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Index input files
    start_time = time()
    printMessage('Reading files', start_time=start_time, width=25)

    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    seq_dict_2 = readSeqFile(seq_file_2, index=True, key_func=_key_func)

    # Subset keys to those meeting field/value criteria
    if field is not None and values is not None:
        _sample = _sample_list
        printMessage('Subsetting by annotation',
                     start_time=start_time,
                     width=25)
        seq_index_1 = subsetSeqIndex(seq_dict_1,
                                     field,
                                     values,
                                     delimiter=out_args['delimiter'])
        seq_index_2 = subsetSeqIndex(seq_dict_2,
                                     field,
                                     values,
                                     delimiter=out_args['delimiter'])
    elif field is not None and values is None:
        _sample = _sample_dict
        printMessage('Indexing by annotation', start_time=start_time, width=25)
        seq_index_1 = indexSeqSets(seq_dict_1,
                                   field,
                                   delimiter=out_args['delimiter'])
        seq_index_2 = indexSeqSets(seq_dict_2,
                                   field,
                                   delimiter=out_args['delimiter'])
    else:
        _sample = _sample_list
        seq_index_1 = list(seq_dict_1.keys())
        seq_index_2 = list(seq_dict_2.keys())

    printMessage('Done', start_time=start_time, end=True, width=25)

    # Generate sample set for each value in max_count
    out_files = []
    for i, n in enumerate(max_count):
        start_time = time()
        printMessage('Sampling n=%i' % n, start_time=start_time, width=25)
        # Sample
        sample_keys = _sample(n, seq_index_1, seq_index_2)
        sample_count = len(sample_keys)

        # Open file handles
        out_handle_1 = getOutputHandle(seq_file_1,
                                       'sample%i-n%i' % (i + 1, sample_count),
                                       out_dir=out_args['out_dir'],
                                       out_name=out_name_1,
                                       out_type=out_type_1)
        out_handle_2 = getOutputHandle(seq_file_2,
                                       'sample%i-n%i' % (i + 1, sample_count),
                                       out_dir=out_args['out_dir'],
                                       out_name=out_name_2,
                                       out_type=out_type_2)
        out_files.append((out_handle_1.name, out_handle_2.name))

        for k in sample_keys:
            SeqIO.write(seq_dict_1[k], out_handle_1, out_type_1)
            SeqIO.write(seq_dict_2[k], out_handle_2, out_type_2)

        printMessage('Done', start_time=start_time, end=True, width=25)

        # Print log for iteration
        log = OrderedDict()
        log['MAX_COUNT'] = n
        log['SAMPLED'] = sample_count
        log['OUTPUT1'] = os.path.basename(out_files[i][0])
        log['OUTPUT2'] = os.path.basename(out_files[i][1])
        printLog(log)

        # Close file handles
        out_handle_1.close()
        out_handle_2.close()

    # Print log
    log = OrderedDict()
    log['END'] = 'SplitSeq'
    printLog(log)

    return out_files
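A minimal sketch of the paired-sampling core: intersect the two key sets, then draw at most n shared keys (sorted() is used because random.sample no longer accepts sets in recent Python versions):

import random

def sample_pairs(n, keys_1, keys_2):
    # Only keys present in both files are eligible
    shared = sorted(set(keys_1) & set(keys_2))
    return random.sample(shared, min(n, len(shared)))

print(sample_pairs(2, ['a', 'b', 'c'], ['b', 'c', 'd']))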
Example #12
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None, 
                inner=False, keep_missing=False, out_file=None, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments: 
      seq_file : filename of the sequence file to sample from.
      max_missing : number of ambiguous characters to allow in a unique sequence.
      uniq_fields : a list of annotations that define a sequence as unique if they differ.
      copy_fields : a list of annotations to copy into unique sequence annotations.
      copy_actions : the list of collapseAnnotation actions to take on copy_fields.
      max_field : a numeric field whose maximum value determines the retained sequence.
      min_field : a numeric field whose minimum value determines the retained sequence.
      inner : if True exclude consecutive outer ambiguous characters from iterations and matching.
      keep_missing : if True retain sequences with more ambiguous characters than max_missing as unique.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
              
    Returns: 
      str: the collapsed output file name.
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)
    
    # Read input file
    in_type = getFileType(seq_file)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Open unique record output handle
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file,
                                      'collapse-unique',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Added list typing for compatibility issue with Python 2.7.5 on OS X
    # TypeError: object of type 'dictionary-keyiterator' has no len()
    search_keys = list(seq_dict.keys())
    dup_keys = []
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, 
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner, 
                                                         out_args['delimiter'])

        # Update list of duplicates
        dup_keys.extend(dup_list)
                
        # Break if no keys to search remain
        if len(search_keys) == 0:  break

    # Write unique sequences
    for val in uniq_dict.values():
        # Define output sequence
        out_seq = val.seq
        out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
        out_app = OrderedDict()
        if copy_fields is not None and copy_actions is not None:
            for f, a in zip(copy_fields, copy_actions):
                x = collapseAnnotation(val.annotations, a, f, delimiter=out_args['delimiter'])
                out_app[f] = x[f]
                out_ann.pop(f, None)
        out_app['DUPCOUNT'] = val.count
        out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
        out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
        out_seq.description = ''
        # Write unique sequence
        SeqIO.write(out_seq, pass_handle, out_args['out_type'])

        # Update log
        log = OrderedDict()
        log['HEADER'] = out_seq.id
        log['DUPCOUNT'] = val.count
        for i, k in enumerate(val.keys, start=1):
            log['ID%i' % i] = k
        for i, k in enumerate(val.keys, start=1):
            log['SEQ%i' % i] = str(seq_dict[k].seq)
        printLog(log, handle=log_handle)

    # Write sequence with high missing character counts
    if keep_missing:
        for k in search_keys:
            out_seq = seq_dict[k]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            SeqIO.write(out_seq, pass_handle, out_args['out_type'])

    # Write sequences with high missing character counts to the failed output
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    if out_args['failed']:
        # Write duplicate sequences 
        with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], 
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)
        
    # Close file handles
    pass_file = pass_handle.name
    if pass_handle is not None:  pass_handle.close()
    if log_handle is not None:  log_handle.close()
    
    return pass_file
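Stripped of annotation handling, the collapse step amounts to counting identical sequence strings; a toy sketch of the DUPCOUNT bookkeeping:

from collections import Counter

seqs = {'seq1': 'ACGT', 'seq2': 'ACGT', 'seq3': 'ACGA'}
counts = Counter(seqs.values())
for seq, dupcount in counts.items():
    # One representative per unique sequence, annotated with its DUPCOUNT
    print('%s|DUPCOUNT=%i' % (seq, dupcount))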
Example #13
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None, 
                inner=False, keep_missing=False, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments: 
      seq_file : filename of the sequence file to sample from
      max_missing : number of ambiguous characters to allow in a unique sequence
      uniq_fields : a list of annotations that define a sequence as unique if they differ
      copy_fields : a list of annotations to copy into unique sequence annotations
      copy_actions : the list of collapseAnnotation actions to take on copy_fields
      max_field : a numeric field whose maximum value determines the retained sequence
      min_field : a numeric field whose minimum value determines the retained sequence
      inner : if True exclude consecutive outer ambiguous characters from iterations and matching
      keep_missing : if True retain sequences with more ambiguous characters than max_missing as unique
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      str: the collapsed output file name
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)
    
    # TODO:  storing all sequences in memory is faster
    # Read input file
    in_type = getFileType(seq_file)
    #seq_dict = readSeqFile(seq_file, index=True)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Added list typing for compatibility issue with Python 2.7.5 on OS X
    # TypeError: object of type 'dictionary-keyiterator' has no len()
    search_keys = list(seq_dict.keys())
    dup_keys = []
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, 
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner, 
                                                         out_args['delimiter'])
        # Update list of duplicates
        dup_keys.extend(dup_list)

        # Update log
        log = OrderedDict()
        log['ITERATION'] = n + 1
        log['MISSING'] = n 
        log['UNIQUE'] = len(uniq_dict) 
        log['DUPLICATE'] = len(dup_keys) 
        log['UNDETERMINED'] = len(search_keys)
        printLog(log, handle=log_handle)
                
        # Break if no keys to search remain
        if len(search_keys) == 0:  break
    
    # Write unique sequences
    with getOutputHandle(seq_file, 'collapse-unique', out_dir=out_args['out_dir'], 
                         out_name=out_args['out_name'], out_type=out_args['out_type']) \
            as uniq_handle:
        for val in uniq_dict.values():
            # Define output sequence
            out_seq = val[0]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_app = OrderedDict()
            if copy_fields is not None and copy_actions is not None:
                for f, a, s in zip(copy_fields, copy_actions, val[3:]):
                    out_app[f] = s
                    out_app = collapseAnnotation(out_app, a, f, delimiter=out_args['delimiter'])
                    out_ann.pop(f, None)
            out_app['DUPCOUNT'] = val[1]
            out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            # Write unique sequence
            SeqIO.write(out_seq, uniq_handle, out_args['out_type'])
    
        # Write sequence with high missing character counts
        if keep_missing:
            for k in search_keys:
                out_seq = seq_dict[k]
                out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
                out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter'])
                out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
                out_seq.description = ''
                SeqIO.write(out_seq, uniq_handle, out_args['out_type'])

    # Write sequences with high missing character counts to the failed output
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    if out_args['failed']:
        # Write duplicate sequences 
        with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], 
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(uniq_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)
        
    # Close file handles
    if log_handle is not None:  log_handle.close()
    
    return uniq_handle.name
Example #14
        args_dict['assemble_args'] = {'alpha': args_dict['alpha'],
                                      'max_error': args_dict['max_error'],
                                      'min_len': args_dict['min_len'],
                                      'max_len': args_dict['max_len'],
                                      'scan_reverse': args_dict['scan_reverse'],
                                      'assembly_stats': AssemblyStats(args_dict['max_len'] + 1)}
        del args_dict['alpha']
        del args_dict['max_error']
        del args_dict['min_len']
        del args_dict['max_len']
        del args_dict['scan_reverse']
    elif args_dict['assemble_func'] is joinSeqPair:
        args_dict['assemble_args'] = {'gap':args_dict['gap']}
        del args_dict['gap']
    elif args_dict['assemble_func'] is referenceAssembly:
        ref_dict = {s.id:s.upper() for s in readSeqFile(args_dict['ref_file'])}
        #ref_file = makeUsearchDb(args_dict['ref_file'], args_dict['usearch_exec'])
        args_dict['assemble_args'] = {'ref_file': args_dict['ref_file'],
                                      'ref_dict': ref_dict,
                                      'min_ident': args_dict['min_ident'],
                                      'evalue': args_dict['evalue'],
                                      'max_hits': args_dict['max_hits'],
                                      'fill': args_dict['fill'],
                                      'usearch_exec': args_dict['usearch_exec']}
        del args_dict['ref_file']
        del args_dict['min_ident']
        del args_dict['evalue']
        del args_dict['max_hits']
        del args_dict['fill']
        del args_dict['usearch_exec']
Example #15
def clusterBarcodes(seq_file,
                    ident=default_cluster_ident,
                    length_ratio=default_length_ratio,
                    barcode_field=default_barcode_field,
                    cluster_field=default_cluster_field,
                    cluster_prefix=default_cluster_prefix,
                    cluster_tool=default_cluster_tool,
                    cluster_exec=default_cluster_exec,
                    out_file=None,
                    out_args=default_out_args,
                    nproc=None):
    """
    Performs clustering on sets of sequences

    Arguments:
      seq_file : the sample sequence file name.
      ident : the identity threshold for clustering sequences.
      length_ratio : minimum short/long length ratio allowed within a cluster.
      barcode_field : the annotation field containing barcode sequences.
      cluster_field : the name of the output cluster field.
      cluster_prefix : string defining a prefix for the cluster identifier.
      cluster_tool : the clustering tool to use; one of cd-hit or usearch.
      cluster_exec : the path to the executable for usearch.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : output arguments.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.

    Returns:
      str: the clustered output file name
    """

    # Function to modify SeqRecord header with cluster identifier
    def _header(seq,
                cluster,
                field=cluster_field,
                prefix=cluster_prefix,
                delimiter=out_args['delimiter']):
        label = '%s%i' % (prefix, cluster)
        header = parseAnnotation(seq.description, delimiter=delimiter)
        header = mergeAnnotation(header, {field: label}, delimiter=delimiter)
        seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
        seq.description = ''
        return seq

    # Function to make a SeqRecord object from a barcode annotation
    def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
        header = parseAnnotation(seq.description, delimiter=delimiter)
        return SeqRecord(Seq(header[field]), id=seq.id)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ClusterSets'
    log['COMMAND'] = 'barcode'
    log['FILE'] = os.path.basename(seq_file)
    log['IDENTITY'] = ident
    log['BARCODE_FIELD'] = barcode_field
    log['CLUSTER_FIELD'] = cluster_field
    log['CLUSTER_PREFIX'] = cluster_prefix
    log['CLUSTER_TOOL'] = cluster_tool
    log['NPROC'] = nproc
    printLog(log)

    # Set cluster tool
    try:
        cluster_func = map_cluster_tool[cluster_tool]
    except KeyError:
        printError('Invalid clustering tool %s.' % cluster_tool)

    # Check the minimum identity
    if ident < min_cluster_ident[cluster_tool]:
        printError('Minimum identity %s too low for clustering tool %s.' %
                   (str(ident), cluster_tool))

    # Count sequence file and parse into a list of SeqRecords
    result_count = countSeqFile(seq_file)
    barcode_iter = (_barcode(x) for x in readSeqFile(seq_file))

    # Perform clustering
    start_time = time()
    printMessage('Running %s' % cluster_tool, start_time=start_time, width=25)
    cluster_dict = cluster_func(barcode_iter,
                                ident=ident,
                                length_ratio=length_ratio,
                                seq_start=0,
                                seq_end=None,
                                threads=nproc,
                                cluster_exec=cluster_exec)
    printMessage('Done', start_time=start_time, end=True, width=25)

    # Determine file type
    if out_args['out_type'] is None:
        out_args['out_type'] = getFileType(seq_file)

    # Open output file handles
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file,
                                      'cluster-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])

    # Open indexed sequence file
    seq_dict = readSeqFile(seq_file, index=True)

    # Iterate over sequence records and update header with cluster annotation
    start_time = time()
    rec_count = pass_count = 0
    for cluster, id_list in cluster_dict.items():
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += len(id_list)

        # TODO:  make a generator. Figure out how to get pass_count updated
        # Define output sequences
        seq_output = [_header(seq_dict[x], cluster) for x in id_list]

        # Write output
        pass_count += len(seq_output)
        SeqIO.write(seq_output, pass_handle, out_args['out_type'])

    # Update progress
    printProgress(rec_count, result_count, 0.05, start_time=start_time)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['CLUSTERS'] = len(cluster_dict)
    log['SEQUENCES'] = result_count
    log['PASS'] = pass_count
    log['FAIL'] = rec_count - pass_count
    log['END'] = 'ClusterSets'
    printLog(log)

    # Close handles
    pass_handle.close()

    return pass_handle.name
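A sketch of the header rewrite performed by _header, using plain string handling in place of pRESTO's annotation helpers; the CLUSTER field name and the '|' delimiter are assumptions mirroring the defaults above:

def add_cluster(description, cluster_id, field='CLUSTER'):
    # Append a CLUSTER=<id> annotation to a '|'-delimited header
    return '%s|%s=%i' % (description, field, cluster_id)

print(add_cluster('seq1|BARCODE=ACGT', 7))  # seq1|BARCODE=ACGT|CLUSTER=7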
Example #16
def sampleSeqFile(seq_file,
                  max_count,
                  field=None,
                  values=None,
                  out_args=default_out_args):
    """
    Samples from a sequence file

    Arguments: 
      seq_file : filename of the sequence file to sample from
      max_count : a list of the maximum number of sequences to sample
      field : the annotation field to check for required values
      values : a list of annotation values that a sample must contain one of
      out_args : common output argument dictionary from parseCommonArgs
              
    Returns: 
      str: output file name
    """

    # Function to sample from a list of sequence indices
    def _sample_list(n, index_list):
        max_n = len(index_list)
        r = random.sample(range(max_n), n) if n < max_n else range(max_n)
        return [index_list[x] for x in r]

    # Function to sample from a dictionary of grouped sequence indices
    def _sample_dict(n, index_dict):
        sample_list = []
        for v in index_dict.values():
            max_n = len(v)
            r = random.sample(range(max_n), n) if n < max_n else range(max_n)
            sample_list.extend([v[x] for x in r])
        return sample_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'sample'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_COUNTS'] = ','.join([str(x) for x in max_count])
    log['FIELD'] = field
    log['VALUES'] = ','.join(values) if values else None
    printLog(log)

    # Read input files and open output files
    start_time = time()
    printMessage('Reading files', start_time=start_time, width=25)

    in_type = getFileType(seq_file)
    seq_dict = readSeqFile(seq_file, index=True)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Generate subset of records
    if field is not None and values is not None:
        _sample = _sample_list
        printMessage('Subsetting by annotation',
                     start_time=start_time,
                     width=25)
        seq_index = subsetSeqIndex(seq_dict,
                                   field,
                                   values,
                                   delimiter=out_args['delimiter'])
    elif field is not None and values is None:
        _sample = _sample_dict
        printMessage('Indexing by annotation', start_time=start_time, width=25)
        seq_index = indexSeqSets(seq_dict,
                                 field,
                                 delimiter=out_args['delimiter'])
    else:
        _sample = _sample_list
        seq_index = [k for k in seq_dict]

    printMessage('Done', start_time=start_time, end=True, width=25)

    # Generate sample set for each value in max_count
    out_files = []
    for i, n in enumerate(max_count):
        start_time = time()
        printMessage('Sampling n=%i' % n, start_time=start_time, width=25)
        # Sample from records
        sample_keys = _sample(n, seq_index)
        sample_count = len(sample_keys)

        # Write sampled sequences to files
        with getOutputHandle(seq_file,
                             'sample%i-n%i' % (i + 1, sample_count),
                             out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'],
                             out_type=out_args['out_type']) as out_handle:
            for k in sample_keys:
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            out_files.append(out_handle.name)

        printMessage('Done', start_time=start_time, end=True, width=25)

        # Print log for iteration
        log = OrderedDict()
        log['MAX_COUNT'] = n
        log['SAMPLED'] = sample_count
        log['OUTPUT'] = os.path.basename(out_files[i])
        printLog(log)

    # Print log
    log = OrderedDict()
    log['END'] = 'SplitSeq'
    printLog(log)

    return out_files
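The grouped variant (_sample_dict) draws up to n records from every group independently; a standalone sketch with invented groups:

import random

def sample_groups(n, index_dict):
    sampled = []
    for keys in index_dict.values():
        # Draw at most n keys from each group
        sampled.extend(random.sample(keys, min(n, len(keys))))
    return sampled

groups = {'setA': ['s1', 's2', 's3'], 'setB': ['s4']}
print(sample_groups(2, groups))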
Example #17
def downsizeSeqFile(seq_file, max_count, out_args=default_out_args):
    """
    Splits a FASTA/FASTQ file into segments with a limited number of records

    Arguments: 
      seq_file : filename of the FASTA file to split
      max_count : number of records in each output file
      out_args : common output argument dictionary from parseCommonArgs

    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'count'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_COUNT'] = max_count
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None: out_args['out_type'] = in_type
    # Determine total numbers of records
    rec_count = countSeqFile(seq_file)

    # Loop through iterator writing each record and opening new output handle as needed
    start_time = time()
    seq_count, part_num = 0, 1
    out_handle = getOutputHandle(seq_file,
                                 'part%06i' % part_num,
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
    out_files = [out_handle.name]
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)

        # Update count
        seq_count += 1

        # Write records
        SeqIO.write(seq, out_handle, out_args['out_type'])
        # Break if total records reached to avoid extra empty file
        if seq_count == rec_count:
            break

        # Open new file if needed
        if seq_count % max_count == 0:
            out_handle.close()
            part_num += 1
            out_handle = getOutputHandle(seq_file,
                                         'part%06i' % part_num,
                                         out_dir=out_args['out_dir'],
                                         out_name=out_args['out_name'],
                                         out_type=out_args['out_type'])
            out_files.append(out_handle.name)

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, f in enumerate(out_files):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(f)
    log['SEQUENCES'] = rec_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_files
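The same fixed-size splitting can be expressed with itertools.islice over any record iterator; a sketch where the input path and chunk size are hypothetical:

from itertools import islice
from Bio import SeqIO

records = SeqIO.parse('input.fasta', 'fasta')
part = 0
while True:
    chunk = list(islice(records, 1000))  # up to max_count records per part
    if not chunk:
        break
    part += 1
    SeqIO.write(chunk, 'input_part%06i.fasta' % part, 'fasta')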
Example #18
def estimateBarcode(seq_file,
                    barcode_field=default_barcode_field,
                    distance_types=default_distance_types,
                    out_args=default_out_args):
    """
    Calculates error rates of barcode sequences

    Arguments: 
      seq_file : the sample sequence file name
      barcode_field : the annotation field containing barcode sequences.
      distance_types : distance types to include.
      out_args : common output argument dictionary from parseCommonArgs
                        
    Returns: 
      tuple: names of the output files.
    """

    # Function to extract the barcode annotation from a SeqRecord
    def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
        header = parseAnnotation(seq.description, delimiter=delimiter)
        return header[field]

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'EstimateError'
    log['COMMAND'] = 'barcode'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    printLog(log)

    # Count sequence file and parse into a list of SeqRecords
    result_count = countSeqFile(seq_file)
    barcode_iter = (_barcode(x) for x in readSeqFile(seq_file))

    # bin_count defaults to the barcode length + 1
    bin_count = len(_barcode(next(readSeqFile(seq_file)))) + 1
    mismatch = initializeMismatchDictionary(0,
                                            distance_types=distance_types,
                                            bin_count=bin_count)

    # Calculate distances
    distance_mismatch = calculateDistances(barcode_iter, bin_count=bin_count)
    mismatch['dist'] = {
        header: distance_mismatch[header]
        for header in distance_types
    }

    # Generate a distance distribution data frame
    dist_df = pd.DataFrame.from_dict(mismatch['dist'])
    dist_df.index = dist_df.index / len(dist_df.index)
    dist_df[['all']] = dist_df[['all']].astype(int)

    # Find the threshold: the mean index of the distribution minimum
    # within the first 75% of distances
    dist = mismatch['dist']['all']
    min_indices = [i for i in np.argsort(dist[:int(len(dist) * 0.75)])
                   if dist[i] == np.min(dist)]
    thresh_df = pd.DataFrame.from_dict(
        {'thresh': {'ALL': dist_df.index[int(np.mean(min_indices))]}})
    file_args = {
        'out_dir': out_args['out_dir'],
        'out_name': out_args['out_name'],
        'out_type': 'tab'
    }

    # Output as tsv
    with getOutputHandle(seq_file, 'distance-barcode', **file_args) as dist_handle, \
        getOutputHandle(seq_file, 'threshold-barcode', **file_args) as thresh_handle:

        dist_df.to_csv(dist_handle,
                       sep='\t',
                       na_rep='NA',
                       index=True,
                       index_label='DISTANCE',
                       columns=['all'],
                       header=['ALL'],
                       float_format='%.6f')
        thresh_df.to_csv(thresh_handle,
                         sep='\t',
                         na_rep='NA',
                         index=True,
                         index_label='TYPE',
                         columns=['thresh'],
                         header=['THRESHOLD'],
                         float_format='%.6f')

    # Update log
    log['OUTPUT1'] = os.path.basename(dist_handle.name)
    log['OUTPUT2'] = os.path.basename(thresh_handle.name)
    log['SEQUENCES'] = result_count
    log['ALL_THRESHOLD'] = '%.6f' % thresh_df['thresh']['ALL']
    log['END'] = 'EstimateError'
    printLog(log)

    return (dist_handle.name, thresh_handle.name)
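The distance distribution underlying this is, in essence, pairwise mismatch (Hamming) counting between equal-length barcodes; a minimal sketch of that primitive, independent of calculateDistances:

def hamming(a, b):
    # Number of mismatched positions between two equal-length barcodes
    return sum(x != y for x, y in zip(a, b))

barcodes = ['ACGT', 'ACGA', 'TTGA']
for i in range(len(barcodes)):
    for j in range(i + 1, len(barcodes)):
        print(barcodes[i], barcodes[j], hamming(barcodes[i], barcodes[j]))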
Example #19
def sortSeqFile(seq_file,
                field,
                numeric=False,
                max_count=None,
                out_args=default_out_args):
    """
    Sorts a sequence file by annotation fields

    Arguments: 
      seq_file : filename of the sequence file to split
      field : position of field in sequence description to split by
      numeric : if True sort field numerically;
                if False sort field alphabetically
      max_count : maximum number of records in each output file
                  if None do not create multiple files
      out_args : common output argument dictionary from parseCommonArgs
    
    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    log['MAX_COUNT'] = max_count
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_dict = readSeqFile(seq_file, index=True)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Get annotations and sort seq_dict by annotation values
    tag_dict = {
        k: parseAnnotation(seq_dict[k].description,
                           delimiter=out_args['delimiter'])[field]
        for k in seq_dict
    }
    if numeric: tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get)

    # Determine total numbers of records
    rec_count = len(seq_dict)
    if max_count is not None and max_count >= rec_count: max_count = None

    # Open initial output file handles
    file_count = 1
    if max_count is None: out_label = 'sorted'
    else: out_label = 'sorted-part%06i' % file_count
    out_handle = getOutputHandle(seq_file,
                                 out_label,
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
    out_files = [out_handle.name]

    # Loop through sorted sequence dictionary keys
    start_time = time()
    last_tag = None
    saved_keys = []
    seq_count = chunk_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration and update count
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)
        seq_count += 1

        # Write saved group of sequences when tag changes
        if last_tag is not None and tag_dict[key] != last_tag:
            # Open new output file if needed
            if max_count is not None and chunk_count + len(
                    saved_keys) > max_count:
                # Update partition counts
                file_count += 1
                chunk_count = 0
                # Open new file handle
                out_handle.close()
                out_handle = getOutputHandle(seq_file,
                                             'sorted-part%06i' % file_count,
                                             out_dir=out_args['out_dir'],
                                             out_name=out_args['out_name'],
                                             out_type=out_args['out_type'])
                # Append output file name to out_files
                out_files.append(out_handle.name)

            # Write saved sequences
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            # Reset saved keys to current key only
            saved_keys = [key]
        else:
            # Update list of saved keys if tag is unchanged
            saved_keys.append(key)

        # Check if total records reached, write all saved keys, and exit loop
        if seq_count == rec_count:
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            out_handle.close()
            break

        # Update tag tracker
        last_tag = tag_dict[key]

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, f in enumerate(out_files):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(f)
    log['SEQUENCES'] = seq_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_files
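
The split logic above reduces to one pattern: parse a single annotation out of each header, then sort the record keys by that value. A minimal standalone sketch of the pattern, using hypothetical headers; the presto.Annotation import path is an assumption:

from presto.Annotation import parseAnnotation

# Hypothetical pRESTO-style headers; SAMPLE is the field to sort on
headers = ['read1|SAMPLE=B|COUNT=3', 'read2|SAMPLE=A|COUNT=12']
tag_dict = {h: parseAnnotation(h)['SAMPLE'] for h in headers}
for key in sorted(tag_dict, key=tag_dict.get):
    print(key, tag_dict[key])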
Example no. 20
def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
      seq_file : the sequence file name.
      convert_func : the function used to convert sequence headers.
      convert_args : a dictionary of arguments to pass to convert_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the output sequence file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader: 'generic',
                convert454Header: '454',
                convertGenbankHeader: 'genbank',
                convertIlluminaHeader: 'illumina',
                convertIMGTHeader: 'imgt',
                convertMIGECHeader: 'migec',
                convertSRAHeader: 'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Wrapper for opening handles and writers
    def _open(x, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(seq_file,
                                     'convert-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle

    # Count records
    result_count = countSeqFile(seq_file)

    # Set additional conversion arguments
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter': out_args['delimiter']})

    # Initialize file handles
    pass_handle, fail_handle = None, None

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            try:
                SeqIO.write(seq, pass_handle, out_args['out_type'])
            except AttributeError:
                # Open output file
                pass_handle = _open('pass')
                SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if out_args['failed']:
                # Write unconverted sequences
                try:
                    SeqIO.write(seq, fail_handle, out_args['out_type'])
                except AttributeError:
                    # Open output file
                    fail_handle = _open('fail')
                    SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles
    if pass_handle is not None:  pass_handle.close()
    if fail_handle is not None:  fail_handle.close()

    return pass_handle.name if pass_handle is not None else None
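
A hedged usage sketch for the lazy-open version above; the input path is hypothetical, and convertIlluminaHeader is one of the converters named in cmd_dict:

# Hypothetical input file; the pass output is forced to a fixed path
out = convertHeaders('reads.fastq', convertIlluminaHeader,
                     out_file='reads_converted.fastq')
print(out)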
Example no. 21
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments: 
    seq_file = the sequence file name
    modify_func = the function defining the modification operation
    modify_args = a dictionary of arguments to pass to modify_func
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    the output file name
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):  
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)
    
    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type
    out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type=out_args['out_type'])

    # Count records
    result_count = countSeqFile(seq_file)
    
    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time)
        
        # Update counts
        seq_count += 1
        
        # Modify header
        header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)
        
        # Write new sequence
        seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)    
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'               
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
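
A hedged usage sketch; the file name is hypothetical, and 'fields'/'names' are assumed to be renameHeader's parameter names:

# Hypothetical: rename the BARCODE annotation to INDEX in every header
out = modifyHeaders('reads.fastq', renameHeader,
                    modify_args={'fields': ['BARCODE'], 'names': ['INDEX']})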
Example no. 22
def tableHeaders(seq_file, fields, out_file=None, out_args=default_out_args):
    """
    Builds a table of sequence header annotations

    Arguments: 
      seq_file : the sequence file name.
      fields : the list of fields to output.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      str: output table file name
    """
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = 'table'
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)
    
    # Open file handles
    seq_iter = readSeqFile(seq_file)
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'headers',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type='tab')
    # Count records
    result_count = countSeqFile(seq_file)
    
    # Open csv writer and write header
    out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='', 
                                delimiter='\t', fieldnames=fields)
    out_writer.writeheader()
    
    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        
        # Get annotations
        seq_count += 1
        ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter'])

        # Write records
        if ann:
            pass_count += 1
            out_writer.writerow(ann)
        else:
            fail_count += 1
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseHeaders'
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
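
A hedged usage sketch; the input path and annotation names are hypothetical:

# Hypothetical: tabulate the ID and SAMPLE annotations into a tab-delimited file
out = tableHeaders('reads.fastq', fields=['ID', 'SAMPLE'])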
Example no. 23
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None,
            coord_type=default_coord_type,
            out_args=default_out_args):
    """
    Generates consensus sequences

    Arguments: 
    seq_file_1 = the file containing the grouped sequences and annotations
    seq_file_2 = the file to assign annotations to from seq_file_1
    fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records;
               if None do not copy any annotations
    fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records;
               if None do not copy any annotations
    coord_type = the sequence header format
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2)
    """
    # Define private functions
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else: 
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else: 
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], 
                                    out_name=out_name_1, out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], 
                                    out_name=out_name_2, out_type=out_type_2)

    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_1, out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_2, out_type=out_type_2)
        pass_keys = list()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id, coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
                                                delimiter=out_args['delimiter'])
                    seq_2.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False,
                                                delimiter=out_args['delimiter'])
                    seq_1.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and update the paired key list for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None:  pass_keys.append(coord_2)
            else:  SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        pass_keys = set(pass_keys)
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:  SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)
   
    # Close file handles
    pass_handle_1.close()
    pass_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
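
A hedged usage sketch; the file names and the BARCODE field are hypothetical:

# Hypothetical: sync mates and prepend BARCODE from file 1 onto file 2 headers
pairs = pairSeq('reads_R1.fastq', 'reads_R2.fastq', fields_1=['BARCODE'])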
Example no. 24
def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments: 
      seq_file : the sequence file name.
      modify_func : the function defining the modification operation.
      modify_args : a dictionary of arguments to pass to modify_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      str: output file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):  
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)
    
    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'reheader',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
    # Count records
    result_count = countSeqFile(seq_file)

    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        
        # Update counts
        seq_count += 1
        
        # Modify header
        header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)
        
        # Write new sequence
        seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'               
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
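
This copy differs from the earlier modifyHeaders only in accepting an explicit out_file override. A hedged sketch; the paths are hypothetical, and 'fields' is assumed to be deleteHeader's parameter name:

# Hypothetical: strip the PRIMER annotation and write to a fixed path
out = modifyHeaders('reads.fastq', deleteHeader,
                    modify_args={'fields': ['PRIMER']},
                    out_file='reads_reheader.fastq')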
Example no. 25

def convertHeaders(seq_file, convert_func, convert_args={}, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
    seq_file = the sequence file name
    convert_func = the function used to convert sequence headers
    convert_args = a dictionary of arguments to pass to convert_func
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output sequence file name
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader:'generic',
                convert454Header:'454',
                convertGenbankHeader:'genbank',
                convertIlluminaHeader:'illumina',
                convertIMGTHeader:'imgt',
                convertSRAHeader:'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count records
    result_count = countSeqFile(seq_file)

    # Open output file handles
    pass_handle = getOutputHandle(seq_file,
                                  'convert-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    if out_args['failed']:
        fail_handle = getOutputHandle(seq_file,
                                      'convert-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    else:
        fail_handle = None

    # Set additional conversion arguments
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter':out_args['delimiter']})

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if fail_handle is not None:
                # Write unconverted sequences
                SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:  fail_handle.close()

    return pass_handle.name
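
Unlike the lazy-open variant in Example no. 20, this copy opens the pass handle up front, so an empty pass file is created even when nothing converts. A hedged sketch; the input path is hypothetical, and default_out_args is assumed to be a plain dict, as its subscripting above suggests:

# Hypothetical: enable the failed output to keep unconvertible records
args = dict(default_out_args)
args['failed'] = True
out = convertHeaders('sra_reads.fastq', convertSRAHeader, out_args=args)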
Example no. 26
def pairSeq(seq_file_1,
            seq_file_2,
            fields_1=None,
            fields_2=None,
            action=None,
            coord_type=default_coord,
            out_args=default_out_args):
    """
    Synchronizes paired-end files and copies annotations between them

    Arguments: 
      seq_file_1 : the file containing the grouped sequences and annotations.
      seq_file_2 : the file to assign annotations to from seq_file_1.
      fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records;
                 if None do not copy any annotations.
      fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records;
                 if None do not copy any annotations.
      action : the collapse action to take on all copied annotations if they already exist in the
               target header.
      coord_type : the sequence header format.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      list: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2).
    """

    # Define private functions
    def _key_func(x):
        return getCoordKey(x,
                           coord_type=coord_type,
                           delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_1,
                                    out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_2,
                                    out_type=out_type_2)

    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_1,
                                        out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_2,
                                        out_type=out_type_2)
        pass_keys = list()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id,
                              coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(
                        ann_2,
                        copy_ann,
                        prepend=True,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_1,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_2.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(
                        ann_1,
                        copy_ann,
                        prepend=False,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_2,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_1.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and update the paired key list for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None: pass_keys.append(coord_2)
            else: SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        pass_keys = set(pass_keys)
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:
            SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)

    # Close file handles
    pass_handle_1.close()
    pass_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
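
A hedged usage sketch for the new action parameter; the file names are hypothetical, and 'min' is assumed to be an action accepted by collapseAnnotation:

# Hypothetical: copy COUNT across mates, collapsing any duplicate by 'min'
pairs = pairSeq('reads_R1.fastq', 'reads_R2.fastq',
                fields_1=['COUNT'], action='min')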