Example #1
def countSeqSets(seq_file, field=default_barcode_field, delimiter=default_delimiter):
    """
    Counts the sets of sequences with the same ID field

    Arguments:
      seq_file : FASTA or FASTQ file containing sample sequences
      field : Annotation field containing set IDs
      delimiter : Tuple of delimiters for (fields, values, value lists)

    Returns:
      int : Count of unique set IDs in the sequence file
    """
    # Count records and check file
    try:
        id_set = set()
        for seq in readSeqFile(seq_file):
            id_set.add(parseAnnotation(seq.description, delimiter=delimiter)[field])
        result_count = len(id_set)
    except IOError:
        printError('File %s cannot be read.' % seq_file)
    except Exception as e:
        printError('File %s is invalid with exception %s.' % (seq_file, e))
    else:
        if result_count == 0:  printError('File %s is empty.' % seq_file)

    return result_count
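# A minimal, self-contained sketch of the same set-counting idiom, using
# plain header strings in place of SeqRecord parsing; the BARCODE headers
# below are illustrative, not taken from a real file.
example_headers = ['read1|BARCODE=ACGT', 'read2|BARCODE=ACGT', 'read3|BARCODE=TTTT']
example_id_set = {h.split('|BARCODE=')[1] for h in example_headers}
print(len(example_id_set))  # 2 unique set IDs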
Example #2
def _header(seq,
            cluster,
            field=cluster_field,
            prefix=cluster_prefix,
            delimiter=out_args['delimiter']):
    label = '%s%i' % (prefix, cluster)
    header = parseAnnotation(seq.description, delimiter=delimiter)
    header = mergeAnnotation(header, {field: label}, delimiter=delimiter)
    seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
    seq.description = ''
    return seq
def offsetSeqSet(seq_list, offset_dict, field=default_primer_field, 
                 mode='pad', delimiter=default_delimiter):
    """
    Pads or cuts the head of a set of sequences according to an offset list

    Arguments: 
    seq_list = a list of SeqRecord objects to offset
    offset_dict = a dictionary of {set ID: offset values}
    field = the field in sequence description containing set IDs
    mode = defines the action taken; one of 'pad','cut'
    delimiter = a tuple of delimiters for (annotations, field/values, value lists)
        
    Returns: 
    a MultipleSeqAlignment object containing the alignment
    """
    ann_list = [parseAnnotation(s.description, delimiter=delimiter) for s in seq_list]
    tag_list = [a[field] for a in ann_list]

    # Pad sequences with offsets
    align_list = []
    if mode == 'pad':
        max_len = max([len(s) + offset_dict[t]
                       for s, t in zip(seq_list, tag_list)])
        for rec, tag in zip(seq_list, tag_list):
            new_rec = rec[:]
            new_rec.letter_annotations = {}
            new_rec.seq = '-' * offset_dict[tag] + new_rec.seq
            new_rec.seq += '-' * (max_len - len(new_rec.seq))
            align_list.append(new_rec)
    # Cut sequences to common start position
    elif mode == 'cut':
        max_offset = max(offset_dict.values())
        cut_dict = {k:(max_offset - v) for k, v in offset_dict.items()}
        max_len = max([len(s) - cut_dict[t]
                       for s, t in zip(seq_list, tag_list)])
        for rec, tag in zip(seq_list, tag_list):
            new_rec = rec[:]
            new_rec.letter_annotations = {}
            new_rec.seq = new_rec.seq[cut_dict[tag]:]
            new_rec.seq += '-' * (max_len - len(new_rec.seq))
            align_list.append(new_rec)
    else:
        exit('offsetSeqSet error:  invalid offset mode')

    # Convert list to MultipleSeqAlignment object
    align = MultipleSeqAlignment(align_list)
    
    return align
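# A self-contained sketch of the 'pad' branch above on plain strings;
# the sequences and offsets are illustrative.
seqs = {'A': 'ACGTACGT', 'B': 'CGTACG'}
offsets = {'A': 0, 'B': 1}
max_len = max(len(s) + offsets[k] for k, s in seqs.items())
padded = {k: '-' * offsets[k] + s + '-' * (max_len - offsets[k] - len(s))
          for k, s in seqs.items()}
print(padded)  # {'A': 'ACGTACGT', 'B': '-CGTACG-'}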
Example #4
def offsetSeqSet(seq_list, offset_dict, field=default_primer_field, 
                 mode='pad', delimiter=default_delimiter):
    """
    Pads or cuts the head of a set of sequences according to an offset list

    Arguments: 
      seq_list : a list of SeqRecord objects to offset
      offset_dict : a dictionary of {set ID: offset values}
      field : the field in sequence description containing set IDs
      mode : defines the action taken; one of 'pad','cut'
      delimiter : a tuple of delimiters for (annotations, field/values, value lists)
        
    Returns: 
      Bio.Align.MultipleSeqAlignment: object containing the alignment.
    """
    ann_list = [parseAnnotation(s.description, delimiter=delimiter) for s in seq_list]
    tag_list = [a[field] for a in ann_list]

    # Pad sequences with offsets
    align_list = []
    if mode == 'pad':
        max_len = max([len(s) + offset_dict[t]
                       for s, t in zip(seq_list, tag_list)])
        for rec, tag in zip(seq_list, tag_list):
            new_rec = rec[:]
            new_rec.letter_annotations = {}
            new_rec.seq = '-' * offset_dict[tag] + new_rec.seq
            new_rec.seq += '-' * (max_len - len(new_rec.seq))
            align_list.append(new_rec)
    # Cut sequences to common start position
    elif mode == 'cut':
        max_offset = max(offset_dict.values())
        cut_dict = {k:(max_offset - v) for k, v in offset_dict.items()}
        max_len = max([len(s) - cut_dict[t]
                       for s, t in zip(seq_list, tag_list)])
        for rec, tag in zip(seq_list, tag_list):
            new_rec = rec[:]
            new_rec.letter_annotations = {}
            new_rec.seq = new_rec.seq[cut_dict[tag]:]
            new_rec.seq += '-' * (max_len - len(new_rec.seq))
            align_list.append(new_rec)
    else:
        printError('Invalid offset mode.')

    # Convert list to MultipleSeqAlignment object
    align = MultipleSeqAlignment(align_list)
    
    return align
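# A self-contained sketch of the 'cut' branch above on plain strings: each
# sequence is trimmed to the common start position implied by its offset.
# The sequences and offsets are illustrative.
seqs = {'A': 'ACGTACGT', 'B': 'CGTACG'}
offsets = {'A': 0, 'B': 1}
max_offset = max(offsets.values())
cut = {k: max_offset - v for k, v in offsets.items()}
print({k: s[cut[k]:] for k, s in seqs.items()})  # {'A': 'CGTACGT', 'B': 'CGTACG'}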
def indexSeqSets(seq_dict, field=default_barcode_field, delimiter=default_delimiter):
    """
    Identifies sets of sequences with the same ID field

    Arguments:
      seq_dict : a dictionary index of sequences returned from SeqIO.index()
      field : the annotation field containing set IDs
      delimiter : a tuple of delimiters for (fields, values, value lists)

    Returns:
      dict : Dictionary mapping set name to a list of record names
    """
    set_dict = {}
    for key, rec in seq_dict.items():
        tag = parseAnnotation(rec.description, delimiter=delimiter)[field]
        set_dict.setdefault(tag, []).append(key)

    return set_dict
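# A self-contained sketch of the setdefault() grouping idiom used by
# indexSeqSets; the record keys and set tags are illustrative.
records = {'read1': 'A1', 'read2': 'B2', 'read3': 'A1'}
set_index = {}
for key, tag in records.items():
    set_index.setdefault(tag, []).append(key)
print(set_index)  # {'A1': ['read1', 'read3'], 'B2': ['read2']}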
def subsetSeqIndex(seq_dict, field, values, delimiter=default_delimiter):
    """
    Subsets a sequence set by annotation value

    Arguments:
      seq_dict : Dictionary index of sequences returned from SeqIO.index()
      field : Annotation field to select keys by
      values : List of annotation values that define the retained keys
      delimiter : Tuple of delimiters for (annotations, field/values, value lists)

    Returns:
      list : List of keys
    """
    # Parse annotations from seq_dict and subset keys
    key_subset = [k for k in seq_dict \
                  if parseAnnotation(seq_dict[k].description, delimiter=delimiter)[field] \
                  in values]

    return key_subset
def subsetSeqSet(seq_iter, field, values, delimiter=default_delimiter):
    """
    Subsets a sequence set by annotation value

    Arguments:
      seq_iter : Iterator or list of SeqRecord objects
      field : Annotation field to select by
      values : List of annotation values that define the retained sequences
      delimiter : Tuple of delimiters for (annotations, field/values, value lists)

    Returns:
      list : Modified list of SeqRecord objects
    """
    # Realize the iterator so records can be indexed, then parse annotations
    seq_list = list(seq_iter)
    ann_list = [parseAnnotation(s.description, delimiter=delimiter) for s in seq_list]

    # Subset seq_list by annotation
    if not isinstance(values, list):  values = [values]
    seq_subset = [seq_list[i] for i, a in enumerate(ann_list) if a[field] in values]

    return seq_subset
def convertGenericHeader(desc, delimiter=default_delimiter):
    """
    Converts any header to the pRESTO format

    Arguments:
    desc = a sequence description string
    delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
    a dictionary of header {field: value} pairs
    """
    # Replace whitespace and delimiter characters
    sub_regex = r'[%s\s]+' % re.escape(''.join(delimiter))
    conv = re.sub(sub_regex, '_', desc)
    try:
        # Check if modified header is valid
        header = parseAnnotation(conv, delimiter=delimiter)
    except Exception:
        # Assign header to None if header cannot be converted
        header = None

    return header
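# A self-contained sketch of the substitution step in convertGenericHeader,
# assuming the usual pRESTO default delimiter tuple ('|', '=', ',').
import re

delimiter = ('|', '=', ',')
sub_regex = r'[%s\s]+' % re.escape(''.join(delimiter))
print(re.sub(sub_regex, '_', 'M01234:55:000-A1 1:N:0:1'))  # M01234:55:000-A1_1:N:0:1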
Example #9
def groupSeqFile(seq_file, field, threshold=None, out_args=default_out_args):
    """
    Divides a sequence file into segments by description tags

    Arguments: 
      seq_file : filename of the sequence file to split
      field : The annotation field to split seq_file by
      threshold : The numeric threshold to group sequences by;
                  if None treat field as textual
      out_args : common output argument dictionary from parseCommonArgs

    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'group'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['THRESHOLD'] = threshold
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Determine total numbers of records
    rec_count = countSeqFile(seq_file)

    # Process sequences
    start_time = time()
    seq_count = 0
    if threshold is None:
        # Sort records into files based on textual field
        # Create set of unique field tags
        temp_iter = readSeqFile(seq_file)
        tag_list = getAnnotationValues(temp_iter,
                                       field,
                                       unique=True,
                                       delimiter=out_args['delimiter'])

        if sys.platform != 'win32':
            import resource
            # Increase open file handle limit if needed
            file_limit = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
            file_count = len(tag_list) + 256
            if file_limit < file_count and file_count <= 8192:
                resource.setrlimit(resource.RLIMIT_NOFILE,
                                   (file_count, file_count))
            elif file_count > 8192:
                e = '''OS file limit would need to be set to %i.
                    If you are sure you want to do this, then increase the 
                    file limit in the OS (via ulimit) and rerun this tool.
                    ''' % file_count
                printError(dedent(e))

        # Create output handles
        handles_dict = {
            tag: getOutputHandle(seq_file,
                                 '%s-%s' % (field, tag),
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
            for tag in tag_list
        }

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequences
            tag = parseAnnotation(seq.description,
                                  delimiter=out_args['delimiter'])[field]
            SeqIO.write(seq, handles_dict[tag], out_args['out_type'])
    else:
        # Sort records into files based on numeric threshold
        threshold = float(threshold)
        # Create output handles
        handles_dict = {
            'under':
            getOutputHandle(seq_file,
                            'under-%.1g' % threshold,
                            out_dir=out_args['out_dir'],
                            out_name=out_args['out_name'],
                            out_type=out_args['out_type']),
            'atleast':
            getOutputHandle(seq_file,
                            'atleast-%.1g' % threshold,
                            out_dir=out_args['out_dir'],
                            out_name=out_args['out_name'],
                            out_type=out_args['out_type'])
        }

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequences
            tag = parseAnnotation(seq.description,
                                  delimiter=out_args['delimiter'])[field]
            tag = 'under' if float(tag) < threshold else 'atleast'
            SeqIO.write(seq, handles_dict[tag], out_args['out_type'])

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['SEQUENCES'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close output file handles
    for k in handles_dict:
        handles_dict[k].close()

    return [handles_dict[k].name for k in handles_dict]
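# A self-contained sketch of the numeric-threshold routing in groupSeqFile;
# the record IDs and count values are illustrative.
threshold = 2.0
counts = {'read1': 1.0, 'read2': 3.0, 'read3': 2.0}
groups = {'under': [], 'atleast': []}
for key, value in counts.items():
    groups['under' if value < threshold else 'atleast'].append(key)
print(groups)  # {'under': ['read1'], 'atleast': ['read2', 'read3']}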
Example #10
def sortSeqFile(seq_file,
                field,
                numeric=False,
                max_count=None,
                out_args=default_out_args):
    """
    Sorts a sequence file by annotation fields

    Arguments: 
      seq_file : filename of the sequence file to sort
      field : the annotation field to sort sequences by
      numeric : if True sort field numerically;
                if False sort field alphabetically
      max_count : maximum number of records in each output file
                  if None do not create multiple files
      out_args : common output argument dictionary from parseCommonArgs
    
    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    log['MAX_COUNT'] = max_count
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_dict = readSeqFile(seq_file, index=True)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Get annotations and sort seq_dict by annotation values
    tag_dict = {
        k: parseAnnotation(seq_dict[k].description,
                           delimiter=out_args['delimiter'])[field]
        for k in seq_dict
    }
    if numeric: tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get)

    # Determine total numbers of records
    rec_count = len(seq_dict)
    if max_count is not None and max_count >= rec_count:  max_count = None

    # Open initial output file handles
    file_count = 1
    if max_count is None: out_label = 'sorted'
    else: out_label = 'sorted-part%06i' % file_count
    out_handle = getOutputHandle(seq_file,
                                 out_label,
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
    out_files = [out_handle.name]

    # Loop through sorted sequence dictionary keys
    start_time = time()
    last_tag = None
    saved_keys = []
    seq_count = chunk_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration and update count
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)
        seq_count += 1

        # Write saved group of sequences when tag changes
        if last_tag is not None and tag_dict[key] != last_tag:
            # Open new output file if needed
            if max_count is not None and chunk_count + len(
                    saved_keys) > max_count:
                # Update partition counts
                file_count += 1
                chunk_count = 0
                # Open new file handle
                out_handle.close()
                out_handle = getOutputHandle(seq_file,
                                             'sorted-part%06i' % file_count,
                                             out_dir=out_args['out_dir'],
                                             out_name=out_args['out_name'],
                                             out_type=out_args['out_type'])
                # Append output file name to out_files
                out_files.append(out_handle.name)

            # Write saved sequences
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            # Reset saved keys to current key only
            saved_keys = [key]
        else:
            # Update list of saved keys if tag is unchanged
            saved_keys.append(key)

        # Check if total records reached, write all saved keys, and exit loop
        if seq_count == rec_count:
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            out_handle.close()
            break

        # Update tag tracker
        last_tag = tag_dict[key]

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, f in enumerate(out_files):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(f)
    log['SEQUENCES'] = seq_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_files
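# A self-contained sketch of the annotation sort in sortSeqFile: coerce
# tags to float when numeric, then sort keys by tag value. The tags are
# illustrative.
tag_dict = {'read1': '3', 'read2': '1', 'read3': '2'}
tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
print(sorted(tag_dict, key=tag_dict.get))  # ['read2', 'read3', 'read1']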
def getMaskedSeq(align, mode='mask', barcode=False, delimiter=default_delimiter):
    """
    Create an output sequence with primers masked or cut

    Arguments: 
    align = a PrimerAlignment object returned from alignPrimers or scorePrimers
    mode = defines the action taken; one of ['cut','mask','tag','trim']
    barcode = if True add sequence preceding primer to description
    delimiter = a tuple of delimiters for (annotations, field/values, value lists) 

    Returns:
    output SeqRecord object
    """
    seq = align.seq

    # Build output sequence
    if mode == 'tag' or not align.align_primer:
        # Do not modify sequence
        out_seq = seq
    elif mode == 'trim':
        # Remove region before primer
        if not align.rev_primer:
            out_seq = seq[align.start:]
        else:  
            out_seq = seq[:align.end]
    elif mode == 'cut':
        # Remove primer and preceding region
        if not align.rev_primer:
            out_seq = seq[align.end:]
        else: 
            out_seq = seq[:align.start]
    elif mode == 'mask':
        # Mask primer with Ns and remove preceding region
        if not align.rev_primer:
            mask_len = align.end - align.start + align.gaps
            out_seq = 'N' * mask_len + seq[align.end:]
            if hasattr(seq, 'letter_annotations') and \
                    'phred_quality' in seq.letter_annotations:
                out_seq.letter_annotations['phred_quality'] = \
                    [0] * mask_len + \
                    seq.letter_annotations['phred_quality'][align.end:]
        else:
            mask_len = min(align.end, len(seq)) - align.start + align.gaps
            out_seq = seq[:align.start] + 'N' * mask_len
            if hasattr(seq, 'letter_annotations') and \
                    'phred_quality' in seq.letter_annotations:
                out_seq.letter_annotations['phred_quality'] = \
                    seq.letter_annotations['phred_quality'][:align.start] + \
                    [0] * mask_len
            
    # Add alignment annotations to output SeqRecord
    out_seq.annotations = seq.annotations    
    out_seq.annotations['primer'] = align.primer
    out_seq.annotations['prstart'] = align.start
    out_seq.annotations['error'] = align.error

    # Parse seq annotation and create output annotation
    seq_ann = parseAnnotation(seq.description, delimiter=delimiter)
    out_ann = OrderedDict([('SEQORIENT', seq.annotations['seqorient']),
                           ('PRIMER', align.primer)])
    
    # Add ID sequence to description
    if barcode:
        seq_code = seq[:align.start].seq if not align.rev_primer \
                   else seq[align.end:].seq
        out_seq.annotations['barcode'] = seq_code
        out_ann['BARCODE'] = seq_code
    
    out_ann = mergeAnnotation(seq_ann, out_ann, delimiter=delimiter)
    out_seq.id = flattenAnnotation(out_ann, delimiter=delimiter)
    out_seq.description = ''

    return out_seq
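# A self-contained sketch of the 'mask' branch of getMaskedSeq on a plain
# string and quality list; the alignment positions are illustrative.
seq, qual = 'ACGTACGTAA', [30] * 10
start, end, gaps = 2, 6, 0
mask_len = end - start + gaps
out_seq = 'N' * mask_len + seq[end:]
out_qual = [0] * mask_len + qual[end:]
print(out_seq, out_qual)  # NNNNGTAA [0, 0, 0, 0, 30, 30, 30, 30]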
Example #12
def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments: 
      seq_file : the sequence file name.
      modify_func : the function defining the modification operation.
      modify_args : a dictionary of arguments to pass to modify_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      str: output file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):  
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)
    
    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'reheader',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
    # Count records
    result_count = countSeqFile(seq_file)

    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        
        # Update counts
        seq_count += 1
        
        # Modify header
        header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)
        
        # Write new sequence
        seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])
        
    # print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'               
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
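# A self-contained sketch of the parse -> modify -> flatten round trip in
# modifyHeaders, using toy stand-ins for parseAnnotation/flattenAnnotation
# with the default '|' and '=' delimiters.
def parse_sketch(desc):
    fields = desc.split('|')
    ann = {'ID': fields[0]}
    ann.update(f.split('=', 1) for f in fields[1:])
    return ann

def flatten_sketch(ann):
    return '|'.join([ann['ID']] + ['%s=%s' % (k, v) for k, v in ann.items() if k != 'ID'])

header = parse_sketch('read1|SAMPLE=A')
header['SAMPLE'] = 'B'  # stands in for the modify_func step
print(flatten_sketch(header))  # read1|SAMPLE=B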
Example #13
0
def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
            score_fields=False, region_fields=False, out_args=default_out_args):
    """
    Writes tab-delimited database file in output directory
    
    Arguments:
    db_gen = a generator of IgRecord objects containing alignment data
    file_prefix = directory and prefix for CLIP tab-delim file
    total_count = number of records (for progress bar)
    id_dict = a dictionary of {IMGT ID: full seq description}
    no_parse = if True do not parse the sequence ID into pRESTO annotations with default delimiters
    score_fields = if True add alignment score fields to output file
    region_fields = if True add FWR and CDR region fields to output file
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    None
    """
    pass_file = "%s_db-pass.tab" % file_prefix
    fail_file = "%s_db-fail.tab" % file_prefix
    ordered_fields = ['SEQUENCE_ID',
                      'SEQUENCE_INPUT',
                      'FUNCTIONAL',
                      'IN_FRAME',
                      'STOP',
                      'MUTATED_INVARIANT',
                      'INDELS',
                      'V_CALL',
                      'D_CALL',
                      'J_CALL',
                      'SEQUENCE_VDJ',
                      'SEQUENCE_IMGT',
                      'V_SEQ_START',
                      'V_SEQ_LENGTH',
                      'V_GERM_START_VDJ',
                      'V_GERM_LENGTH_VDJ',
                      'V_GERM_START_IMGT',
                      'V_GERM_LENGTH_IMGT',
                      'N1_LENGTH',
                      'D_SEQ_START',
                      'D_SEQ_LENGTH',
                      'D_GERM_START',
                      'D_GERM_LENGTH',
                      'N2_LENGTH',
                      'J_SEQ_START',
                      'J_SEQ_LENGTH',
                      'J_GERM_START',
                      'J_GERM_LENGTH',
                      'JUNCTION_LENGTH',
                      'JUNCTION']

    if score_fields:
        ordered_fields.extend(['V_SCORE',
                               'V_IDENTITY',
                               'V_EVALUE',
                               'V_BTOP',
                               'J_SCORE',
                               'J_IDENTITY',
                               'J_EVALUE',
                               'J_BTOP'])

    if region_fields:
        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])


    # TODO:  This is not the best approach. should pass in output fields.
    # Initiate passed handle
    pass_handle = None

    # Open failed file
    if out_args['failed']:
        fail_handle = open(fail_file, 'wt')
        fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
    else:
        fail_handle = None
        fail_writer = None

    # Initialize counters and file
    pass_writer = None
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for record in db_gen:
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1

        # Count pass or fail
        if (record.v_call == 'None' and record.j_call == 'None') or \
                record.functional is None or \
                not record.seq_vdj or \
                not record.junction:
            fail_count += 1
            if fail_writer is not None: fail_writer.writerow(record.toDict())
            continue
        else: 
            pass_count += 1
            
        # Build sample sequence description
        if record.id in id_dict:
            record.id = id_dict[record.id]

        # Parse sequence description into new columns
        if not no_parse:
            record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
            record.id = record.annotations['ID']
            del record.annotations['ID']

        # TODO:  This is not the best approach. should pass in output fields.
        # If first sequence, use parsed description to create new columns and initialize writer
        if pass_writer is None:
            if not no_parse:  ordered_fields.extend(list(record.annotations.keys()))
            pass_handle = open(pass_file, 'wt')
            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)

        # Write row to tab-delim CLIP file
        pass_writer.writerow(record.toDict())
    
    # Print log
    printProgress(rec_count, total_count, 0.05, start_time)

    log = OrderedDict()
    log['OUTPUT'] = pass_file
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'MakeDb'
    printLog(log)
    
    if pass_handle is not None: pass_handle.close()
    if fail_handle is not None: fail_handle.close()
Example #14
def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
    header = parseAnnotation(seq.description, delimiter=delimiter)
    return SeqRecord(Seq(header[field]), id=seq.id)
def processASQueue(alive, data_queue, result_queue, align_func, align_args={}, 
                   calc_div=False, delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing 
            continues; when False function returns
    data_queue = a multiprocessing.Queue holding data to process
    result_queue = a multiprocessing.Queue to hold processed results
    align_func = the function to use for alignment
    align_args = a dictionary of optional arguments for the alignment function
    calc_div = if True perform diversity calculation
    delimiter = a tuple of delimiters for (annotations, field/values, value lists)

    Returns: 
    None
    """
    try:
        # Iterate over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            if data_queue.empty():  continue
            else:  data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:  break
            
            # Define result object
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)
    
            # Perform alignment
            seq_list = data.data
            align_list = align_func(seq_list, **align_args)
    
            # Process alignment
            if align_list is not None:
                # Calculate diversity
                if calc_div:
                    diversity = calculateDiversity(align_list)
                    result.log['DIVERSITY'] = diversity
                
                # Restore quality scores
                has_quality = hasattr(seq_list[0], 'letter_annotations') and \
                              'phred_quality' in seq_list[0].letter_annotations
                if has_quality:
                    qual_dict = {seq.id:seq.letter_annotations['phred_quality'] \
                                 for seq in seq_list}
                    for seq in align_list:
                        qual = deque(qual_dict[seq.id])
                        qual_new = [0 if c == '-' else qual.popleft() for c in seq.seq]
                        seq.letter_annotations['phred_quality'] = qual_new
    
                # Add alignment to log
                if 'field' in align_args:
                    for i, seq in enumerate(align_list):
                        ann = parseAnnotation(seq.description, delimiter=delimiter)
                        primer = ann[align_args['field']]
                        result.log['ALIGN%i:%s' % (i + 1, primer)] = seq.seq
                else:
                    for i, seq in enumerate(align_list):  
                        result.log['ALIGN%i' % (i + 1)] = seq.seq
                
                # Add alignment to results
                result.results = align_list
                result.valid = True
                        
            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        sys.stderr.write('Error processing sequence set with ID: %s.\n' % data.id)
        raise
    
    return None
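# A self-contained sketch of the gap-aware quality restoration in
# processASQueue: gap columns get quality 0, other columns consume the
# original scores in order. The alignment and scores are illustrative.
from collections import deque

aligned = 'AC-GT'
qual = deque([30, 31, 32, 33])
qual_new = [0 if c == '-' else qual.popleft() for c in aligned]
print(qual_new)  # [30, 31, 0, 32, 33]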
def tableHeaders(seq_file, fields, out_args=default_out_args):
    """
    Builds a table of sequence header annotations

    Arguments: 
    seq_file = the sequence file name
    fields = the list of fields to output
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    the output table file name
    """
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = 'table'
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)
    
    # Open file handles
    seq_iter = readSeqFile(seq_file)
    out_handle = getOutputHandle(seq_file, out_label='headers', out_dir=out_args['out_dir'], 
                                 out_name=out_args['out_name'], out_type='tab')
    # Count records
    result_count = countSeqFile(seq_file)
    
    # Open csv writer and write header
    out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='', 
                                delimiter='\t', fieldnames=fields)
    out_writer.writeheader()
    
    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time)
        
        # Get annotations
        seq_count += 1
        ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter'])

        # Write records
        if ann:
            pass_count += 1
            out_writer.writerow(ann)
        else:
            fail_count += 1
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseHeaders'
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
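# A self-contained sketch of the DictWriter configuration in tableHeaders:
# extrasaction='ignore' drops unrequested fields and restval='' fills
# missing ones. The rows are illustrative.
import csv
import io

out = io.StringIO()
writer = csv.DictWriter(out, fieldnames=['ID', 'SAMPLE'], restval='',
                        extrasaction='ignore', delimiter='\t')
writer.writeheader()
writer.writerow({'ID': 'read1', 'SAMPLE': 'A', 'EXTRA': 'dropped'})
print(out.getvalue())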
Example #17
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None, 
                inner=False, keep_missing=False, out_file=None, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments: 
      seq_file : filename of the sequence file to sample from.
      max_missing : number of ambiguous characters to allow in a unique sequence.
      uniq_fields : a list of annotations that define a sequence as unique if they differ.
      copy_fields : a list of annotations to copy into unique sequence annotations.
      copy_actions : the list of collapseAnnotation actions to take on copy_fields.
      max_field : a numeric field whose maximum value determines the retained sequence.
      min_field : a numeric field whose minimum value determines the retained sequence.
      inner : if True exclude consecutive outer ambiguous characters from iterations and matching.
      keep_missing : if True retain sequences with more ambiguous characters than max_missing as unique.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
              
    Returns: 
      str: the collapsed output file name.
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)
    
    # Read input file
    in_type = getFileType(seq_file)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Open unique record output handle
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file,
                                      'collapse-unique',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Added list typing for compatibility issue with Python 2.7.5 on OS X
    # TypeError: object of type 'dictionary-keyiterator' has no len()
    search_keys = list(seq_dict.keys())
    dup_keys = []
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, 
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner, 
                                                         out_args['delimiter'])

        # Update list of duplicates
        dup_keys.extend(dup_list)
                
        # Break if no keys to search remain
        if len(search_keys) == 0:  break

    # Write unique sequences
    for val in uniq_dict.values():
        # Define output sequence
        out_seq = val.seq
        out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
        out_app = OrderedDict()
        if copy_fields is not None and copy_actions is not None:
            for f, a in zip(copy_fields, copy_actions):
                x = collapseAnnotation(val.annotations, a, f, delimiter=out_args['delimiter'])
                out_app[f] = x[f]
                out_ann.pop(f, None)
        out_app['DUPCOUNT'] = val.count
        out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
        out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
        out_seq.description = ''
        # Write unique sequence
        SeqIO.write(out_seq, pass_handle, out_args['out_type'])

        # Update log
        log = OrderedDict()
        log['HEADER'] = out_seq.id
        log['DUPCOUNT'] = val.count
        for i, k in enumerate(val.keys, start=1):
            log['ID%i' % i] = k
        for i, k in enumerate(val.keys, start=1):
            log['SEQ%i' % i] = str(seq_dict[k].seq)
        printLog(log, handle=log_handle)

    # Write sequence with high missing character counts
    if keep_missing:
        for k in search_keys:
            out_seq = seq_dict[k]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            SeqIO.write(out_seq, pass_handle, out_args['out_type'])

    # Write sequence with high missing character counts
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    if out_args['failed']:
        # Write duplicate sequences 
        with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], 
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)
        
    # Close file handles
    pass_file = pass_handle.name
    if pass_handle is not None:  pass_handle.close()
    if log_handle is not None:  log_handle.close()
    
    return pass_file
Example #18
def findUniqueSeq(uniq_dict, search_keys, seq_dict, max_missing=default_max_missing, 
                  uniq_fields=None, copy_fields=None, max_field=None, min_field=None, 
                  inner=False, delimiter=default_delimiter):
    """
    Finds unique sequences 

    Arguments: 
      uniq_dict : a dictionary of unique sequences generated by findUniqueSeq().
      search_keys : a list containing the subset of dictionary keys to be checked.
      seq_dict : a SeqRecords dictionary generated by SeqIO.index().
      max_missing : the number of missing characters to allow in a unique sequence.
      uniq_fields : a list of annotations that define a sequence as unique if they differ.
      copy_fields : a list of annotations to copy into unique sequence annotations.
      max_field : a numeric field whose maximum value determines the retained sequence.
      min_field : a numeric field whose minimum value determines the retained sequence.
      inner : if True exclude consecutive outer ambiguous characters from iterations and matching.
      delimiter : description field delimiter.
    
    Returns: 
      tuple: (uniq_dict, search_keys, dup_keys) modified from passed values.
    """
    # Define local variables
    ambig_re = re.compile(r'[\.\-N]')
    score = (max_missing > 0)
    dup_keys = []
    to_remove = []
    
    start_time = time()
    result_count = len(search_keys)
    # Iterate over search keys and update uniq_dict and dup_keys
    for idx, key in enumerate(search_keys):
        # Print progress of previous iteration
        printProgress(idx, result_count, 0.05, start_time=start_time, task='%i missing' % max_missing)
        
        # Define sequence to process
        seq = seq_dict[key]
        seq_str = str(seq.seq)
        if inner:  seq_str = seq_str.strip('.-N')
        
        # Skip processing of ambiguous sequences over max_missing threshold 
        ambig_count = len(ambig_re.findall(seq_str))
        if ambig_count > max_missing:  continue
        
        # Parse annotation and define unique identifiers (uid)
        if uniq_fields is not None:
            ann = parseAnnotation(seq_dict[key].description, uniq_fields, delimiter=delimiter)
            uid = tuple(chain([seq_str], list(ann.values())))             
        else:
            uid = (seq_str, None)

        # Parse annotation and define copied identifiers (cid)        
        if copy_fields is not None:
            ann = parseAnnotation(seq.description, copy_fields, delimiter=delimiter)
            cid = {k:[ann.get(k)] for k in copy_fields}
        else:
            cid = {}

        # Store new unique sequences and process duplicates
        match = findUID(uid, uniq_dict, score)
        if match is None:
            uniq_dict[uid] = DuplicateSet(seq, key=key, missing=ambig_count, annotations=cid)
        else:
            # Update the duplicate count, key list, and copied annotations
            dup_key = key
            uniq_dict[match].count += 1
            uniq_dict[match].keys.append(key)
            for k, v in cid.items():
                uniq_dict[match].annotations[k].extend(v)
            # Check whether to replace previous unique sequence with current sequence
            if ambig_count <= uniq_dict[match].missing:
                swap = False
                seq_last = uniq_dict[match].seq
                if max_field is not None:
                    swap = float(parseAnnotation(seq.description, delimiter=delimiter)[max_field]) > \
                           float(parseAnnotation(seq_last.description, delimiter=delimiter)[max_field])
                elif min_field is not None:
                    swap = float(parseAnnotation(seq.description, delimiter=delimiter)[min_field]) > \
                           float(parseAnnotation(seq_last.description, delimiter=delimiter)[min_field])
                # TODO:  quality evaluation is a bottleneck
                else:
                    if hasattr(seq, 'letter_annotations') and 'phred_quality' in seq.letter_annotations:
                        q_this = float(sum(seq.letter_annotations['phred_quality'])) / len(seq)
                        q_last = float(sum(seq_last.letter_annotations['phred_quality'])) / len(seq_last)
                        swap = q_this > q_last
                # Replace old sequence if criteria passed
                if swap:
                    dup_key = seq_last.id
                    uniq_dict[match].seq = seq
                    uniq_dict[match].missing = ambig_count

            # Update duplicate list
            dup_keys.append(dup_key)

        # Mark seq for removal from later steps
        to_remove.append(idx)
        
    # Remove matched sequences from search_keys
    for j in reversed(to_remove):  del search_keys[j]

    # Update progress
    printProgress(result_count, result_count, 0.05, start_time=start_time, task='%i missing' % max_missing)
        
    return (uniq_dict, search_keys, dup_keys)
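# A self-contained sketch of the ambiguity filter in findUniqueSeq: count
# '.', '-', and 'N' characters and skip sequences over max_missing. The
# sequence string is illustrative.
import re

ambig_re = re.compile(r'[\.\-N]')
seq_str = 'ACGTN-N.ACGT'
print(len(ambig_re.findall(seq_str)))  # 4 ambiguous characters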
def findUniqueSeq(uniq_dict, search_keys, seq_dict, max_missing=default_max_missing, 
                  uniq_fields=None, copy_fields=None, max_field=None, min_field=None, 
                  inner=False, delimiter=default_delimiter):
    """
    Finds unique sequences 

    Arguments: 
    uniq_dict = a dictionary of unique sequences generated by findUniqueSeq()
    search_keys = a list containing the subset of dictionary keys to be checked
    seq_dict = a SeqRecords dictionary generated by SeqIO.index()
    max_missing = the number of missing characters to allow in a unique sequence
    uniq_fields = a list of annotations that define a sequence as unique if they differ
    copy_fields = a list of annotations to copy into unique sequence annotations
    max_field = a numeric field whose maximum value determines the retained sequence
    min_field = a numeric field whose minimum value determines the retained sequence
    inner = if True exclude consecutive outer ambiguous characters from iterations and matching
    delimiter = description field delimiter
    
    Returns: 
    a tuple of (uniq_dict, search_keys, dup_keys) modified from passed values
    """
    # Define local variables
    ambig_re = re.compile(r'[\.\-N]')
    score = (max_missing > 0)
    dup_keys = []
    to_remove = []
    
    start_time = time()
    result_count = len(search_keys)
    print('MISSING>  %i' % max_missing)
    # Iterate over search keys and update uniq_dict and dup_keys
    for idx, key in enumerate(search_keys):
        # Print progress of previous iteration
        printProgress(idx, result_count, 0.05, start_time)
        
        # Define sequence to process
        seq = seq_dict[key]
        seq_str = str(seq.seq)
        if inner:  seq_str = seq_str.strip('.-N')
        
        # Skip processing of ambiguous sequences over max_missing threshold 
        ambig_count = len(ambig_re.findall(seq_str))
        if ambig_count > max_missing:  continue
        
        # Parse annotation and define unique identifiers (uid)
        if uniq_fields is not None:
            ann = parseAnnotation(seq_dict[key].description, uniq_fields, delimiter=delimiter)
            uid = tuple(chain([seq_str], list(ann.values())))             
        else:
            uid = (seq_str, None)

        # Parse annotation and define copied identifiers (cid)        
        if copy_fields is not None:
            ann = parseAnnotation(seq.description, copy_fields, delimiter=delimiter)
            cid = [[ann.get(k)] for k in copy_fields]
        else:
            cid = []

        # Store new unique sequences and process duplicates
        match = findUID(uid, uniq_dict, score)
        if match is None:
            uniq_dict[uid] = list(chain([seq, 1, ambig_count], cid))
        else:
            # Update the duplicate count and copied annotations
            dup_key = key
            uniq_dict[match][1] += 1
            for x, c in enumerate(cid):
                uniq_dict[match][3 + x].extend(c)
            # Check whether to replace previous unique sequence with current sequence
            if ambig_count <= uniq_dict[match][2]:
                swap = False
                seq_last = uniq_dict[match][0]
                if max_field is not None:
                    swap = float(parseAnnotation(seq.description, delimiter=delimiter)[max_field]) > \
                           float(parseAnnotation(seq_last.description, delimiter=delimiter)[max_field])
                elif min_field is not None:
                    swap = float(parseAnnotation(seq.description, delimiter=delimiter)[min_field]) > \
                           float(parseAnnotation(seq_last.description, delimiter=delimiter)[min_field])
                # TODO:  quality evaluation is a bottleneck
                else:
                    if hasattr(seq, 'letter_annotations') and 'phred_quality' in seq.letter_annotations:
                        q_this = float(sum(seq.letter_annotations['phred_quality'])) / len(seq)
                        q_last = float(sum(seq_last.letter_annotations['phred_quality'])) / len(seq_last)
                        swap = q_this > q_last
                # Replace old sequence if criteria passed
                if swap:
                    dup_key = seq_last.id
                    uniq_dict[match][0] = seq
                    uniq_dict[match][2] = ambig_count
                    
            # Update duplicate list
            dup_keys.append(dup_key)

        # Mark seq for removal from later steps
        to_remove.append(idx)
        
    # Remove matched sequences from search_keys
    for j in reversed(to_remove):  del search_keys[j]

    # Update progress
    printProgress(result_count, result_count, 0.05, start_time)
        
    return (uniq_dict, search_keys, dup_keys)
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None, 
                inner=False, keep_missing=False, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments: 
    seq_file = filename of the sequence file to sample from
    max_missing = number of ambiguous characters to allow in a unique sequence
    uniq_fields = a list of annotations that define a sequence as unique if they differ
    copy_fields = a list of annotations to copy into unique sequence annotations
    copy_actions = the list of collapseAnnotation actions to take on copy_fields 
    max_field = a numeric field whose maximum value determines the retained sequence
    min_field = a numeric field whose minimum value determines the retained sequence
    inner = if True exclude consecutive outer ambiguous characters from iterations and matching
    keep_missing = if True retain sequences with more ambiguous characters than max_missing as unique
    out_args = common output argument dictionary from parseCommonArgs
              
    Returns: 
    the collapsed output file name
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)
    
    # TODO:  storing all sequences in memory is faster
    # Read input file
    in_type = getFileType(seq_file)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Added list typing for compatibility issue with Python 2.7.5 on OS X
    # TypeError: object of type 'dictionary-keyiterator' has no len()
    search_keys = list(seq_dict.keys())
    dup_keys = []
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, 
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner, 
                                                         out_args['delimiter'])
        # Update list of duplicates
        dup_keys.extend(dup_list)

        # Update log
        log = OrderedDict()
        log['ITERATION'] = n + 1
        log['MISSING'] = n 
        log['UNIQUE'] = len(uniq_dict) 
        log['DUPLICATE'] = len(dup_keys) 
        log['UNDETERMINED'] = len(search_keys)
        printLog(log, handle=log_handle)
                
        # Break if no keys to search remain
        if len(search_keys) == 0:  break
    
    # Write unique sequences
    with getOutputHandle(seq_file, 'collapse-unique', out_dir=out_args['out_dir'], 
                         out_name=out_args['out_name'], out_type=out_args['out_type']) \
            as uniq_handle:
        for val in uniq_dict.values():
            # Define output sequence
            out_seq = val[0]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_app = OrderedDict()
            if copy_fields is not None and copy_actions is not None:
                for f, a, s in zip(copy_fields, copy_actions, val[3:]):
                    out_app[f] = s
                    out_app = collapseAnnotation(out_app, a, f, delimiter=out_args['delimiter'])
                    out_ann.pop(f, None)
            out_app['DUPCOUNT'] = val[1]
            out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            # Write unique sequence
            SeqIO.write(out_seq, uniq_handle, out_args['out_type'])
    
        # Write sequence with high missing character counts
        if keep_missing:
            for k in search_keys:
                out_seq = seq_dict[k]
                out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
                out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter'])
                out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
                out_seq.description = ''
                SeqIO.write(out_seq, uniq_handle, out_args['out_type'])

    # Write sequence with high missing character counts
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    if out_args['failed']:
        # Write duplicate sequences 
        with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], 
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(uniq_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)
        
    # Close file handles
    if log_handle is not None:  log_handle.close()
    
    return uniq_handle.name
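The writer above rebuilds each header by parsing the description, merging in new fields such as DUPCOUNT, and flattening the result back into an ID string. A minimal sketch of that round trip with toy stand-ins for the helpers (the real parseAnnotation/mergeAnnotation/flattenAnnotation live in pRESTO and use the default ('|', '=', ',') delimiters):

from collections import OrderedDict

DELIM = ('|', '=', ',')  # (fields, key/value, value lists)

def parse_ann(desc, delim=DELIM):
    # First token is the ID; the rest are KEY=VALUE pairs
    parts = desc.split(delim[0])
    ann = OrderedDict([('ID', parts[0])])
    ann.update(p.split(delim[1], 1) for p in parts[1:])
    return ann

def flatten_ann(ann, delim=DELIM):
    # Rebuild 'ID|KEY=VALUE|...' from the parsed dictionary
    pairs = [delim[1].join((k, str(v))) for k, v in ann.items() if k != 'ID']
    return delim[0].join([ann.get('ID', '')] + pairs)

header = parse_ann('READ1|BARCODE=ACGT')
header['DUPCOUNT'] = 10                    # same effect as mergeAnnotation here
print(flatten_ann(header))                 # READ1|BARCODE=ACGT|DUPCOUNT=10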
Example #21
0
def processAssembly(data, assemble_func, assemble_args={}, rc=None,
                    fields_1=None, fields_2=None, delimiter=default_delimiter):
    """
    Performs assembly of a sequence pair

    Arguments:
    data = a SeqData object with a list of exactly two SeqRecords
    assemble_func = the function to use to assemble paired ends
    assemble_args = a dictionary of arguments to pass to the assembly function
    rc = Defines which sequences ('head','tail','both') to reverse complement
         before assembly; if None do not reverse complement sequences
    fields_1 = list of annotations in head SeqRecord to copy to assembled record;
               if None do not copy an annotation
    fields_2 = list of annotations in tail SeqRecord to copy to assembled record;
               if None do not copy an annotation
    delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
    a SeqResult object
    """
    # Reverse complement sequences if required
    head_seq = data.data[0] if rc not in ('head', 'both') \
               else reverseComplement(data.data[0])
    tail_seq = data.data[1] if rc not in ('tail', 'both') \
               else reverseComplement(data.data[1])

    # Define result object
    result = SeqResult(data.id, [head_seq, tail_seq])

    # Define stitched sequence annotation
    stitch_ann = OrderedDict([('ID', data.id)])
    if fields_1 is not None:
        head_ann = parseAnnotation(head_seq.description, fields_1,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, head_ann, delimiter=delimiter)
        result.log['FIELDS1'] = '|'.join(['%s=%s' % (k, v)
                                             for k, v in head_ann.items()])
    if fields_2 is not None:
        tail_ann = parseAnnotation(tail_seq.description, fields_2,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, tail_ann, delimiter=delimiter)
        result.log['FIELDS2'] = '|'.join(['%s=%s' % (k, v)
                                             for k, v in tail_ann.items()])

    # Assemble sequences
    stitch = assemble_func(head_seq, tail_seq, **assemble_args)
    ab = stitch.head_pos
    xy = stitch.tail_pos
    result.valid = stitch.valid

    # Add reference to log
    if stitch.ref_seq is not None and stitch.ref_pos is not None:
        result.log['REFID'] = stitch.ref_seq.id
        result.log['REFSEQ'] = ' ' * stitch.ref_pos[0] + stitch.ref_seq.seq

    if ab is not None and xy is not None:
        result.log['SEQ1'] = ' ' * xy[0] + head_seq.seq
        result.log['SEQ2'] = ' ' * ab[0] + tail_seq.seq
    else:
        result.log['SEQ1'] = head_seq.seq
        result.log['SEQ2'] = ' ' * (len(head_seq) + (stitch.gap or 0)) + tail_seq.seq

    # Define stitching log
    if stitch.seq is not None:
        # Update stitch annotation
        stitch.seq.id = flattenAnnotation(stitch_ann, delimiter=delimiter)
        stitch.seq.name = stitch.seq.id
        stitch.seq.description = ''
        result.results = stitch.seq
        # Add assembly to log
        result.log['ASSEMBLY'] = stitch.seq.seq
        if 'phred_quality' in stitch.seq.letter_annotations:
            result.log['QUALITY'] = ''.join([chr(q+33) for q in
                                             stitch.seq.letter_annotations['phred_quality']])
        result.log['LENGTH'] = len(stitch)
        result.log['OVERLAP'] = stitch.overlap
    else:
        result.log['ASSEMBLY'] = None

    # Add mode specific log results
    if stitch.gap is not None:
        result.log['GAP'] = stitch.gap
    if stitch.error is not None:
        result.log['ERROR'] = '%.4f' % stitch.error
    if stitch.pvalue is not None:
        result.log['PVALUE'] = '%.4e' % stitch.pvalue
    if stitch.evalue is not None:
        result.log['EVALUE1'] = '%.4e' % stitch.evalue[0]
        result.log['EVALUE2'] = '%.4e' % stitch.evalue[1]
    if stitch.ident is not None:
        result.log['IDENTITY'] = '%.4f' % stitch.ident


    return result
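The rc switch in processAssembly decides which mate is reverse complemented before stitching. A small sketch of the same dispatch with Biopython records; reverse_complement below is a toy stand-in for pRESTO's reverseComplement helper:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def reverse_complement(rec):
    # Flip the sequence but keep the record ID
    return rec.reverse_complement(id=rec.id, description='')

head = SeqRecord(Seq('ACGTAC'), id='READ1')
tail = SeqRecord(Seq('GTACGT'), id='READ1')

rc = 'tail'  # one of None, 'head', 'tail', 'both'
head_seq = head if rc not in ('head', 'both') else reverse_complement(head)
tail_seq = tail if rc not in ('tail', 'both') else reverse_complement(tail)
print(head_seq.seq, tail_seq.seq)  # ACGTAC ACGTAC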
Example #22
0
def selectSeqFile(seq_file,
                  field,
                  value_list=None,
                  value_file=None,
                  negate=False,
                  out_file=None,
                  out_args=default_out_args):
    """
    Select from a sequence file

    Arguments:
      seq_file : filename of the sequence file to select from.
      field : the annotation field to check for required values.
      value_list : a list of annotation values; a record must contain at least one to pass.
      value_file : a tab delimited file containing values to select.
      negate : if True select entries that do not contain the specified values.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: output file name.
    """

    # Reads value_file
    def _read_file(value_file, field):
        field_list = []
        try:
            with open(value_file, 'rt') as handle:
                reader_dict = csv.DictReader(handle, dialect='excel-tab')
                for row in reader_dict:
                    field_list.append(row[field])
        except IOError:
            printError('File %s cannot be read.' % value_file)
        except Exception:
            printError('File %s is invalid.' % value_file)

        return field_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    if value_list is not None:
        log['VALUE_LIST'] = ','.join([str(x) for x in value_list])
    if value_file is not None:
        log['VALUE_FILE'] = os.path.basename(value_file)
    log['NOT'] = negate
    printLog(log)

    # Read value_file
    if value_list is not None and value_file is not None:
        printError('Specify only one of value_list and value_file.')
    elif value_list is None and value_file is None:
        printError('Must specify either value_list or value_file.')
    elif value_file is not None:
        value_list = _read_file(value_file, field)

    # Read sequence file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None: out_args['out_type'] = in_type

    # Open output handle
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'selected',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])

    # Generate subset of records
    start_time = time()
    pass_count, fail_count, rec_count = 0, 0, 0
    value_set = set(value_list)
    for rec in seq_iter:
        printCount(rec_count, 1e5, start_time=start_time)
        rec_count += 1

        # Parse annotations into a list of values
        ann = parseAnnotation(rec.description,
                              delimiter=out_args['delimiter'])[field]
        ann = ann.split(out_args['delimiter'][2])

        # Write
        if xor(negate, not value_set.isdisjoint(ann)):
            # Write
            SeqIO.write(rec, out_handle, out_args['out_type'])
            pass_count += 1
        else:
            fail_count += 1

    printCount(rec_count, 1e5, start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'SplitSeq'
    printLog(log)

    return out_handle.name
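The pass/fail test in selectSeqFile keeps a record when its annotation values intersect value_set, and negate inverts the decision with an exclusive or (the snippet presumably imports xor from operator). The predicate in isolation:

from operator import xor

def selected(ann_values, value_set, negate=False):
    # True when the record's values hit the set; negate flips the test
    return xor(negate, not value_set.isdisjoint(ann_values))

value_set = {'PS1', 'PS2'}
print(selected(['PS1'], value_set))               # True
print(selected(['PS3'], value_set))               # False
print(selected(['PS3'], value_set, negate=True))  # True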
Example #23
0
def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None,
            amino_acid=False, partial=False, asis_id=True, regions='default',
            writer=AIRRWriter, out_file=None, out_args=default_out_args):
    """
    Writes parsed records to an output file
    
    Arguments: 
      records : an iterator of Receptor objects containing alignment data.
      fields : a list of ordered field names to write.
      aligner_file : input file name.
      total_count : number of records (for progress bar).
      id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
      annotations : additional annotation dictionary.
      amino_acid : if True do verification on amino acid fields.
      partial : if True put incomplete alignments in the pass file.
      asis_id : if True use the sequence ID as-is; if False parse pRESTO-style annotations from the ID using the default delimiters.
      regions (str): name of the IMGT FWR/CDR region definitions to use.
      writer : writer class.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      None
    """
    # Wrapper for opening handles and writers
    def _open(x, f, writer=writer, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(aligner_file,
                                     out_label='db-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle, writer(handle, fields=f)

    # Function to convert fasta header annotations to changeo columns
    def _changeo(f, header):
        h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f]
        f.extend(h)
        return f

    # Function to convert fasta header annotations to AIRR columns
    def _airr(f, header):
        h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f]
        f.extend(h)
        return f

    # Function to verify IMGT-gapped sequence and junction concur
    def _imgt_check(rec):
        try:
            if amino_acid:
                rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions)
                x, y = rd.positions['junction']
                check = (rec.junction_aa == rec.sequence_aa_imgt[x:y])
            else:
                rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions)
                x, y = rd.positions['junction']
                check = (rec.junction == rec.sequence_imgt[x:y])
        except (TypeError, AttributeError):
            check = False
        return check

    # Function to check for valid records strictly
    def _strict(rec):
        if amino_acid:
            valid = [rec.v_call and rec.v_call != 'None',
                     rec.j_call and rec.j_call != 'None',
                     rec.functional is not None,
                     rec.sequence_aa_imgt,
                     rec.junction_aa,
                     _imgt_check(rec)]
        else:
            valid = [rec.v_call and rec.v_call != 'None',
                     rec.j_call and rec.j_call != 'None',
                     rec.functional is not None,
                     rec.sequence_imgt,
                     rec.junction,
                     _imgt_check(rec)]
        return all(valid)

    # Function to check for valid records loosely
    def _gentle(rec):
        valid = [rec.v_call and rec.v_call != 'None',
                 rec.d_call and rec.d_call != 'None',
                 rec.j_call and rec.j_call != 'None']
        return any(valid)

    # Set writer class and annotation conversion function
    if writer == ChangeoWriter:
        _annotate = _changeo
    elif writer == AIRRWriter:
        _annotate = _airr
    else:
        printError('Invalid output writer.')


    # Set pass criteria
    _pass = _gentle if partial else _strict

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Initialize handles, writers and counters
    pass_handle, pass_writer = None, None
    fail_handle, fail_writer = None, None
    pass_count, fail_count = 0, 0
    start_time = time()

    # Validate and write output
    printProgress(0, total_count, 0.05, start_time=start_time)
    for i, record in enumerate(records, start=1):
        # Replace sequence description with full string, if required
        if id_dict is not None and record.sequence_id in id_dict:
            record.sequence_id = id_dict[record.sequence_id]

        # Parse sequence description into new columns
        if not asis_id:
            try:
                ann_raw = parseAnnotation(record.sequence_id)
                record.sequence_id = ann_raw.pop('ID')

                # Convert to Receptor fields
                ann_parsed = OrderedDict()
                for k, v in ann_raw.items():
                    ann_parsed[ChangeoSchema.toReceptor(k)] = v

                # Add annotations to Receptor and update field list
                record.setDict(ann_parsed, parse=True)
                if i == 1:  fields = _annotate(fields, ann_parsed.keys())
            except IndexError:
                # Could not parse pRESTO-style annotations so fall back to no parse
                asis_id = True
                printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.')

        # Add supplemental annotation fields
        if annotations is not None:
            record.setDict(annotations[record.sequence_id], parse=True)
            if i == 1:  fields = _annotate(fields, annotations[record.sequence_id].keys())

        # Count pass or fail and write to appropriate file
        if _pass(record):
            pass_count += 1
            # Write row to pass file
            try:
                pass_writer.writeReceptor(record)
            except AttributeError:
                # Open pass file and writer
                pass_handle, pass_writer = _open('pass', fields)
                pass_writer.writeReceptor(record)
        else:
            fail_count += 1
            # Write row to fail file if specified
            if out_args['failed']:
                try:
                    fail_writer.writeReceptor(record)
                except AttributeError:
                    # Open fail file and writer
                    fail_handle, fail_writer = _open('fail', fields)
                    fail_writer.writeReceptor(record)

        # Write log
        if log_handle is not None:
            log = OrderedDict([('ID', record.sequence_id),
                               ('V_CALL', record.v_call),
                               ('D_CALL', record.d_call),
                               ('J_CALL', record.j_call),
                               ('PRODUCTIVE', record.functional)])
            if not _imgt_check(record) and not amino_acid:
                log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.'
            printLog(log, log_handle)

        # Print progress
        printProgress(i, total_count, 0.05, start_time=start_time)

    # Print console log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'MakeDb'
    printLog(log)

    # Close file handles
    output = {'pass': None, 'fail': None}
    if pass_handle is not None:
        output['pass'] = pass_handle.name
        pass_handle.close()
    if fail_handle is not None:
        output['fail'] = fail_handle.name
        fail_handle.close()

    return output
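writeDb opens its pass and fail outputs lazily: it attempts the write, catches the AttributeError raised while the writer is still None, and only then creates the handle. That avoids empty output files when no record lands in a category. The pattern in miniature, with an in-memory stand-in for the writer class:

rows = []

class ListWriter:
    # Stand-in for the real writer classes: collects records in memory
    def __init__(self, store):
        self.store = store
    def writeReceptor(self, rec):
        self.store.append(rec)

pass_writer = None
for rec in ['REC1', 'REC2']:
    try:
        pass_writer.writeReceptor(rec)
    except AttributeError:
        # First passing record: open the output lazily, then write
        pass_writer = ListWriter(rows)
        pass_writer.writeReceptor(rec)

print(rows)  # ['REC1', 'REC2']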
Example #24
0
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments: 
    seq_file = the sequence file name
    modify_func = the function defining the modification operation
    modify_args = a dictionary of arguments to pass to modify_func
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    the output file name
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):  
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)
    
    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type
    out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type=out_args['out_type'])

    # Count records
    result_count = countSeqFile(seq_file)
    
    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time)
        
        # Update counts
        seq_count += 1
        
        # Modify header
        header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)
        
        # Write new sequence
        seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)    
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'               
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
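Each modify_func in cmd_dict receives the parsed header as an OrderedDict plus keyword arguments and returns the modified dictionary. A hypothetical modify_func of the same shape, renaming one field while preserving field order (the real renameHeader may differ):

from collections import OrderedDict

def rename_field(header, field, name, delimiter=None):
    # Rebuild the dictionary so the renamed field keeps its position
    out = OrderedDict()
    for k, v in header.items():
        out[name if k == field else k] = v
    return out

header = OrderedDict([('ID', 'READ1'), ('PRIMER', 'VH1')])
print(rename_field(header, field='PRIMER', name='VPRIMER'))
# OrderedDict([('ID', 'READ1'), ('VPRIMER', 'VH1')])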
Example #25
0
def processQueue(alive,
                 data_queue,
                 result_queue,
                 cluster_func,
                 cluster_args={},
                 cluster_field=default_cluster_field,
                 cluster_prefix=default_cluster_prefix,
                 delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      cluster_func : the function to use for clustering.
      cluster_args : a dictionary of optional arguments for the clustering function.
      cluster_field : string defining the output cluster field name.
      cluster_prefix : string defining a prefix for the cluster identifier.
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      None
    """
    try:
        # Iterator over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            if data_queue.empty(): continue
            else: data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None: break

            # Define result object
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)

            # Perform clustering
            cluster_dict = cluster_func(data.data, **cluster_args)

            # Process failed result
            if cluster_dict is None:
                # Update log
                result.log['CLUSTERS'] = 0
                for i, seq in enumerate(data.data, start=1):
                    result.log['CLUST0-%i' % i] = str(seq.seq)

                # Feed results queue and continue
                result_queue.put(result)
                continue

            # Get number of clusters
            result.log['CLUSTERS'] = len(cluster_dict)

            # Update sequence annotations with cluster assignments
            results = list()
            seq_dict = {s.id: s for s in data.data}
            for cluster, id_list in cluster_dict.items():
                for i, seq_id in enumerate(id_list, start=1):
                    # Add cluster annotation
                    seq = seq_dict[seq_id]
                    label = '%s%i' % (cluster_prefix, cluster)
                    header = parseAnnotation(seq.description,
                                             delimiter=delimiter)
                    header = mergeAnnotation(header, {cluster_field: label},
                                             delimiter=delimiter)
                    seq.id = seq.name = flattenAnnotation(header,
                                                          delimiter=delimiter)
                    seq.description = ''

                    # Update log and results
                    result.log['CLUST%i-%i' % (cluster, i)] = str(seq.seq)
                    results.append(seq)

            # Check results
            result.results = results
            result.valid = (len(results) == len(seq_dict))
            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())

            return None
    except:
        alive.value = False
        printError('Error processing sequence set with ID: %s.' % data.id,
                   exit=False)
        raise

    return None
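The worker functions here all follow the same queue protocol: poll until a None sentinel arrives, emit one result per work item, and drop the shared alive flag on error so sibling processes can exit. A stripped-down sketch of that loop without the error plumbing:

import multiprocessing as mp

def worker(alive, data_queue, result_queue):
    while alive.value:
        if data_queue.empty():
            continue
        data = data_queue.get()
        if data is None:  # sentinel: no more work
            break
        result_queue.put(data * 2)

if __name__ == '__main__':
    alive = mp.Value('b', True)
    dq, rq = mp.Queue(), mp.Queue()
    p = mp.Process(target=worker, args=(alive, dq, rq))
    p.start()
    for x in [1, 2, 3]:
        dq.put(x)
    dq.put(None)
    for _ in range(3):
        print(rq.get())  # 2, 4, 6
    p.join()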
Example #26
0
def processCSQueue(alive, data_queue, result_queue, cluster_field,
                   cluster_args={}, delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing
            continues; when False function returns
    data_queue = a multiprocessing.Queue holding data to process
    result_queue = a multiprocessing.Queue to hold processed results
    cluster_field = string defining the output cluster field name
    cluster_args = a dictionary of optional arguments for the clustering function
    delimiter = a tuple of delimiters for (annotations, field/values, value lists)

    Returns:
    None
    """
    try:
        # Iterator over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            if data_queue.empty():  continue
            else:  data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:  break

            # Define result object
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)

            # Perform clustering
            cluster_dict = runUClust(data.data, **cluster_args)

            # Process failed result
            if cluster_dict is None:
                # Update log
                result.log['CLUSTERS'] = 0
                for i, seq in enumerate(data.data, start=1):
                    result.log['CLUST0-%i' % i] = str(seq.seq)

                # Feed results queue and continue
                result_queue.put(result)
                continue

            # Get number of clusters
            result.log['CLUSTERS'] = len(cluster_dict)

            # Update sequence annotations with cluster assignments
            results = list()
            seq_dict = {s.id: s for s in data.data}
            for clust, id_list in cluster_dict.items():
                for i, seq_id in enumerate(id_list, start=1):
                    # Add cluster annotation
                    seq = seq_dict[seq_id]
                    header = parseAnnotation(seq.description, delimiter=delimiter)
                    header = mergeAnnotation(header, {cluster_field:clust}, delimiter=delimiter)
                    seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
                    seq.description = ''

                    # Update log and results
                    result.log['CLUST%i-%i' % (clust, i)] = str(seq.seq)
                    results.append(seq)

            # Check results
            result.results = results
            result.valid = (len(results) == len(seq_dict))

            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())

            return None
    except:
        alive.value = False
        sys.stderr.write('Error processing sequence set with ID: %s.\n' % data.id)

        raise

    return None
Example #27
0
def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
            out_args=default_out_args):
    """
    Writes tab-delimited database file in output directory.
    
    Arguments:
      db : an iterator of IgRecord objects containing alignment data.
      fields : a list of ordered field names to write.
      file_prefix : directory and prefix for CLIP tab-delim file.
      total_count : number of records (for progress bar).
      id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
      partial : if True put incomplete alignments in the pass file.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      None
    """
    # Function to check for valid records strictly
    def _pass_strict(rec):
        valid = [rec.v_call and rec.v_call != 'None',
                 rec.j_call and rec.j_call != 'None',
                 rec.functional is not None,
                 rec.seq_vdj,
                 rec.junction]
        return all(valid)

    # Function to check for valid records loosely
    def _pass_gentle(rec):
        valid = [rec.v_call and rec.v_call != 'None',
                 rec.d_call and rec.d_call != 'None',
                 rec.j_call and rec.j_call != 'None']
        return any(valid)

    # Set pass criteria
    _pass = _pass_gentle if partial else _pass_strict

    # Define output file names
    pass_file = '%s_db-pass.tab' % file_prefix
    fail_file = '%s_db-fail.tab' % file_prefix

    # Initiate handles, writers and counters
    pass_handle = None
    fail_handle = None
    pass_writer = None
    fail_writer = None
    start_time = time()
    rec_count = pass_count = fail_count = 0

    # Validate and write output
    printProgress(0, total_count, 0.05, start_time)
    for i, record in enumerate(db, start=1):

        # Replace sequence description with full string, if required
        if id_dict is not None and record.id in id_dict:
            record.id = id_dict[record.id]

        # Parse sequence description into new columns
        if not no_parse:
            try:
                record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
                record.id = record.annotations['ID']
                del record.annotations['ID']

                # TODO: This is not the best approach; should pass in output fields.
                # If first record, use parsed description to define extra columns
                if i == 1:  fields.extend(list(record.annotations.keys()))
            except IndexError:
                # Could not parse pRESTO-style annotations so fall back to no parse
                no_parse = True
                sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')

        # Count pass or fail and write to appropriate file
        if _pass(record):
            # Open pass file
            if pass_writer is None:
                pass_handle = open(pass_file, 'wt')
                pass_writer = getDbWriter(pass_handle, add_fields=fields)

            # Write row to pass file
            pass_count += 1
            pass_writer.writerow(record.toDict())
        else:
            # Open failed file
            if out_args['failed'] and fail_writer is None:
                fail_handle = open(fail_file, 'wt')
                fail_writer = getDbWriter(fail_handle, add_fields=fields)

            # Write row to fail file if specified
            fail_count += 1
            if fail_writer is not None:
                fail_writer.writerow(record.toDict())

        # Print progress
        printProgress(i, total_count, 0.05, start_time)

    # Print console log
    log = OrderedDict()
    log['OUTPUT'] = pass_file
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'MakeDb'
    printLog(log)
    
    if pass_handle is not None: pass_handle.close()
    if fail_handle is not None: fail_handle.close()
Example #28
0
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None,
            coord_type=default_coord_type,
            out_args=default_out_args):
    """
    Synchronizes paired-end files and copies annotations between them

    Arguments: 
    seq_file_1 = the file containing the grouped sequences and annotations
    seq_file_2 = the file to assign annotations to from seq_file_1
    fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records;
               if None do not copy any annotations
    fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records;
               if None do not copy any annotations
    coord_type = the sequence header format
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2)
    """
    # Define private functions
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else: 
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else: 
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], 
                                    out_name=out_name_1, out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], 
                                    out_name=out_name_2, out_type=out_type_2)

    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_1, out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_2, out_type=out_type_2)
        pass_keys = list()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id, coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
                                                delimiter=out_args['delimiter'])
                    seq_2.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False,
                                                delimiter=out_args['delimiter'])
                    seq_1.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and updated paired key list for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None:  pass_keys.append(coord_2)
            else:  SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        pass_keys = set(pass_keys)
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:  SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)
   
    # Close file handles
    pass_handle_1.close()
    pass_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
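pairSeq streams file 2 against an index of file 1 keyed on read coordinates, so only one file is held in memory. The same join in miniature, with plain dictionaries standing in for the indexed SeqIO handle and a toy coordinate key for Illumina-style '/1', '/2' mate suffixes:

def coord_key(seq_id):
    # Toy stand-in for getCoordKey: strip the mate suffix
    return seq_id.split('/')[0]

file_1 = {'R1/1': 'ACGT', 'R2/1': 'GGCC'}      # indexed file
file_2 = [('R2/2', 'TTAA'), ('R3/2', 'CCGG')]  # streamed file

index_1 = {coord_key(k): v for k, v in file_1.items()}
pairs, unpaired_2 = [], []
for seq_id, seq in file_2:
    mate = index_1.get(coord_key(seq_id))
    (pairs if mate is not None else unpaired_2).append((seq_id, seq))

print(pairs)       # [('R2/2', 'TTAA')]
print(unpaired_2)  # [('R3/2', 'CCGG')]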
Example #29
0
def tableHeaders(seq_file, fields, out_file=None, out_args=default_out_args):
    """
    Builds a table of sequence header annotations

    Arguments: 
      seq_file : the sequence file name.
      fields : the list of fields to output.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      str: output table file name
    """
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = 'table'
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)
    
    # Open file handles
    seq_iter = readSeqFile(seq_file)
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'headers',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type='tab')
    # Count records
    result_count = countSeqFile(seq_file)
    
    # Open csv writer and write header
    out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='', 
                                delimiter='\t', fieldnames=fields)
    out_writer.writeheader()
    
    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        
        # Get annotations
        seq_count += 1
        ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter'])

        # Write records
        if ann:
            pass_count += 1
            out_writer.writerow(ann)
        else:
            fail_count += 1
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseHeaders'
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
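tableHeaders leans on csv.DictWriter's extrasaction='ignore' and restval='' so headers with extra or missing annotations still yield well-formed rows. A compact demonstration:

import csv
import sys
from collections import OrderedDict

rows = [OrderedDict([('ID', 'R1'), ('BARCODE', 'ACGT'), ('EXTRA', 'x')]),
        OrderedDict([('ID', 'R2')])]

writer = csv.DictWriter(sys.stdout, fieldnames=['ID', 'BARCODE'],
                        delimiter='\t', extrasaction='ignore', restval='')
writer.writeheader()
for row in rows:
    writer.writerow(row)  # EXTRA is dropped; missing BARCODE becomes ''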
Example #30
0
 def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
     header = parseAnnotation(seq.description, delimiter=delimiter)
     return header[field]
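A key function like _barcode is typically used to group reads by their barcode annotation before consensus building. A toy grouping with the same idea, parsing the BARCODE field (field name assumed) out of pRESTO-style headers:

from collections import defaultdict

def barcode_key(description, field='BARCODE'):
    # Pull KEY=VALUE pairs from an 'ID|KEY=VALUE|...' style header
    ann = dict(p.split('=', 1) for p in description.split('|')[1:])
    return ann[field]

reads = ['R1|BARCODE=ACGT', 'R2|BARCODE=ACGT', 'R3|BARCODE=TTTT']
groups = defaultdict(list)
for desc in reads:
    groups[barcode_key(desc)].append(desc.split('|')[0])

print(dict(groups))  # {'ACGT': ['R1', 'R2'], 'TTTT': ['R3']}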
Example #31
0
def assemblyWorker(data,
                   assemble_func,
                   assemble_args={},
                   rc='tail',
                   fields_1=None,
                   fields_2=None,
                   delimiter=default_delimiter):
    """
    Performs assembly of a sequence pair

    Arguments:
      data : a SeqData object with a list of exactly two SeqRecords.
      assemble_func : the function to use to assemble paired ends.
      assemble_args : a dictionary of arguments to pass to the assembly function.
      rc : defines which sequences ('head', 'tail', 'both', 'none') to reverse complement
           before assembly; if None or 'none' do not reverse complement sequences.
      fields_1 : list of annotations in head SeqRecord to copy to assembled record;
                 if None do not copy an annotation.
      fields_2 : list of annotations in tail SeqRecord to copy to assembled record;
                 if None do not copy an annotation.
      delimiter : a tuple of delimiters for (fields, values, value lists).

    Returns:
      SeqResult: a SeqResult object
    """
    # Define result object
    result = SeqResult(data.id, data.data)

    # Reverse complement sequences if required
    head_seq = data.data[0] if rc not in ('head', 'both') \
               else reverseComplement(data.data[0])
    tail_seq = data.data[1] if rc not in ('tail', 'both') \
               else reverseComplement(data.data[1])

    # Define stitched sequence annotation
    stitch_ann = OrderedDict([('ID', data.id)])
    if fields_1 is not None:
        head_ann = parseAnnotation(head_seq.description,
                                   fields_1,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, head_ann, delimiter=delimiter)
        result.log['FIELDS1'] = '|'.join(
            ['%s=%s' % (k, v) for k, v in head_ann.items()])
    if fields_2 is not None:
        tail_ann = parseAnnotation(tail_seq.description,
                                   fields_2,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, tail_ann, delimiter=delimiter)
        result.log['FIELDS2'] = '|'.join(
            ['%s=%s' % (k, v) for k, v in tail_ann.items()])

    # Assemble sequences
    stitch = assemble_func(head_seq, tail_seq, **assemble_args)
    ab = stitch.head_pos
    xy = stitch.tail_pos
    result.valid = stitch.valid

    # Add reference to log
    if stitch.ref_seq is not None and stitch.ref_pos is not None:
        result.log['REFID'] = stitch.ref_seq.id
        result.log['REFSEQ'] = ' ' * stitch.ref_pos[0] + stitch.ref_seq.seq

    if ab is not None and xy is not None:
        result.log['SEQ1'] = ' ' * xy[0] + head_seq.seq
        result.log['SEQ2'] = ' ' * ab[0] + tail_seq.seq
    else:
        result.log['SEQ1'] = head_seq.seq
        result.log['SEQ2'] = ' ' * (len(head_seq) +
                                    (stitch.gap or 0)) + tail_seq.seq

    # Define stitching log
    if stitch.seq is not None:
        # Update stitch annotation
        stitch.seq.id = flattenAnnotation(stitch_ann, delimiter=delimiter)
        stitch.seq.name = stitch.seq.id
        stitch.seq.description = ''
        result.results = stitch.seq
        # Add assembly to log
        result.log['ASSEMBLY'] = stitch.seq.seq
        if 'phred_quality' in stitch.seq.letter_annotations:
            result.log['QUALITY'] = ''.join([
                chr(q + 33)
                for q in stitch.seq.letter_annotations['phred_quality']
            ])
        result.log['LENGTH'] = len(stitch)
        result.log['OVERLAP'] = stitch.overlap
    else:
        result.log['ASSEMBLY'] = None

    # Add mode specific log results
    if stitch.gap is not None:
        result.log['GAP'] = stitch.gap
    if stitch.error is not None:
        result.log['ERROR'] = '%.4f' % stitch.error
    if stitch.pvalue is not None:
        result.log['PVALUE'] = '%.4e' % stitch.pvalue
    if stitch.evalue is not None:
        result.log['EVALUE1'] = '%.4e' % stitch.evalue[0]
        result.log['EVALUE2'] = '%.4e' % stitch.evalue[1]
    if stitch.ident is not None:
        result.log['IDENTITY'] = '%.4f' % stitch.ident

    return result
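The QUALITY log line above turns Biopython's numeric phred_quality list back into an ASCII string using the Phred+33 offset. The encoding and its inverse:

quals = [40, 30, 20, 2]
encoded = ''.join(chr(q + 33) for q in quals)
print(encoded)                       # I?5#
decoded = [ord(c) - 33 for c in encoded]
print(decoded == quals)              # True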
Example #32
0
def processQueue(alive, data_queue, result_queue, align_func, align_args={},
                 calc_div=False, delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments: 
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      data_queue : a multiprocessing.Queue holding data to process
      result_queue : a multiprocessing.Queue to hold processed results
      align_func : the function to use for alignment
      align_args : a dictionary of optional arguments for the alignment function
      calc_div : if True perform diversity calculation
      delimiter : a tuple of delimiters for (annotations, field/values, value lists)

    Returns:
      None
    """
    try:
        # Iterator over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            if data_queue.empty():  continue
            else:  data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:  break
            
            # Define result object
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)
    
            # Perform alignment
            seq_list = data.data
            align_list = align_func(seq_list, **align_args)
    
            # Process alignment
            if align_list is not None:
                # Calculate diversity
                if calc_div:
                    diversity = calculateDiversity(align_list)
                    result.log['DIVERSITY'] = diversity
                
                # Restore quality scores
                has_quality = hasattr(seq_list[0], 'letter_annotations') and \
                              'phred_quality' in seq_list[0].letter_annotations
                if has_quality:
                    qual_dict = {seq.id:seq.letter_annotations['phred_quality'] \
                                 for seq in seq_list}
                    for seq in align_list:
                        qual = deque(qual_dict[seq.id])
                        qual_new = [0 if c == '-' else qual.popleft() for c in seq.seq]
                        seq.letter_annotations['phred_quality'] = qual_new
    
                # Add alignment to log
                if 'field' in align_args:
                    for i, seq in enumerate(align_list):
                        ann = parseAnnotation(seq.description, delimiter=delimiter)
                        primer = ann[align_args['field']]
                        result.log['ALIGN%i:%s' % (i + 1, primer)] = seq.seq
                else:
                    for i, seq in enumerate(align_list):  
                        result.log['ALIGN%i' % (i + 1)] = seq.seq
                
                # Add alignment to results
                result.results = align_list
                result.valid = True
                        
            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        printError('Processing sequence set with ID: %s.' % data.id, exit=False)
        raise
    
    return None
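The quality restoration above works because an alignment only inserts gaps: consuming the original scores left to right and emitting 0 at each '-' realigns the quality string with the gapped sequence. The trick in isolation:

from collections import deque

original_qual = [40, 38, 36, 35]
aligned_seq = 'AC-GT'  # one gap inserted by the aligner

qual = deque(original_qual)
qual_new = [0 if c == '-' else qual.popleft() for c in aligned_seq]
print(qual_new)  # [40, 38, 0, 36, 35]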
Example #33
0
def pairSeq(seq_file_1,
            seq_file_2,
            fields_1=None,
            fields_2=None,
            action=None,
            coord_type=default_coord,
            out_args=default_out_args):
    """
    Synchronizes paired-end files and copies annotations between them

    Arguments: 
      seq_file_1 : the file containing the grouped sequences and annotations.
      seq_file_2 : the file to assign annotations to from seq_file_1.
      fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records;
                 if None do not copy any annotations.
      fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records;
                 if None do not copy any annotations.
      action : the collapse action to take on all copied annotation if they already exist in the
               target header.
      coord_type : the sequence header format.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      list: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2).
    """

    # Define private functions
    def _key_func(x):
        return getCoordKey(x,
                           coord_type=coord_type,
                           delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_1,
                                    out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_2,
                                    out_type=out_type_2)

    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_1,
                                        out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_2,
                                        out_type=out_type_2)
        pass_keys = list()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id,
                              coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(
                        ann_2,
                        copy_ann,
                        prepend=True,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_1,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_2.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(
                        ann_1,
                        copy_ann,
                        prepend=False,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_2,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_1.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and updated paired key list for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None: pass_keys.append(coord_2)
            else: SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        pass_keys = set(pass_keys)
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:
            SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)

    # Close file handles
    pass_handle_1.close()
    pass_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
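When a copied field collides with one already in the target header, mergeAnnotation joins the values into a list and collapseAnnotation then reduces the list with the chosen action. A toy version of that merge-then-collapse, assuming a numeric field and a 'min' action (the real helpers support several actions):

def merge_values(existing, incoming, sep=','):
    # mergeAnnotation-style collision: keep both values, prepended copy first
    return '%s%s%s' % (incoming, sep, existing)

def collapse_min(value, sep=','):
    # collapseAnnotation(..., action='min')-style reduction
    return str(min(int(v) for v in value.split(sep)))

merged = merge_values('3', '1')  # target already had 3; copy in 1
print(merged)                    # 1,3
print(collapse_min(merged))      # 1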