def buildSeqRecord(db_record, id_field, seq_field, meta_fields=None):
    """
    Parses a database record into a SeqRecord

    Arguments: 
      db_record : a dictionary containing a database record.
      id_field : the field containing identifiers.
      seq_field : the field containing sequences.
      meta_fields : a list of fields to add to sequence annotations.

    Returns: 
      Bio.SeqRecord.SeqRecord: record.
    """
    # Return None if ID or sequence fields are empty
    if not db_record[id_field] or not db_record[seq_field]:
        return None
    
    # Create description string
    desc_dict = OrderedDict([('ID', db_record[id_field])])
    if meta_fields is not None:
        desc_dict.update([(f, db_record[f]) for f in meta_fields if f in db_record]) 
    desc_str = flattenAnnotation(desc_dict)
    
    # Create SeqRecord
    seq_record = SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                           id=desc_str, name=desc_str, description='')
        
    return seq_record
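
# Usage sketch: a hedged example of calling buildSeqRecord. The record
# dictionary and field names are hypothetical; flattenAnnotation and the
# Biopython imports are assumed to be in scope as above.
#
#   row = {'SEQUENCE_ID': 'SEQ001', 'SEQUENCE': 'ACGTN', 'V_CALL': 'IGHV1-69'}
#   rec = buildSeqRecord(row, id_field='SEQUENCE_ID', seq_field='SEQUENCE',
#                        meta_fields=['V_CALL'])
#   # With pRESTO's usual ('|', '=', ',') delimiters, rec.id becomes
#   # 'ID=SEQ001|V_CALL=IGHV1-69'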
def getDbSeqRecord(db_record, id_field, seq_field, meta_fields=None, 
                   delimiter=default_delimiter):
    """
    Parses a database record into a SeqRecord

    Arguments: 
    db_record = a dictionary containing a database record
    id_field = the field containing identifiers
    seq_field = the field containing sequences
    meta_fields = a list of fields to add to sequence annotations
    delimiter = a tuple of delimiters for (fields, values, value lists) 

    Returns: 
    a SeqRecord
    """
    # Return None if ID or sequence fields are empty
    if not db_record[id_field] or not db_record[seq_field]:
        return None
    
    # Create description string
    desc_dict = OrderedDict([('ID', db_record[id_field])])
    if meta_fields is not None:
        desc_dict.update([(f, db_record[f]) for f in meta_fields if f in db_record]) 
    desc_str = flattenAnnotation(desc_dict, delimiter=delimiter)
    
    # Create SeqRecord
    seq_record = SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                           id=desc_str, name=desc_str, description='')
        
    return seq_record
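
# The delimiter argument is a three-part tuple of (field separator,
# key/value separator, value-list separator). A hedged call sketch using
# pRESTO's conventional default of ('|', '=', ','); the row is the same
# hypothetical record as above.
#
#   rec = getDbSeqRecord(row, 'SEQUENCE_ID', 'SEQUENCE',
#                        meta_fields=['V_CALL'], delimiter=('|', '=', ','))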
def _header(seq,
            cluster,
            field=cluster_field,
            prefix=cluster_prefix,
            delimiter=out_args['delimiter']):
    label = '%s%i' % (prefix, cluster)
    header = parseAnnotation(seq.description, delimiter=delimiter)
    header = mergeAnnotation(header, {field: label}, delimiter=delimiter)
    seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
    seq.description = ''
    return seq
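
# For readers without pRESTO installed, a self-contained sketch of the
# parse/merge/flatten round-trip that _header relies on. These simplified
# helpers assume the conventional ('|', '=', ',') delimiters and skip the
# collision handling that the real presto.Annotation functions perform.
from collections import OrderedDict

def parseSketch(desc, delimiter=('|', '=', ',')):
    # 'ID=SEQ1|COUNT=4' -> OrderedDict([('ID', 'SEQ1'), ('COUNT', '4')])
    fields = desc.split(delimiter[0])
    return OrderedDict(f.split(delimiter[1], 1) for f in fields if delimiter[1] in f)

def flattenSketch(ann, delimiter=('|', '=', ',')):
    # OrderedDict([('ID', 'SEQ1'), ('COUNT', '4')]) -> 'ID=SEQ1|COUNT=4'
    return delimiter[0].join(delimiter[1].join([k, str(v)]) for k, v in ann.items())

header = parseSketch('ID=SEQ1|COUNT=4')
header['CLUSTER'] = 'C1'        # roughly what mergeAnnotation adds
print(flattenSketch(header))    # 'ID=SEQ1|COUNT=4|CLUSTER=C1'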
def getDbSeqRecord(db_record,
                   id_field,
                   seq_field,
                   meta_fields=None,
                   delimiter=default_delimiter):
    """
    Parses a database record into a SeqRecord

    Arguments: 
    db_record = a dictionary containing a database record
    id_field = the field containing identifiers
    seq_field = the field containing sequences
    meta_fields = a list of fields to add to sequence annotations
    delimiter = a tuple of delimiters for (fields, values, value lists) 

    Returns: 
    a SeqRecord
    """
    # Return None if ID or sequence fields are empty
    if not db_record[id_field] or not db_record[seq_field]:
        return None

    # Create description string
    desc_dict = OrderedDict([('ID', db_record[id_field])])
    if meta_fields is not None:
        desc_dict.update([(f, db_record[f]) for f in meta_fields
                          if f in db_record])
    desc_str = flattenAnnotation(desc_dict, delimiter=delimiter)

    # Create SeqRecord
    seq_record = SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                           id=desc_str,
                           name=desc_str,
                           description='')

    return seq_record
def processBCQueue(alive, data_queue, result_queue, cons_func, cons_args={},
                   min_count=default_min_count, primer_field=None, primer_freq=None,
                   max_gap=None, max_error=None, max_diversity=None,
                   copy_fields=None, copy_actions=None, delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing 
            continues; when False function returns
    data_queue = a multiprocessing.Queue holding data to process
    result_queue = a multiprocessing.Queue to hold processed results
    cons_func = the function to use for consensus generation 
    cons_args = a dictionary of optional arguments for the consensus function
    min_count = threshold number of sequences to define a consensus 
    primer_field = the annotation field containing primer names;
                   if None do not annotate with primer names
    primer_freq = the minimum primer frequency that must be met to build a consensus;
                  if None do not filter by primer frequency
    max_gap = the maximum frequency of (., -) characters allowed before
              deleting a position; if None do not delete positions
    max_error = the maximum error rate allowed to retain a set;
                if None do not calculate error rate
    max_diversity = the maximum average pairwise error rate allowed to retain a read group;
                    if None do not calculate diversity
    copy_fields = a list of annotations to copy into consensus sequence annotations;
                  if None no additional annotations will be copied
    copy_actions = the list of actions to take for each copy_fields;
                   one of ['set', 'majority', 'min', 'max', 'sum']
    delimiter = a tuple of delimiters for (annotations, field/values, value lists) 

    Returns: 
    None
    """
    try:
        # Iterate over the data queue until the sentinel object is reached
        while alive.value:
            # Get data from queue
            if data_queue.empty():  continue
            else:  data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:  break
            
            # Define result dictionary for iteration
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)
    
            # Define primer annotations and consensus primer if applicable
            if primer_field is None:
                primer_ann = None
                seq_list = data.data
            else:
                # Calculate consensus primer            
                primer_ann = OrderedDict()
                prcons = annotationConsensus(data.data, primer_field, delimiter=delimiter)
                result.log['PRIMER'] = ','.join(prcons['set'])
                result.log['PRCOUNT'] = ','.join([str(c) for c in prcons['count']])
                result.log['PRCONS'] = prcons['cons']
                result.log['PRFREQ'] = prcons['freq']
                if primer_freq is None:
                    # Retain full sequence set if not in primer consensus mode
                    seq_list = data.data
                    primer_ann = mergeAnnotation(primer_ann, {'PRIMER':prcons['set']}, 
                                                 delimiter=delimiter)
                    primer_ann = mergeAnnotation(primer_ann, {'PRCOUNT':prcons['count']}, 
                                                 delimiter=delimiter)
                elif prcons['freq'] >= primer_freq:
                    # Define consensus subset
                    seq_list = subsetSeqSet(data.data, primer_field, prcons['cons'], 
                                            delimiter=delimiter)
                    primer_ann = mergeAnnotation(primer_ann, {'PRCONS':prcons['cons']}, 
                                                 delimiter=delimiter)
                    primer_ann = mergeAnnotation(primer_ann, {'PRFREQ':prcons['freq']}, 
                                                 delimiter=delimiter)
                else:
                    # If set fails primer consensus, feed result queue and continue
                    result_queue.put(result)
                    continue
    
            # Check count threshold
            cons_count = len(seq_list)
            result.log['CONSCOUNT'] = cons_count
            if cons_count < min_count:
                # If set fails count threshold, feed result queue and continue
                result_queue.put(result)
                continue

            # Update log with input sequences
            for i, s in enumerate(seq_list):
                result.log['INSEQ%i' % (i + 1)] = str(s.seq)

            # If primer and count filters pass, generate consensus sequence
            consensus = cons_func(seq_list, **cons_args)

            # Delete positions with gap frequency over max_gap and update log with consensus
            if max_gap is not None:
                gap_positions = set(findGapPositions(seq_list, max_gap))
                result.log['CONSENSUS'] = ''.join([' ' if i in gap_positions else x \
                                                   for i, x in enumerate(consensus.seq)])
                if 'phred_quality' in consensus.letter_annotations:
                    result.log['QUALITY'] = ''.join([' ' if i in gap_positions else chr(q + 33) \
                                                     for i, q in enumerate(consensus.letter_annotations['phred_quality'])])
                consensus = deleteSeqPositions(consensus, gap_positions)
            else:
                gap_positions = None
                result.log['CONSENSUS'] = str(consensus.seq)
                if 'phred_quality' in consensus.letter_annotations:
                    result.log['QUALITY'] = ''.join([chr(q + 33) for q in consensus.letter_annotations['phred_quality']])

            # Calculate nucleotide diversity
            if max_diversity is not None:
                diversity = calculateDiversity(seq_list)
                result.log['DIVERSITY'] = diversity

                # If diversity exceeds threshold, feed result queue and continue
                if diversity > max_diversity:
                    result_queue.put(result)
                    continue

            # Calculate set error against consensus
            if max_error is not None:
                # Delete positions if required and calculate error
                if gap_positions is not None:
                    seq_check = [deleteSeqPositions(s, gap_positions) for s in seq_list]
                else:
                    seq_check = seq_list
                error = calculateSetError(seq_check, consensus)
                result.log['ERROR'] = error

                # If error exceeds threshold, feed result queue and continue
                if error > max_error:
                    result_queue.put(result)
                    continue

            # TODO:  should move this into an improved annotationConsensus function with an action argument
            # Parse copy_field annotations and define consensus annotations
            if copy_fields is not None and copy_actions is not None:
                copy_ann = OrderedDict()
                for f, act in zip(copy_fields, copy_actions):
                    # Numeric operations
                    if act == 'min':
                        vals = getAnnotationValues(seq_list, f, delimiter=delimiter)
                        copy_ann[f] = '%.12g' % min([float(x or 0) for x in vals])
                    elif act == 'max':
                        vals = getAnnotationValues(seq_list, f, delimiter=delimiter)
                        copy_ann[f] = '%.12g' % max([float(x or 0) for x in vals])
                    elif act == 'sum':
                        vals = getAnnotationValues(seq_list, f, delimiter=delimiter)
                        copy_ann[f] = '%.12g' % sum([float(x or 0) for x in vals])
                    elif act == 'set':
                        vals = annotationConsensus(seq_list, f, delimiter=delimiter)
                        copy_ann[f] = vals['set']
                        copy_ann['%s_COUNT' % f] = vals['count']
                    elif act == 'majority':
                        vals = annotationConsensus(seq_list, f, delimiter=delimiter)
                        copy_ann[f] = vals['cons']
                        copy_ann['%s_FREQ' % f] = vals['freq']
            else:
                copy_ann = None

            # Define annotation for output sequence
            cons_ann = OrderedDict([('ID', data.id),
                                    ('CONSCOUNT', cons_count)])

            # Merge additional consensus annotations into output sequence annotations
            if primer_ann is not None:
                cons_ann = mergeAnnotation(cons_ann, primer_ann, delimiter=delimiter)
            if copy_ann is not None:
                cons_ann = mergeAnnotation(cons_ann, copy_ann, delimiter=delimiter)

            # Add output sequence annotations to consensus sequence
            consensus.id = consensus.name = flattenAnnotation(cons_ann, delimiter=delimiter)
            consensus.description = ''
            result.results = consensus
            result.valid = True

            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        sys.stderr.write('Error processing sequence set with ID: %s\n' % data.id)
        raise
    
    return None
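
# processBCQueue is written to run inside a worker pool; the orchestration in
# pRESTO proper lives in its process manager. Below is a simplified, hedged
# sketch of the wiring (runWorkers and work_items are hypothetical names). It
# relies on the worker putting exactly one result per input item, as
# processBCQueue does.
import multiprocessing as mp

def runWorkers(worker, work_items, nproc=2, **worker_args):
    alive = mp.Value('b', True)              # shared kill switch polled via alive.value
    data_queue, result_queue = mp.Queue(), mp.Queue()
    procs = [mp.Process(target=worker, args=(alive, data_queue, result_queue),
                        kwargs=worker_args)
             for _ in range(nproc)]
    for p in procs:  p.start()
    for item in work_items:  data_queue.put(item)
    for _ in procs:  data_queue.put(None)    # one sentinel per worker
    # Drain results before joining so full pipes cannot block worker exit
    results = [result_queue.get() for _ in work_items]
    for p in procs:  p.join()
    return results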
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None,
            coord_type=default_coord_type,
            out_args=default_out_args):
    """
    Pairs the mate records in two sequence files and copies annotations between them

    Arguments: 
    seq_file_1 = the file containing the grouped sequences and annotations
    seq_file_2 = the file to assign annotations to from seq_file_1
    fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records;
               if None do not copy any annotations
    fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records;
               if None do not copy any annotations
    coord_type = the sequence header format
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2)
    """
    # Define private functions
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else: 
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else: 
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], 
                                    out_name=out_name_1, out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], 
                                    out_name=out_name_2, out_type=out_type_2)

    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_1, out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
                                        out_name=out_name_2, out_type=out_type_2)
        pass_keys = list()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id, coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
                                                delimiter=out_args['delimiter'])
                    seq_2.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False,
                                                delimiter=out_args['delimiter'])
                    seq_1.id = flattenAnnotation(merge_ann,
                                                 delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and update the paired key list for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None:  pass_keys.append(coord_2)
            else:  SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        pass_keys = set(pass_keys)
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:  SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)
   
    # Close file handles
    pass_handle_1.close()
    pass_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
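
# Usage sketch (hedged): the file names are hypothetical and out_args is the
# dictionary normally produced by pRESTO's parseCommonArgs.
#
#   # Copy the BARCODE annotation from read 1 headers onto the paired read 2 records
#   pairSeq('reads_R1.fastq', 'reads_R2.fastq',
#           fields_1=['BARCODE'], fields_2=None,
#           coord_type='illumina', out_args=default_out_args)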
def processCSQueue(alive, data_queue, result_queue, cluster_field,
                  cluster_args={}, delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
    alive = a multiprocessing.Value boolean controlling whether processing
            continues; when False function returns
    data_queue = a multiprocessing.Queue holding data to process
    result_queue = a multiprocessing.Queue to hold processed results
    cluster_field = the name of the annotation field to add with the cluster identifier
    cluster_args = a dictionary of optional arguments for the clustering function
    delimiter = a tuple of delimiters for (annotations, field/values, value lists)

    Returns:
    None
    """
    try:
        # Iterate over the data queue until the sentinel object is reached
        while alive.value:
            # Get data from queue
            if data_queue.empty():  continue
            else:  data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:  break

            # Define result object
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)

            # Perform clustering
            cluster_dict = runUClust(data.data, **cluster_args)

            # Process failed result
            if cluster_dict is None:
                # Update log
                result.log['CLUSTERS'] = 0
                for i, seq in enumerate(data.data, start=1):
                    result.log['CLUST0-%i' % i] = str(seq.seq)

                # Feed results queue and continue
                result_queue.put(result)
                continue

            # Get number of clusters
            result.log['CLUSTERS'] = len(cluster_dict)

            # Update sequence annotations with cluster assignments
            results = list()
            seq_dict = {s.id: s for s in data.data}
            for clust, id_list in cluster_dict.items():
                for i, seq_id in enumerate(id_list, start=1):
                    # Add cluster annotation
                    seq = seq_dict[seq_id]
                    header = parseAnnotation(seq.description, delimiter=delimiter)
                    header = mergeAnnotation(header, {cluster_field:clust}, delimiter=delimiter)
                    seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
                    seq.description = ''

                    # Update log and results
                    result.log['CLUST%i-%i' % (clust, i)] = str(seq.seq)
                    results.append(seq)

            # Check results
            result.results = results
            result.valid = (len(results) == len(seq_dict))

            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())

            return None
    except:
        alive.value = False
        sys.stderr.write('Error processing sequence set with ID: %s.\n' % data.id)

        raise

    return None
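
# The cluster_dict consumed above maps a cluster number to its member sequence
# IDs. A toy illustration of the annotation each member receives (the IDs and
# cluster numbers are made up):
cluster_dict_example = {1: ['SEQ1', 'SEQ2'], 2: ['SEQ3']}
for clust, id_list in cluster_dict_example.items():
    for seq_id in id_list:
        # Each member is reheadered with an annotation like CLUSTER=<n>
        print('%s -> CLUSTER=%i' % (seq_id, clust))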
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments: 
    seq_file = the sequence file name
    modify_func = the function defining the modification operation
    modify_args = a dictionary of arguments to pass to modify_func
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    the output file name
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):  
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)
    
    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type
    out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type=out_args['out_type'])

    # Count records
    result_count = countSeqFile(seq_file)
    
    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time)
        
        # Update counts
        seq_count += 1
        
        # Modify header
        header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)
        
        # Write new sequence
        seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)    
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'               
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
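
# Usage sketch (hedged): the annotation names are hypothetical, and the
# modify_args keys assume the signature renameHeader is expected to take.
#
#   # Rename the BARCODE annotation to UMI in every header
#   modifyHeaders('reads.fastq', renameHeader,
#                 modify_args={'fields': ['BARCODE'], 'names': ['UMI']},
#                 out_args=default_out_args)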
def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments: 
      seq_file : the sequence file name.
      modify_func : the function defining the modification operation.
      modify_args : a dictionary of arguments to pass to modify_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      str: output file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):  
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)
    
    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'reheader',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
    # Count records
    result_count = countSeqFile(seq_file)

    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        
        # Update counts
        seq_count += 1
        
        # Modify header
        header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)
        
        # Write new sequence
        seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])
        
    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'               
    printLog(log)

    # Close file handles
    out_handle.close()
 
    return out_handle.name
def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
      seq_file : the sequence file name.
      convert_func : the function used to convert sequence headers.
      convert_args : a dictionary of arguments to pass to convert_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the output sequence file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader: 'generic',
                convert454Header: '454',
                convertGenbankHeader: 'genbank',
                convertIlluminaHeader: 'illumina',
                convertIMGTHeader: 'imgt',
                convertMIGECHeader: 'migec',
                convertSRAHeader: 'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Wrapper for opening handles and writers
    def _open(x, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(seq_file,
                                     'convert-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle

    # Count records
    result_count = countSeqFile(seq_file)

    # Set additional conversion arguments
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter': out_args['delimiter']})

    # Intialize file handles
    pass_handle, fail_handle = None, None

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            try:
                SeqIO.write(seq, pass_handle, out_args['out_type'])
            except AttributeError:
                # Open output file
                pass_handle = _open('pass')
                SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if out_args['failed']:
                # Write unconverted sequences
                try:
                    SeqIO.write(seq, fail_handle, out_args['out_type'])
                except AttributeError:
                    # Open output file
                    fail_handle = _open('fail')
                    SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles
    if pass_handle is not None:  pass_handle.close()
    if fail_handle is not None:  fail_handle.close()

    return pass_handle.name if pass_handle is not None else None
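
# The try/except AttributeError above is a lazy-open idiom: pass_handle and
# fail_handle start as None, SeqIO.write raises AttributeError on the first
# record, and only then is the file created, so empty outputs never appear on
# disk. A generic, self-contained sketch of the same pattern:
def lazyWrite(line, handle, opener):
    # Open the output on first use so no file is created for empty results
    if handle is None:
        handle = opener()
    handle.write(line)
    return handle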
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None, 
                inner=False, keep_missing=False, out_file=None, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments: 
      seq_file : filename of the sequence file to sample from.
      max_missing : number of ambiguous characters to allow in a unique sequence.
      uniq_fields : a list of annotations that define a sequence as unique if they differ.
      copy_fields : a list of annotations to copy into unique sequence annotations.
      copy_actions : the list of collapseAnnotation actions to take on copy_fields.
      max_field : a numeric field whose maximum value determines the retained sequence.
      min_field : a numeric field whose minimum value determines the retained sequence.
      inner : if True exclude consecutive outer ambiguous characters from iterations and matching.
      keep_missing : if True retain sequences with more ambiguous characters than max_missing as unique.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
              
    Returns: 
      str: the collapsed output file name.
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)
    
    # Read input file
    in_type = getFileType(seq_file)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Open unique record output handle
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file,
                                      'collapse-unique',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Added list typing for compatibility issue with Python 2.7.5 on OS X
    # TypeError: object of type 'dictionary-keyiterator' has no len()
    search_keys = list(seq_dict.keys())
    dup_keys = []
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, 
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner, 
                                                         out_args['delimiter'])

        # Update list of duplicates
        dup_keys.extend(dup_list)
                
        # Break if no keys to search remain
        if len(search_keys) == 0:  break

    # Write unique sequences
    for val in uniq_dict.values():
        # Define output sequence
        out_seq = val.seq
        out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
        out_app = OrderedDict()
        if copy_fields is not None and copy_actions is not None:
            for f, a in zip(copy_fields, copy_actions):
                x = collapseAnnotation(val.annotations, a, f, delimiter=out_args['delimiter'])
                out_app[f] = x[f]
                out_ann.pop(f, None)
        out_app['DUPCOUNT'] = val.count
        out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
        out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
        out_seq.description = ''
        # Write unique sequence
        SeqIO.write(out_seq, pass_handle, out_args['out_type'])

        # Update log
        log = OrderedDict()
        log['HEADER'] = out_seq.id
        log['DUPCOUNT'] = val.count
        for i, k in enumerate(val.keys, start=1):
            log['ID%i' % i] = k
        for i, k in enumerate(val.keys, start=1):
            log['SEQ%i' % i] = str(seq_dict[k].seq)
        printLog(log, handle=log_handle)

    # Write sequence with high missing character counts
    if keep_missing:
        for k in search_keys:
            out_seq = seq_dict[k]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            SeqIO.write(out_seq, pass_handle, out_args['out_type'])

    # Write sequence with high missing character counts
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    if out_args['failed']:
        # Write duplicate sequences 
        with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], 
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)
        
    # Close file handles
    pass_file = pass_handle.name
    if pass_handle is not None:  pass_handle.close()
    if log_handle is not None:  log_handle.close()
    
    return pass_file
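
# Usage sketch (hedged): the annotation names are hypothetical.
#
#   # Collapse exact duplicates, keeping records distinct when CREGION differs,
#   # and sum the DUPCOUNT annotations into each retained record
#   collapseSeq('reads.fastq', max_missing=0, uniq_fields=['CREGION'],
#               copy_fields=['DUPCOUNT'], copy_actions=['sum'],
#               out_args=default_out_args)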
def convertHeaders(seq_file, convert_func, convert_args={}, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
    seq_file = the sequence file name
    convert_func = the function used to convert sequence headers
    convert_args = a dictionary of arguments to pass to convert_func
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output sequence file name
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader:'generic',
                convert454Header:'454',
                convertGenbankHeader:'genbank',
                convertIlluminaHeader:'illumina',
                convertIMGTHeader:'imgt',
                convertSRAHeader:'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count records
    result_count = countSeqFile(seq_file)

    # Open output file handles
    pass_handle = getOutputHandle(seq_file,
                                  'convert-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    if out_args['failed']:
        fail_handle = getOutputHandle(seq_file,
                                      'convert-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    else:
        fail_handle = None

    # Set additional conversion arguments
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args.update({'delimiter':out_args['delimiter']})

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration and update count
        printProgress(seq_count, result_count, 0.05, start_time)
        seq_count += 1

        # Convert header
        header = convert_func(seq.description, **convert_args)

        if header is not None:
            # Write successfully converted sequences
            pass_count += 1
            seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
            seq.description = ''
            SeqIO.write(seq, pass_handle, out_args['out_type'])
        else:
            fail_count += 1
            if fail_handle is not None:
                # Write unconverted sequences
                SeqIO.write(seq, fail_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertHeaders'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:  fail_handle.close()

    return pass_handle.name
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None, 
                inner=False, keep_missing=False, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments: 
    seq_file = filename of the sequence file to sample from
    max_missing = number of ambiguous characters to allow in a unique sequence
    uniq_fields = a list of annotations that define a sequence as unique if they differ
    copy_fields = a list of annotations to copy into unique sequence annotations
    copy_actions = the list of collapseAnnotation actions to take on copy_fields 
    max_field = a numeric field whose maximum value determines the retained sequence
    min_field = a numeric field whose minimum value determines the retained sequence
    inner = if True exclude consecutive outer ambiguous characters from iterations and matching
    keep_missing = if True retain sequences with more ambiguous characters than max_missing as unique
    out_args = common output argument dictionary from parseCommonArgs
              
    Returns: 
    the collapsed output file name
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)
    
    # TODO:  storing all sequences in memory is faster
    # Read input file
    in_type = getFileType(seq_file)
    #seq_dict = readSeqFile(seq_file, index=True)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    if out_args['out_type'] is None:  out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Added list typing for compatibility issue with Python 2.7.5 on OS X
    # TypeError: object of type 'dictionary-keyiterator' has no len()
    search_keys = list(seq_dict.keys())
    dup_keys = []
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n, 
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner, 
                                                         out_args['delimiter'])
        # Update list of duplicates
        dup_keys.extend(dup_list)

        # Update log
        log = OrderedDict()
        log['ITERATION'] = n + 1
        log['MISSING'] = n 
        log['UNIQUE'] = len(uniq_dict) 
        log['DUPLICATE'] = len(dup_keys) 
        log['UNDETERMINED'] = len(search_keys)
        printLog(log, handle=log_handle)
                
        # Break if no keys to search remain
        if len(search_keys) == 0:  break
    
    # Write unique sequences
    with getOutputHandle(seq_file, 'collapse-unique', out_dir=out_args['out_dir'], 
                         out_name=out_args['out_name'], out_type=out_args['out_type']) \
            as uniq_handle:
        for val in uniq_dict.values():
            # Define output sequence
            out_seq = val[0]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_app = OrderedDict()
            if copy_fields is not None and copy_actions is not None:
                for f, a, s in zip(copy_fields, copy_actions, val[3:]):
                    out_app[f] = s
                    out_app = collapseAnnotation(out_app, a, f, delimiter=out_args['delimiter'])
                    out_ann.pop(f, None)
            out_app['DUPCOUNT'] = val[1]
            out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            # Write unique sequence
            SeqIO.write(out_seq, uniq_handle, out_args['out_type'])
    
        # Write sequence with high missing character counts
        if keep_missing:
            for k in search_keys:
                out_seq = seq_dict[k]
                out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
                out_ann = mergeAnnotation(out_ann, {'DUPCOUNT':1}, delimiter=out_args['delimiter'])
                out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
                out_seq.description = ''
                SeqIO.write(out_seq, uniq_handle, out_args['out_type'])

    # Write sequence with high missing character counts
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    if out_args['failed']:
        # Write duplicate sequences 
        with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'], 
                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
                as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(uniq_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)
        
    # Close file handles
    if log_handle is not None:  log_handle.close()
    
    return uniq_handle.name
def processAssembly(data, assemble_func, assemble_args={}, rc=None,
                   fields_1=None, fields_2=None, delimiter=default_delimiter):
    """
    Performs assembly of a sequence pair

    Arguments:
    data = a SeqData object with a list of exactly two SeqRecords
    assemble_func = the function to use to assemble paired ends
    assemble_args = a dictionary of arguments to pass to the assembly function
    rc = defines which sequences ('head', 'tail', 'both') to reverse complement
         before assembly; if None do not reverse complement sequences
    fields_1 = list of annotations in head SeqRecord to copy to assembled record;
               if None do not copy an annotation
    fields_2 = list of annotations in tail SeqRecord to copy to assembled record;
               if None do not copy an annotation
    delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
    a SeqResult object
    """
    # Reverse complement sequences if required
    head_seq = data.data[0] if rc not in ('head', 'both') \
               else reverseComplement(data.data[0])
    tail_seq = data.data[1] if rc not in ('tail', 'both') \
               else reverseComplement(data.data[1])

    # Define result object
    result = SeqResult(data.id, [head_seq, tail_seq])

    # Define stitched sequence annotation
    stitch_ann = OrderedDict([('ID', data.id)])
    if fields_1 is not None:
        head_ann = parseAnnotation(head_seq.description, fields_1,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, head_ann, delimiter=delimiter)
        result.log['FIELDS1'] = '|'.join(['%s=%s' % (k, v)
                                             for k, v in head_ann.items()])
    if fields_2 is not None:
        tail_ann = parseAnnotation(tail_seq.description, fields_2,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, tail_ann, delimiter=delimiter)
        result.log['FIELDS2'] = '|'.join(['%s=%s' % (k, v)
                                             for k, v in tail_ann.items()])

    # Assemble sequences
    stitch = assemble_func(head_seq, tail_seq, **assemble_args)
    ab = stitch.head_pos
    xy = stitch.tail_pos
    result.valid = stitch.valid

    # Add reference to log
    if stitch.ref_seq is not None and stitch.ref_pos is not None:
        result.log['REFID'] = stitch.ref_seq.id
        result.log['REFSEQ'] = ' ' * stitch.ref_pos[0] + stitch.ref_seq.seq

    if ab is not None and xy is not None:
        result.log['SEQ1'] = ' ' * xy[0] + head_seq.seq
        result.log['SEQ2'] = ' ' * ab[0] + tail_seq.seq
    else:
        result.log['SEQ1'] = head_seq.seq
        result.log['SEQ2'] = ' ' * (len(head_seq) + (stitch.gap or 0)) + tail_seq.seq

    # Define stitching log
    if stitch.seq is not None:
        # Update stitch annotation
        stitch.seq.id = flattenAnnotation(stitch_ann, delimiter=delimiter)
        stitch.seq.name = stitch.seq.id
        stitch.seq.description = ''
        result.results = stitch.seq
        # Add assembly to log
        result.log['ASSEMBLY'] = stitch.seq.seq
        if 'phred_quality' in stitch.seq.letter_annotations:
            result.log['QUALITY'] = ''.join([chr(q+33) for q in
                                             stitch.seq.letter_annotations['phred_quality']])
        result.log['LENGTH'] = len(stitch)
        result.log['OVERLAP'] = stitch.overlap
    else:
        result.log['ASSEMBLY'] = None

    # Add mode specific log results
    if stitch.gap is not None:
        result.log['GAP'] = stitch.gap
    if stitch.error is not None:
        result.log['ERROR'] = '%.4f' % stitch.error
    if stitch.pvalue is not None:
        result.log['PVALUE'] = '%.4e' % stitch.pvalue
    if stitch.evalue is not None:
        result.log['EVALUE1'] = '%.4e' % stitch.evalue[0]
        result.log['EVALUE2'] = '%.4e' % stitch.evalue[1]
    if stitch.ident is not None:
        result.log['IDENTITY'] = '%.4f' % stitch.ident


    return result
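
# The rc argument follows the usual paired-end convention of reverse
# complementing the tail read before stitching. pRESTO wraps this in its own
# reverseComplement helper; with plain Biopython the equivalent step is
# roughly the self-contained sketch below.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

tail = SeqRecord(Seq('AACGT'), id='read/2')
# id=True and description=True keep the original header on the new record
tail_rc = tail.reverse_complement(id=True, description=True)
print(tail_rc.seq)    # AACGT -> ACGTT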
def assemblyWorker(data,
                   assemble_func,
                   assemble_args={},
                   rc='tail',
                   fields_1=None,
                   fields_2=None,
                   delimiter=default_delimiter):
    """
    Performs assembly of a sequence pair

    Arguments:
      data : a SeqData object with a list of exactly two SeqRecords.
      assemble_func : the function to use to assemble paired ends.
      assemble_args : a dictionary of arguments to pass to the assembly function.
      rc : Defines which sequences ('head', 'tail', 'both', 'none') to reverse complement
           before assembly; if None do not reverse complement sequences.
      fields_1 : list of annotations in head SeqRecord to copy to assembled record;
                 if None do not copy an annotation.
      fields_2 : list of annotations in tail SeqRecord to copy to assembled record;
                 if None do not copy an annotation.
      delimiter : a tuple of delimiters for (fields, values, value lists).

    Returns:
      SeqResult: a SeqResult object
    """
    # Define result object
    result = SeqResult(data.id, data.data)

    # Reverse complement sequences if required
    head_seq = data.data[0] if rc not in ('head', 'both') \
               else reverseComplement(data.data[0])
    tail_seq = data.data[1] if rc not in ('tail', 'both') \
               else reverseComplement(data.data[1])

    # Define stitched sequence annotation
    stitch_ann = OrderedDict([('ID', data.id)])
    if fields_1 is not None:
        head_ann = parseAnnotation(head_seq.description,
                                   fields_1,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, head_ann, delimiter=delimiter)
        result.log['FIELDS1'] = '|'.join(
            ['%s=%s' % (k, v) for k, v in head_ann.items()])
    if fields_2 is not None:
        tail_ann = parseAnnotation(tail_seq.description,
                                   fields_2,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, tail_ann, delimiter=delimiter)
        result.log['FIELDS2'] = '|'.join(
            ['%s=%s' % (k, v) for k, v in tail_ann.items()])

    # Assemble sequences
    stitch = assemble_func(head_seq, tail_seq, **assemble_args)
    ab = stitch.head_pos
    xy = stitch.tail_pos
    result.valid = stitch.valid

    # Add reference to log
    if stitch.ref_seq is not None and stitch.ref_pos is not None:
        result.log['REFID'] = stitch.ref_seq.id
        result.log['REFSEQ'] = ' ' * stitch.ref_pos[0] + stitch.ref_seq.seq

    if ab is not None and xy is not None:
        result.log['SEQ1'] = ' ' * xy[0] + head_seq.seq
        result.log['SEQ2'] = ' ' * ab[0] + tail_seq.seq
    else:
        result.log['SEQ1'] = head_seq.seq
        result.log['SEQ2'] = ' ' * (len(head_seq) +
                                    (stitch.gap or 0)) + tail_seq.seq

    # Define stitching log
    if stitch.seq is not None:
        # Update stitch annotation
        stitch.seq.id = flattenAnnotation(stitch_ann, delimiter=delimiter)
        stitch.seq.name = stitch.seq.id
        stitch.seq.description = ''
        result.results = stitch.seq
        # Add assembly to log
        result.log['ASSEMBLY'] = stitch.seq.seq
        if 'phred_quality' in stitch.seq.letter_annotations:
            result.log['QUALITY'] = ''.join([
                chr(q + 33)
                for q in stitch.seq.letter_annotations['phred_quality']
            ])
        result.log['LENGTH'] = len(stitch)
        result.log['OVERLAP'] = stitch.overlap
    else:
        result.log['ASSEMBLY'] = None

    # Add mode specific log results
    if stitch.gap is not None:
        result.log['GAP'] = stitch.gap
    if stitch.error is not None:
        result.log['ERROR'] = '%.4f' % stitch.error
    if stitch.pvalue is not None:
        result.log['PVALUE'] = '%.4e' % stitch.pvalue
    if stitch.evalue is not None:
        result.log['EVALUE1'] = '%.4e' % stitch.evalue[0]
        result.log['EVALUE2'] = '%.4e' % stitch.evalue[1]
    if stitch.ident is not None:
        result.log['IDENTITY'] = '%.4f' % stitch.ident

    return result
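# A minimal sketch (not part of the original listing) of the annotation
# round-trip the wrapper above relies on: parse a read header, merge the
# selected fields into the stitch annotation, and flatten it back into an
# ID, using the parseAnnotation/mergeAnnotation/flattenAnnotation helpers
# shown elsewhere in this listing. The delimiter tuple and the header
# string are assumptions, not values taken from the original source.
from collections import OrderedDict

def _demo_stitch_id():
    delimiter = ('|', '=', ',')  # assumed (fields, values, value lists)
    head_desc = 'READ1|BARCODE=ACGT|PRIMER=VH1'  # hypothetical header
    stitch_ann = OrderedDict([('ID', 'READ1')])
    # Keep only the BARCODE field from the head read, as fields_1 would
    head_ann = parseAnnotation(head_desc, ['BARCODE'], delimiter=delimiter)
    stitch_ann = mergeAnnotation(stitch_ann, head_ann, delimiter=delimiter)
    # Expected: 'READ1|BARCODE=ACGT'
    return flattenAnnotation(stitch_ann, delimiter=delimiter)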
def getMaskedSeq(align, mode='mask', barcode=False, delimiter=default_delimiter):
    """
    Creates an output sequence with primers masked or cut

    Arguments:
      align : a PrimerAlignment object returned from alignPrimers or scorePrimers.
      mode : defines the action taken; one of ['cut', 'mask', 'tag', 'trim'].
      barcode : if True add the sequence preceding the primer to the output annotations.
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      SeqRecord: output sequence with the primer region cut, masked, tagged, or trimmed.
    """
    seq = align.seq

    # Build output sequence
    if mode == 'tag' or not align.align_primer:
        # Do not modify sequence
        out_seq = seq
    elif mode == 'trim':
        # Remove region before primer
        if not align.rev_primer:
            out_seq = seq[align.start:]
        else:  
            out_seq = seq[:align.end]
    elif mode == 'cut':
        # Remove primer and preceding region
        if not align.rev_primer:
            out_seq = seq[align.end:]
        else: 
            out_seq = seq[:align.start]
    elif mode == 'mask':
        # Mask primer with Ns and remove preceding region
        if not align.rev_primer:
            mask_len = align.end - align.start + align.gaps
            out_seq = 'N' * mask_len + seq[align.end:]
            if hasattr(seq, 'letter_annotations') and \
                    'phred_quality' in seq.letter_annotations:
                out_seq.letter_annotations['phred_quality'] = \
                    [0] * mask_len + \
                    seq.letter_annotations['phred_quality'][align.end:]
        else:
            mask_len = min(align.end, len(seq)) - align.start + align.gaps
            out_seq = seq[:align.start] + 'N' * mask_len
            if hasattr(seq, 'letter_annotations') and \
                    'phred_quality' in seq.letter_annotations:
                out_seq.letter_annotations['phred_quality'] = \
                    seq.letter_annotations['phred_quality'][:align.start] + \
                    [0] * mask_len
            
    # Add alignment annotations to output SeqRecord
    out_seq.annotations = seq.annotations    
    out_seq.annotations['primer'] = align.primer
    out_seq.annotations['prstart'] = align.start
    out_seq.annotations['error'] = align.error

    # Parse seq annotation and create output annotation
    seq_ann = parseAnnotation(seq.description, delimiter=delimiter)
    out_ann = OrderedDict([('SEQORIENT', seq.annotations['seqorient']),
                           ('PRIMER', align.primer)])
    
    # Add ID sequence to description
    if barcode:
        seq_code = seq[:align.start].seq if not align.rev_primer \
                   else seq[align.end:].seq
        out_seq.annotations['barcode'] = seq_code
        out_ann['BARCODE'] = seq_code
    
    out_ann = mergeAnnotation(seq_ann, out_ann, delimiter=delimiter)
    out_seq.id = flattenAnnotation(out_ann, delimiter=delimiter)
    out_seq.description = ''

    return out_seq
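# A minimal sketch (assumed, not from the original source) of the masking
# arithmetic above for a forward primer hit at [start, end): 'mask'
# replaces the primer with Ns and drops the preceding region, 'cut' drops
# the primer and the preceding region, and 'trim' keeps the primer but
# drops the preceding region. The toy sequence and coordinates are
# hypothetical.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def _demo_mask(seq_str='ACGTTTTTGGGG', start=4, end=8, gaps=0):
    seq = SeqRecord(Seq(seq_str), id='READ1')
    mask_len = end - start + gaps
    masked = 'N' * mask_len + str(seq.seq)[end:]   # mode == 'mask'
    cut = str(seq.seq)[end:]                       # mode == 'cut'
    trimmed = str(seq.seq)[start:]                 # mode == 'trim'
    # Returns ('NNNNGGGG', 'GGGG', 'TTTTGGGG') for the defaults above
    return masked, cut, trimmed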
Example #17
def processQueue(alive,
                 data_queue,
                 result_queue,
                 cluster_func,
                 cluster_args={},
                 cluster_field=default_cluster_field,
                 cluster_prefix=default_cluster_prefix,
                 delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      cluster_func : the function to use for clustering.
      cluster_args : a dictionary of optional arguments for the clustering function.
      cluster_field : string defining the output cluster field name.
      cluster_prefix : string defining a prefix for the cluster identifier.
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      None
    """
    try:
        # Iterate over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            if data_queue.empty(): continue
            else: data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None: break

            # Define result object
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)

            # Perform clustering
            cluster_dict = cluster_func(data.data, **cluster_args)

            # Process failed result
            if cluster_dict is None:
                # Update log
                result.log['CLUSTERS'] = 0
                for i, seq in enumerate(data.data, start=1):
                    result.log['CLUST0-%i' % i] = str(seq.seq)

                # Feed results queue and continue
                result_queue.put(result)
                continue

            # Get number of clusters
            result.log['CLUSTERS'] = len(cluster_dict)

            # Update sequence annotations with cluster assignments
            results = list()
            seq_dict = {s.id: s for s in data.data}
            for cluster, id_list in cluster_dict.items():
                for i, seq_id in enumerate(id_list, start=1):
                    # Add cluster annotation
                    seq = seq_dict[seq_id]
                    label = '%s%i' % (cluster_prefix, cluster)
                    header = parseAnnotation(seq.description,
                                             delimiter=delimiter)
                    header = mergeAnnotation(header, {cluster_field: label},
                                             delimiter=delimiter)
                    seq.id = seq.name = flattenAnnotation(header,
                                                          delimiter=delimiter)
                    seq.description = ''

                    # Update log and results
                    result.log['CLUST%i-%i' % (cluster, i)] = str(seq.seq)
                    results.append(seq)

            # Check results
            result.results = results
            result.valid = (len(results) == len(seq_dict))
            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        printError('Error processing sequence set with ID: %s.' % data.id,
                   exit=False)
        raise

    return None
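# A minimal sketch (assumed, not part of the original listing) of how a
# worker like processQueue above is typically driven: feed data objects
# into data_queue, append one None sentinel per worker, and collect one
# SeqResult per input from result_queue. The SeqData inputs and
# cluster_func are stand-ins for the real pRESTO objects.
import multiprocessing as mp

def _run_workers(data_list, cluster_func, n_workers=2):
    alive = mp.Value('b', True)              # shared "keep going" flag
    data_queue, result_queue = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=processQueue,
                          args=(alive, data_queue, result_queue, cluster_func))
               for _ in range(n_workers)]
    for w in workers:
        w.start()
    for data in data_list:
        data_queue.put(data)
    for _ in workers:
        data_queue.put(None)                 # one sentinel per worker
    # Drain results before joining so workers never block on a full pipe;
    # processQueue puts exactly one result per input on success
    results = [result_queue.get() for _ in data_list]
    for w in workers:
        w.join()
    return results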
Example #18
def processQueue(alive,
                 data_queue,
                 result_queue,
                 cons_func,
                 cons_args={},
                 min_count=default_consensus_min_count,
                 primer_field=None,
                 primer_freq=None,
                 max_gap=None,
                 max_error=None,
                 max_diversity=None,
                 copy_fields=None,
                 copy_actions=None,
                 delimiter=default_delimiter):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      cons_func : the function to use for consensus generation.
      cons_args : a dictionary of optional arguments for the consensus function.
      min_count : threshold number of sequences to define a consensus.
      primer_field : the annotation field containing primer names;
                     if None do not annotate with primer names.
      primer_freq : the minimum primer consensus frequency required to build a consensus;
                    if None do not filter by primer frequency.
      max_gap : the maximum frequency of (., -) characters allowed before
                deleting a position; if None do not delete positions.
      max_error : the maximum error rate allowed to retain a set;
                  if None do not calculate the error rate.
      max_diversity : the maximum average pairwise error rate allowed to retain a read group;
                      if None do not calculate diversity.
      copy_fields : a list of annotations to copy into consensus sequence annotations;
                    if None no additional annotations will be copied.
      copy_actions : the list of actions to take for each copy_fields;
                     one of ['set', 'majority', 'min', 'max', 'sum'].
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns: 
      None
    """
    try:
        # Iterate over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            if data_queue.empty(): continue
            else: data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None: break

            # Define result dictionary for iteration
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)

            # Define primer annotations and consensus primer if applicable
            if primer_field is None:
                primer_ann = None
                seq_list = data.data
            else:
                # Calculate consensus primer
                primer_ann = OrderedDict()
                prcons = annotationConsensus(data.data,
                                             primer_field,
                                             delimiter=delimiter)
                result.log['PRIMER'] = ','.join(prcons['set'])
                result.log['PRCOUNT'] = ','.join(
                    [str(c) for c in prcons['count']])
                result.log['PRCONS'] = prcons['cons']
                result.log['PRFREQ'] = prcons['freq']
                if primer_freq is None:
                    # Retain full sequence set if not in primer consensus mode
                    seq_list = data.data
                    primer_ann = mergeAnnotation(primer_ann,
                                                 {'PRIMER': prcons['set']},
                                                 delimiter=delimiter)
                    primer_ann = mergeAnnotation(primer_ann,
                                                 {'PRCOUNT': prcons['count']},
                                                 delimiter=delimiter)
                elif prcons['freq'] >= primer_freq:
                    # Define consensus subset
                    seq_list = subsetSeqSet(data.data,
                                            primer_field,
                                            prcons['cons'],
                                            delimiter=delimiter)
                    primer_ann = mergeAnnotation(primer_ann,
                                                 {'PRCONS': prcons['cons']},
                                                 delimiter=delimiter)
                    primer_ann = mergeAnnotation(primer_ann,
                                                 {'PRFREQ': prcons['freq']},
                                                 delimiter=delimiter)
                else:
                    # If set fails primer consensus, feed result queue and continue
                    result_queue.put(result)
                    continue

            # Check count threshold
            cons_count = len(seq_list)
            result.log['CONSCOUNT'] = cons_count
            if cons_count < min_count:
                # If set fails count threshold, feed result queue and continue
                result_queue.put(result)
                continue

            # Update log with input sequences
            for i, s in enumerate(seq_list):
                result.log['INSEQ%i' % (i + 1)] = str(s.seq)

            # If primer and count filters pass, generate consensus sequence
            consensus = cons_func(seq_list, **cons_args)

            # Delete positions with gap frequency over max_gap and update log with consensus
            if max_gap is not None:
                gap_positions = set(findGapPositions(seq_list, max_gap))
                result.log['CONSENSUS'] = ''.join([' ' if i in gap_positions else x \
                                                   for i, x in enumerate(consensus.seq)])
                if 'phred_quality' in consensus.letter_annotations:
                    result.log['QUALITY'] = ''.join([' ' if i in gap_positions else chr(q + 33) \
                                                     for i, q in enumerate(consensus.letter_annotations['phred_quality'])])
                consensus = deleteSeqPositions(consensus, gap_positions)
            else:
                gap_positions = None
                result.log['CONSENSUS'] = str(consensus.seq)
                if 'phred_quality' in consensus.letter_annotations:
                    result.log['QUALITY'] = ''.join([
                        chr(q + 33)
                        for q in consensus.letter_annotations['phred_quality']
                    ])

            # Calculate nucleotide diversity
            if max_diversity is not None:
                diversity = calculateDiversity(seq_list)
                result.log['DIVERSITY'] = diversity

                # If diversity exceeds threshold, feed result queue and continue
                if diversity > max_diversity:
                    result_queue.put(result)
                    continue

            # Calculate set error against consensus
            if max_error is not None:
                # Delete positions if required and calculate error
                if gap_positions is not None:
                    seq_check = [
                        deleteSeqPositions(s, gap_positions) for s in seq_list
                    ]
                else:
                    seq_check = seq_list
                error = calculateSetError(seq_check, consensus)
                result.log['ERROR'] = error

                # If error exceeds threshold, feed result queue and continue
                if error > max_error:
                    result_queue.put(result)
                    continue

            # TODO:  should move this into an improved annotationConsensus function with an action argument
            # Parse copy_field annotations and define consensus annotations
            if copy_fields is not None and copy_actions is not None:
                copy_ann = OrderedDict()
                for f, act in zip(copy_fields, copy_actions):
                    # Numeric operations
                    if act == 'min':
                        vals = getAnnotationValues(seq_list,
                                                   f,
                                                   delimiter=delimiter)
                        copy_ann[f] = '%.12g' % min(
                            [float(x or 0) for x in vals])
                    elif act == 'max':
                        vals = getAnnotationValues(seq_list,
                                                   f,
                                                   delimiter=delimiter)
                        copy_ann[f] = '%.12g' % max(
                            [float(x or 0) for x in vals])
                    elif act == 'sum':
                        vals = getAnnotationValues(seq_list,
                                                   f,
                                                   delimiter=delimiter)
                        copy_ann[f] = '%.12g' % sum(
                            [float(x or 0) for x in vals])
                    elif act == 'set':
                        vals = annotationConsensus(seq_list,
                                                   f,
                                                   delimiter=delimiter)
                        copy_ann[f] = vals['set']
                        copy_ann['%s_COUNT' % f] = vals['count']
                    elif act == 'majority':
                        vals = annotationConsensus(seq_list,
                                                   f,
                                                   delimiter=delimiter)
                        copy_ann[f] = vals['cons']
                        copy_ann['%s_FREQ' % f] = vals['freq']
            else:
                copy_ann = None

            # Define annotation for output sequence
            cons_ann = OrderedDict([('ID', data.id),
                                    ('CONSCOUNT', cons_count)])

            # Merge additional consensus annotations into output sequence annotations
            if primer_ann is not None:
                cons_ann = mergeAnnotation(cons_ann,
                                           primer_ann,
                                           delimiter=delimiter)
            if copy_ann is not None:
                cons_ann = mergeAnnotation(cons_ann,
                                           copy_ann,
                                           delimiter=delimiter)

            # Add output sequence annotations to consensus sequence
            consensus.id = consensus.name = flattenAnnotation(
                cons_ann, delimiter=delimiter)
            consensus.description = ''
            result.results = consensus
            result.valid = True

            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        alive.value = False
        printError('Error processing sequence set with ID: %s.' % data.id,
                   exit=False)
        raise

    return None
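# A minimal sketch (assumed) of the copy_fields/copy_actions collapse used
# above: the numeric actions ('min', 'max', 'sum') coerce annotation
# values to float, treat missing values as 0, and format the result with
# '%.12g'; 'set' and 'majority' instead go through annotationConsensus.
# The helper name and inputs are illustrative, not from the source.
def _collapse_numeric(values, action):
    nums = [float(x or 0) for x in values]   # None/'' count as 0
    if action == 'min':
        return '%.12g' % min(nums)
    if action == 'max':
        return '%.12g' % max(nums)
    if action == 'sum':
        return '%.12g' % sum(nums)
    raise ValueError('unknown action: %s' % action)

# Example: _collapse_numeric(['2', '3.5', None], 'sum') -> '5.5'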
Example #19
def pairSeq(seq_file_1,
            seq_file_2,
            fields_1=None,
            fields_2=None,
            action=None,
            coord_type=default_coord,
            out_args=default_out_args):
    """
    Synchronizes paired-end files and copies annotations between them

    Arguments: 
      seq_file_1 : the file containing the grouped sequences and annotations.
      seq_file_2 : the file to assign annotations to from seq_file_1.
      fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records;
                 if None do not copy any annotations.
      fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records;
                 if None do not copy any annotations.
      action : the collapse action to take on copied annotations when they already exist
               in the target header.
      coord_type : the sequence header format.
      out_args : common output argument dictionary from parseCommonArgs.
                    
    Returns: 
      list: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2).
    """

    # Define private functions
    def _key_func(x):
        return getCoordKey(x,
                           coord_type=coord_type,
                           delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Open output file handles
    pass_handle_1 = getOutputHandle(seq_file_1,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_1,
                                    out_type=out_type_1)
    pass_handle_2 = getOutputHandle(seq_file_2,
                                    'pair-pass',
                                    out_args['out_dir'],
                                    out_name=out_name_2,
                                    out_type=out_type_2)

    if out_args['failed']:
        fail_handle_1 = getOutputHandle(seq_file_1,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_1,
                                        out_type=out_type_1)
        fail_handle_2 = getOutputHandle(seq_file_2,
                                        'pair-fail',
                                        out_dir=out_args['out_dir'],
                                        out_name=out_name_2,
                                        out_type=out_type_2)
        pass_keys = list()

    # Iterate over pairs and write to output files
    start_time = time()
    rec_count = pair_count = 0
    for seq_2 in seq_iter_2:
        # Print progress for previous iteration
        printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)
        rec_count += 1

        # Check for file 2 mate pair in file 1
        coord_2 = getCoordKey(seq_2.id,
                              coord_type=coord_type,
                              delimiter=out_args['delimiter'])
        seq_1 = seq_dict_1.get(coord_2, None)

        if seq_1 is not None:
            # Record paired keys
            pair_count += 1

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description,
                                        delimiter=out_args['delimiter'])
                ann_2 = parseAnnotation(seq_2.description,
                                        delimiter=out_args['delimiter'])

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() \
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(
                        ann_2,
                        copy_ann,
                        prepend=True,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_1,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_2.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() \
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(
                        ann_1,
                        copy_ann,
                        prepend=False,
                        delimiter=out_args['delimiter'])
                    # Collapse if necessary
                    if action is not None:
                        merge_ann = collapseAnnotation(
                            merge_ann,
                            action,
                            fields=fields_2,
                            delimiter=out_args['delimiter'])
                    # Flatten
                    seq_1.id = flattenAnnotation(
                        merge_ann, delimiter=out_args['delimiter'])
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Write unpaired file 2 records and update the paired key list for finding unpaired file 1 records
        if out_args['failed']:
            if seq_1 is not None: pass_keys.append(coord_2)
            else: SeqIO.write(seq_2, fail_handle_2, out_type_2)

    # Print final progress
    printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)

    # Find and write unpaired file 1 records
    if out_args['failed']:
        start_time = time()
        printMessage("Finding unpaired", start_time=start_time)

        # Find file 1 unpaired keys
        pass_keys = set(pass_keys)
        unpaired = set(seq_dict_1).difference(pass_keys)
        # Write unpaired file 1 records
        for k in unpaired:
            SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

        printMessage("Done", start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
    log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
    log['SEQUENCES1'] = seq_count_1
    log['SEQUENCES2'] = seq_count_2
    log['PASS'] = pair_count
    log['END'] = 'PairSeq'
    printLog(log)

    # Close file handles
    pass_handle_1.close()
    pass_handle_2.close()
    if out_args['failed']:
        fail_handle_1.close()
        fail_handle_2.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
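# A minimal sketch (assumed, not from the original source) of the pairing
# strategy above: index one file by a coordinate key derived from the read
# header, then stream the second file and look each mate up by the same
# key. _coord_key is a simplified stand-in for getCoordKey, assuming
# Illumina-style identifiers where both mates share the token before the
# first space.
from Bio import SeqIO

def _coord_key(header):
    # Keep only the coordinate portion shared by both mates
    return header.split()[0]

def _pair_files(file_1, file_2, fmt='fastq'):
    # Index file 1 on disk; stream file 2 and yield matched mate pairs
    index_1 = SeqIO.index(file_1, fmt, key_function=_coord_key)
    for seq_2 in SeqIO.parse(file_2, fmt):
        seq_1 = index_1.get(_coord_key(seq_2.id))
        if seq_1 is not None:
            yield seq_1, seq_2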