Esempio n. 1
0
def parseIgBLAST(aligner_output, seq_file, repo, no_parse=True, partial=False,
                 parse_regions=False, parse_scores=False, parse_igblast_cdr3=False,
                 out_args=default_out_args):
    """
    Entry point for converting IgBLAST alignment output into a database file.

    Arguments:
      aligner_output : IgBLAST output file to process.
      seq_file : fasta file that was used as input to IgBLAST; the sequences
                 are read from it.
      repo : folder with germline repertoire files.
      no_parse : if ID is to be parsed for pRESTO output with default delimiters;
                 forwarded to writeDb.
      partial : if True put incomplete alignments in the pass file;
                forwarded to writeDb.
      parse_regions : if True add FWR and CDR fields to the output file.
      parse_scores : if True add alignment score fields to the output file.
      parse_igblast_cdr3 : if True parse the CDR3 sequences generated by IgBLAST.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      None
    """
    # Report the run parameters (parse_igblast_cdr3 is not part of the log)
    printLog(OrderedDict([('START', 'MakeDB'),
                          ('ALIGNER', 'IgBlast'),
                          ('ALIGNER_OUTPUT', os.path.basename(aligner_output)),
                          ('SEQ_FILE', os.path.basename(seq_file)),
                          ('NO_PARSE', no_parse),
                          ('PARTIAL', partial),
                          ('SCORES', parse_scores),
                          ('REGIONS', parse_regions)]))

    # Load everything the reader needs before touching the aligner output
    load_start = time()
    printMessage('Loading sequence files', start_time=load_start, width=25)
    record_total = countSeqFile(seq_file)
    sequences = getSeqDict(seq_file)
    germlines = readRepo(repo)
    printMessage('Done', start_time=load_start, end=True, width=25)

    # Options handed to the reader verbatim
    reader_options = {'parse_scores': parse_scores,
                      'parse_regions': parse_regions,
                      'parse_igblast_cdr3': parse_igblast_cdr3}

    # The aligner file must stay open while writeDb consumes the reader
    with open(aligner_output, 'r') as igblast_handle:
        records = IgBLASTReader(igblast_handle, sequences, germlines,
                                **reader_options)
        prefix = getFilePrefix(aligner_output, out_args)
        writeDb(records, records.fields, prefix, record_total,
                no_parse=no_parse, partial=partial, out_args=out_args)

    return None
Esempio n. 2
0
def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
                 region_fields=False, out_args=default_out_args):
    """
    Main for IgBlast aligned sample sequences

    Arguments:
    igblast_output = IgBlast output file to process
    seq_file = fasta file input to IgBlast (from which to get sequence)
    repo = folder with germline repertoire files
    no_parse = if ID is to be parsed for pRESTO output with default delimiters
    score_fields = if True add alignment score fields to output file
    region_fields = if True add FWR and CDR region fields to output file
    out_args = common output argument dictionary from parseCommonArgs;
               the 'out_dir' and 'out_name' entries are read here

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDB'
    log['ALIGNER'] = 'IgBlast'
    log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['NO_PARSE'] = no_parse
    log['SCORE_FIELDS'] = score_fields
    log['REGION_FIELDS'] = region_fields
    printLog(log)

    # Get input sequence dictionary
    seq_dict = getSeqforIgBlast(seq_file)

    # Formalize out_dir and file-prefix
    if not out_args['out_dir']:
        # No output directory requested: write next to the IgBlast output
        out_dir = os.path.split(igblast_output)[0]
    else:
        out_dir = os.path.abspath(out_args['out_dir'])
        # makedirs also creates missing parent directories, whereas
        # os.mkdir raises when the parent does not exist yet
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
    if out_args['out_name']:
        file_prefix = out_args['out_name']
    else:
        # Default prefix is the IgBlast output name without its extension
        file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
    file_prefix = os.path.join(out_dir, file_prefix)

    # Count records in the sequence file
    total_count = countSeqFile(seq_file)

    # Read the germline repertoire, parse the IgBlast output and write the database
    repo_dict = getRepo(repo)
    igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
                               score_fields=score_fields, region_fields=region_fields)
    writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
            score_fields=score_fields, region_fields=region_fields, out_args=out_args)
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments: 
    seq_file = the sequence file name
    modify_func = the function defining the modification operation; called as
                  modify_func(header, delimiter=..., **modify_args)
    modify_args = a dictionary of arguments to pass to modify_func
    out_args = common output argument dictionary from parseCommonArgs;
               it is only read, never modified
                    
    Returns: 
    the output file name
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    # Unknown functions are logged under their own name
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    # Resolve the output type locally. Writing it back into out_args would
    # alter the shared default dictionary and leak this file's type into
    # later calls that rely on the defaults.
    out_type = in_type if out_args['out_type'] is None else out_args['out_type']
    out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type=out_type)

    # The handle is closed even when a record fails to parse or write
    try:
        # Count records
        result_count = countSeqFile(seq_file)

        # Iterate over sequences
        start_time = time()
        seq_count = 0
        for seq in seq_iter:
            # Print progress for previous iteration
            printProgress(seq_count, result_count, 0.05, start_time)

            # Update counts
            seq_count += 1

            # Modify header
            header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
            header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)

            # Write new sequence; the description is blanked so that only the
            # flattened annotation is carried in id/name
            seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
            seq.description = ''
            SeqIO.write(seq, out_handle, out_type)

        # Print counts
        printProgress(seq_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(out_handle.name)
        log['SEQUENCES'] = seq_count
        log['END'] = 'ParseHeaders'
        printLog(log)
    finally:
        # Close file handles
        out_handle.close()

    return out_handle.name
Esempio n. 4
0
def downsizeSeqFile(seq_file, max_count, out_args=default_out_args):
    """
    Splits a FASTA/FASTQ file into segments with a limited number of records

    Arguments: 
      seq_file : filename of the FASTA file to split
      max_count : number of records in each output file
      out_args : common output argument dictionary from parseCommonArgs;
                 it is only read, never modified

    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'count'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_COUNT'] = max_count
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    # Resolve the output type locally. Writing it back into out_args would
    # alter the shared default dictionary and affect later calls.
    out_type = in_type if out_args['out_type'] is None else out_args['out_type']
    # Determine total numbers of records
    rec_count = countSeqFile(seq_file)

    def _open_part(number):
        # Output handle for the given part number, labelled 'partNNNNNN'
        return getOutputHandle(seq_file,
                               'part%06i' % number,
                               out_dir=out_args['out_dir'],
                               out_name=out_args['out_name'],
                               out_type=out_type)

    # Loop through iterator writing each record and opening new output handle as needed
    start_time = time()
    seq_count, part_num = 0, 1
    out_handle = _open_part(part_num)
    out_files = [out_handle.name]
    try:
        for seq in seq_iter:
            # Print progress for previous iteration
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)

            # Update count
            seq_count += 1

            # Write records
            SeqIO.write(seq, out_handle, out_type)
            # Break if total records reached to avoid extra empty file
            if seq_count == rec_count:
                break

            # Open new file if needed
            if seq_count % max_count == 0:
                out_handle.close()
                part_num += 1
                out_handle = _open_part(part_num)
                out_files.append(out_handle.name)
    finally:
        # Close the current part even if reading or writing failed
        out_handle.close()

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, f in enumerate(out_files):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(f)
    log['SEQUENCES'] = rec_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    return out_files
Esempio n. 5
0
def groupSeqFile(seq_file, field, threshold=None, out_args=default_out_args):
    """
    Divides a sequence file into segments by description tags

    Arguments: 
      seq_file : filename of the sequence file to split
      field : The annotation field to split seq_file by
      threshold : The numerical threshold for group sequences by;
                  if None treat field as textual
      out_args : common output argument dictionary from parseCommonArgs;
                 it is only read, never modified

    Returns: 
      list: output file names
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'group'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['THRESHOLD'] = threshold
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    # Resolve the output type locally. Writing it back into out_args would
    # alter the shared default dictionary and affect later calls.
    out_type = in_type if out_args['out_type'] is None else out_args['out_type']

    # Determine total numbers of records
    rec_count = countSeqFile(seq_file)

    # Process sequences
    start_time = time()
    seq_count = 0
    if threshold is None:
        # Sort records into files based on textual field
        # Create set of unique field tags
        temp_iter = readSeqFile(seq_file)
        tag_list = getAnnotationValues(temp_iter,
                                       field,
                                       unique=True,
                                       delimiter=out_args['delimiter'])

        if sys.platform != 'win32':
            import resource
            # Increase open file handle limit if needed
            file_limit = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
            file_count = len(tag_list) + 256
            # Only act when the current limit is too low. Previously the
            # error fired for every file_count above 8192, even after the
            # user had raised the OS limit as the message instructs.
            if file_limit < file_count:
                if file_count <= 8192:
                    resource.setrlimit(resource.RLIMIT_NOFILE,
                                       (file_count, file_count))
                else:
                    e = '''OS file limit would need to be set to %i.
                    If you are sure you want to do this, then increase the 
                    file limit in the OS (via ulimit) and rerun this tool.
                    ''' % file_count
                    printError(dedent(e))

        # One output file per unique tag, labelled '<field>-<tag>'
        out_labels = [(tag, '%s-%s' % (field, tag)) for tag in tag_list]

        def _group(value):
            # Textual grouping: the annotation value is the group key
            return value
    else:
        # Sort records into files based on numeric threshold
        threshold = float(threshold)
        out_labels = [('under', 'under-%.1g' % threshold),
                      ('atleast', 'atleast-%.1g' % threshold)]

        def _group(value):
            # Numeric grouping: strictly below the threshold or not
            return 'under' if float(value) < threshold else 'atleast'

    handles_dict = {}
    try:
        # Create output handles one at a time so that any handle already
        # opened is still closed if a later open fails
        for key, out_label in out_labels:
            handles_dict[key] = getOutputHandle(seq_file,
                                                out_label,
                                                out_dir=out_args['out_dir'],
                                                out_name=out_args['out_name'],
                                                out_type=out_type)

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequences
            tag = parseAnnotation(seq.description,
                                  delimiter=out_args['delimiter'])[field]
            SeqIO.write(seq, handles_dict[_group(tag)], out_type)

        # Print log
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)
        log = OrderedDict()
        for i, k in enumerate(handles_dict):
            log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
        log['SEQUENCES'] = rec_count
        log['PARTS'] = len(handles_dict)
        log['END'] = 'SplitSeq'
        printLog(log)
    finally:
        # Close output file handles, also when processing failed part-way
        for k in handles_dict:
            handles_dict[k].close()

    return [handles_dict[k].name for k in handles_dict]
Esempio n. 6
0
def collectPairQueue(alive,
                     result_queue,
                     collect_queue,
                     seq_file_1,
                     seq_file_2,
                     label,
                     out_file=None,
                     out_args=default_out_args):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns.
      result_queue : a multiprocessing.Queue holding worker results.
      collect_queue : a multiprocessing.Queue holding collector return values.
      seq_file_1 : the first sequence file name.
      seq_file_2 : the second sequence file name.
      label : task label used to tag the output files.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      None: adds a dictionary of {log: log object, out_files: output file names} to collect_queue.
    """
    # Define output format
    out_type = getFileType(seq_file_1) if out_args['out_type'] is None \
        else out_args['out_type']

    # Define output names
    if out_args['out_name'] is None:
        out_name_1, out_name_2 = None, None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Wrapper for opening handles and writers
    def _open(x, in_file, out_name, out_file=out_file):
        # An explicit out_file only overrides the name of the pass file
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(in_file,
                                     out_label='%s-%s' % (label, x),
                                     out_dir=out_args['out_dir'],
                                     out_name=out_name,
                                     out_type=out_type)
        return handle

    try:
        # Count input size
        result_count = countSeqFile(seq_file_1)

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        # Deliberately catches everything: sibling processes must be told
        # to stop whatever the failure was. The error is re-raised.
        alive.value = False
        raise

    # Output handles are opened lazily, on the first record of each kind.
    # They are declared outside the try so the finally clause can see them.
    pass_handle, fail_handle_1, fail_handle_2 = None, None, None

    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        iter_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break

            # Print progress for previous iteration
            printProgress(iter_count,
                          result_count,
                          0.05,
                          start_time=start_time)

            # Update counts for iteration
            iter_count += 1

            # Write log
            printLog(result.log, handle=log_handle)

            # Write assembled sequences
            if result:
                pass_count += 1
                # Open the pass file on first use. An explicit None check
                # replaces the former "except AttributeError" trigger, which
                # would reopen (and truncate) the file on any unrelated
                # AttributeError raised during a write.
                if pass_handle is None:
                    pass_handle = _open('pass', seq_file_1,
                                        out_args['out_name'])
                SeqIO.write(result.results, pass_handle, out_type)
            else:
                fail_count += 1
                if out_args['failed']:
                    # Open both fail files on first use
                    if fail_handle_1 is None:
                        fail_handle_1 = _open('fail', seq_file_1, out_name_1)
                        fail_handle_2 = _open('fail', seq_file_2, out_name_2)
                    SeqIO.write(result.data[0], fail_handle_1, out_type)
                    SeqIO.write(result.data[1], fail_handle_2, out_type)

        else:
            # Loop ended because alive became False, not via the sentinel
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(iter_count, result_count, 0.05, start_time=start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(
            pass_handle.name) if pass_handle is not None else None
        log['PAIRS'] = iter_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count

        # Close file handles and generate return data. The files are closed
        # before the result is queued so the consumer never sees the names
        # of files that are still open for writing.
        collect_dict = {'log': log, 'out_files': []}
        for handle in (pass_handle, fail_handle_1, fail_handle_2):
            if handle is not None:
                collect_dict['out_files'].append(handle.name)
                handle.close()
        if log_handle is not None:
            log_handle.close()
        collect_queue.put(collect_dict)
    except:
        # Deliberately catches everything; see the note on the first handler
        alive.value = False
        raise
    finally:
        # Release anything still open on the error and sibling-failure
        # paths; closing an already closed file is a no-op.
        for handle in (pass_handle, fail_handle_1, fail_handle_2, log_handle):
            if handle is not None:
                handle.close()

    return None
Esempio n. 7
0
def collectSeqQueue(alive,
                    result_queue,
                    collect_queue,
                    seq_file,
                    label,
                    index_field=None,
                    out_file=None,
                    out_args=default_out_args):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False function returns.
      result_queue : Multiprocessing.Queue holding worker results.
      collect_queue : Multiprocessing.Queue to store collector return values.
      seq_file : sample sequence file name.
      label : task label used to tag the output files.
      index_field : Field defining set membership for sequence sets
                    if None data queue contained individual records.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : Common output argument dictionary from parseCommonArgs.

    Returns:
      None: Adds a dictionary with key value pairs to collect_queue containing
           'log' defining a log object,
           'out_files' defining the output file names
    """
    # Define output format
    out_type = getFileType(seq_file) if out_args['out_type'] is None \
               else out_args['out_type']

    # Wrapper for opening handles and writers
    def _open(x, label=label, out_file=out_file):
        # An explicit out_file only overrides the name of the pass file
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(seq_file,
                                     out_label='%s-%s' % (label, x),
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_type)
        return handle

    try:
        # Count records, or sets of records when an index field is given
        if index_field is None:
            result_count = countSeqFile(seq_file)
        else:
            result_count = countSeqSets(seq_file, index_field,
                                        out_args['delimiter'])

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        # Deliberately catches everything: sibling processes must be told
        # to stop whatever the failure was. The error is re-raised.
        alive.value = False
        raise

    # Output handles are opened lazily, on the first record of each kind.
    # They are declared outside the try so the finally clause can see them.
    pass_handle, fail_handle = None, None

    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        set_count = seq_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty(): continue
            else: result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break

            # Print progress for previous iteration
            printProgress(set_count, result_count, 0.05, start_time=start_time)

            # Update counts for current iteration
            set_count += 1
            seq_count += result.data_count

            # Write log
            printLog(result.log, handle=log_handle)

            # Write records
            if result:
                pass_count += 1
                # Open the pass file on first use. An explicit None check
                # replaces the former "except AttributeError" trigger, which
                # would reopen (and truncate) the file on any unrelated
                # AttributeError raised during a write.
                if pass_handle is None:
                    pass_handle = _open('pass')
                SeqIO.write(result.results, pass_handle, out_type)
            else:
                fail_count += 1
                if out_args['failed']:
                    # Open the fail file on first use
                    if fail_handle is None:
                        fail_handle = _open('fail')
                    SeqIO.write(result.data, fail_handle, out_type)
        else:
            # Loop ended because alive became False, not via the sentinel
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(set_count, result_count, 0.05, start_time=start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(
            pass_handle.name) if pass_handle is not None else None
        log['SEQUENCES'] = seq_count
        if index_field is not None:
            log['SETS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count

        # Close file handles and generate return data. The files are closed
        # before the result is queued so the consumer never sees the names
        # of files that are still open for writing.
        collect_dict = {'log': log, 'out_files': []}
        for handle in (pass_handle, fail_handle):
            if handle is not None:
                collect_dict['out_files'].append(handle.name)
                handle.close()
        if log_handle is not None:
            log_handle.close()
        collect_queue.put(collect_dict)
    except:
        # Deliberately catches everything; see the note on the first handler
        alive.value = False
        raise
    finally:
        # Release anything still open on the error and sibling-failure
        # paths; closing an already closed file is a no-op.
        for handle in (pass_handle, fail_handle, log_handle):
            if handle is not None:
                handle.close()

    return None
Esempio n. 8
0
def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False,
                 asis_id=True, asis_calls=False, extended=False, regions='default',
                 format='changeo', out_file=None, out_args=default_out_args):
    """
    Main for IgBLAST aligned sample sequences.

    Arguments:
      aligner_file (str): IgBLAST output file to process.
      seq_file (str): fasta file input to IgBlast (from which to get sequence).
      repo (str): folder with germline repertoire files.
      amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False.
      cellranger_file (str): supplementary annotation table read with readCellRanger;
                             no annotations are added if None.
      partial : If True put incomplete alignments in the pass file.
      asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters.
      asis_calls (bool): if True do not parse gene calls for allele names.
      extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output.
      regions (str): name of the IMGT FWR/CDR region definitions to use.
      format (str): output format. one of 'changeo' or 'airr'.
      out_file (str): output file name. Automatically generated from the input file if None.
      out_args (dict): common output argument dictionary from parseCommonArgs;
                       it is copied before the output type is set.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDB'
    log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast'
    log['ALIGNER_FILE'] = os.path.basename(aligner_file)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['ASIS_ID'] = asis_id
    log['ASIS_CALLS'] = asis_calls
    log['PARTIAL'] = partial
    log['EXTENDED'] = extended
    printLog(log)

    # Set amino acid conditions: the '-aa' format variant and the matching reader
    if amino_acid:
        format = '%s-aa' % format
        parser = IgBLASTReaderAA
    else:
        parser = IgBLASTReader

    # Start
    start_time = time()
    printMessage('Loading files', start_time=start_time, width=20)

    # Count records in sequence file
    total_count = countSeqFile(seq_file)

    # Get input sequence dictionary
    seq_dict = getSeqDict(seq_file)

    # Create germline repo dictionary
    references = readGermlines(repo, asis=asis_calls)

    # Load supplementary annotation table
    if cellranger_file is not None:
        cellranger_fields = cellranger_extended if extended else cellranger_base
        annotations = readCellRanger(cellranger_file, fields=cellranger_fields)
    else:
        annotations = None

    printMessage('Done', start_time=start_time, end=True, width=20)

    # Check for IMGT-gaps in germlines
    if all('...' not in x for x in references.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Define format operators
    try:
        __, writer, schema = getFormatOperators(format)
    except ValueError:
        # NOTE(review): schema is unbound below unless printError aborts - confirm
        printError('Invalid format %s.' % format)
    # Set the output type on a copy. Assigning into out_args directly would
    # modify the caller's dictionary and the shared module-level default.
    out_args = dict(out_args, out_type=schema.out_type)

    # Define output fields
    fields = list(schema.required)
    if extended:
        custom = parser.customFields(schema=schema)
        fields.extend(custom)

    # Parse and write output; the aligner file must stay open while the
    # lazy germ_iter generator is consumed by writeDb
    with open(aligner_file, 'r') as aligner_handle:
        parse_iter = parser(aligner_handle, seq_dict, references, regions=regions, asis_calls=asis_calls)
        germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
                         annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id,
                         regions=regions, writer=writer, out_file=out_file, out_args=out_args)

    return output
Esempio n. 9
0
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None,
            coord_type=default_coord_type,
            out_args=default_out_args):
    """
    Pairs the records of two sequence files by their coordinate keys

    seq_file_1 is indexed by coordinate key and seq_file_2 is streamed against
    that index. Matched records are written to the 'pair-pass' output files;
    when out_args['failed'] is set, unmatched records of both files are written
    to the 'pair-fail' output files.

    Arguments: 
    seq_file_1 = the file containing the grouped sequences and annotations
    seq_file_2 = the file to assign annotations to from seq_file_1
    fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records;
               if None do not copy any annotations
    fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records;
               if None do not copy any annotations
    coord_type = the sequence header format
    out_args = common output argument dictionary from parseCommonArgs
                    
    Returns: 
    a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2)
    """
    delimiter = out_args['delimiter']
    write_failed = out_args['failed']

    # Define private functions
    def _key_func(x):
        return getCoordKey(x, coord_type=coord_type, delimiter=delimiter)

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    # Every opened output handle is registered here so that the finally
    # clause closes all of them, including the fail handles, even on error
    handles = []
    try:
        # Open output file handles
        pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'],
                                        out_name=out_name_1, out_type=out_type_1)
        handles.append(pass_handle_1)
        pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'],
                                        out_name=out_name_2, out_type=out_type_2)
        handles.append(pass_handle_2)

        if write_failed:
            fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
                                            out_name=out_name_1, out_type=out_type_1)
            handles.append(fail_handle_1)
            fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
                                            out_name=out_name_2, out_type=out_type_2)
            handles.append(fail_handle_2)
            # Keys of file 1 records that found a mate in file 2
            paired_keys = set()

        # Iterate over pairs and write to output files
        start_time = time()
        rec_count = pair_count = 0
        for seq_2 in seq_iter_2:
            # Print progress for previous iteration
            printProgress(rec_count, seq_count_2, 0.05, start_time)
            rec_count += 1

            # Check for file 2 mate pair in file 1
            coord_2 = _key_func(seq_2.id)
            seq_1 = seq_dict_1.get(coord_2, None)

            if seq_1 is None:
                # Write unpaired file 2 records
                if write_failed:
                    SeqIO.write(seq_2, fail_handle_2, out_type_2)
                continue

            pair_count += 1
            # Record paired keys for finding unpaired file 1 records later
            if write_failed:
                paired_keys.add(coord_2)

            if fields_1 is not None or fields_2 is not None:
                ann_1 = parseAnnotation(seq_1.description, delimiter=delimiter)
                ann_2 = parseAnnotation(seq_2.description, delimiter=delimiter)

                # Prepend annotations from seq_1 to seq_2
                if fields_1 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_1.items()
                                            if k in fields_1])
                    merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
                                                delimiter=delimiter)
                    seq_2.id = flattenAnnotation(merge_ann, delimiter=delimiter)
                    seq_2.description = ''

                # Append annotations from seq_2 to seq_1
                if fields_2 is not None:
                    copy_ann = OrderedDict([(k, v) for k, v in ann_2.items()
                                            if k in fields_2])
                    merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False,
                                                delimiter=delimiter)
                    seq_1.id = flattenAnnotation(merge_ann, delimiter=delimiter)
                    seq_1.description = ''

            # Write paired records
            SeqIO.write(seq_1, pass_handle_1, out_type_1)
            SeqIO.write(seq_2, pass_handle_2, out_type_2)

        # Print final progress
        printProgress(rec_count, seq_count_2, 0.05, start_time)

        # Find and write unpaired file 1 records
        if write_failed:
            start_time = time()
            printMessage("Finding unpaired", start_time=start_time)

            # Walk the index itself so the fail file is written in index order
            # rather than in arbitrary set order
            for key in seq_dict_1:
                if key not in paired_keys:
                    SeqIO.write(seq_dict_1[key], fail_handle_1, out_type_1)

            printMessage("Done", start_time=start_time, end=True)

        # Print log
        log = OrderedDict()
        log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
        log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
        log['SEQUENCES1'] = seq_count_1
        log['SEQUENCES2'] = seq_count_2
        log['PASS'] = pair_count
        log['END'] = 'PairSeq'
        printLog(log)
    finally:
        # Close file handles
        for handle in handles:
            handle.close()

    return [(pass_handle_1.name, pass_handle_2.name)]
Esempio n. 10
0
def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Arguments:
      seq_file : the sequence file name.
      convert_func : the function used to convert sequence headers.
      convert_args : a dictionary of arguments to pass to convert_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the output sequence file name, or None if no record was converted.
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader: 'generic',
                convert454Header: '454',
                convertGenbankHeader: 'genbank',
                convertIlluminaHeader: 'illumina',
                convertIMGTHeader: 'imgt',
                convertMIGECHeader: 'migec',
                convertSRAHeader: 'sra'}

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = cmd_dict[convert_func]
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Work on a copy so that neither the shared default dictionary nor the
    # caller's dictionary is modified when the delimiter is added below
    convert_args = dict(convert_args)

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    # Resolve the output type locally instead of writing it back into the
    # (possibly shared) out_args dictionary
    out_type = in_type if out_args['out_type'] is None else out_args['out_type']

    # Wrapper for opening handles and writers
    def _open(x, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(seq_file,
                                     'convert-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_type)
        return handle

    # Count records
    result_count = countSeqFile(seq_file)

    # Set additional conversion arguments
    if convert_func in [convertGenericHeader, convertGenbankHeader]:
        convert_args['delimiter'] = out_args['delimiter']

    # Output files are opened lazily, on the first record that needs them
    pass_handle, fail_handle = None, None

    try:
        # Iterate over sequences
        start_time = time()
        seq_count = pass_count = fail_count = 0
        for seq in seq_iter:
            # Print progress for previous iteration and update count
            printProgress(seq_count, result_count, 0.05, start_time=start_time)
            seq_count += 1

            # Convert header
            header = convert_func(seq.description, **convert_args)

            if header is not None:
                # Write successfully converted sequences
                pass_count += 1
                seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
                seq.description = ''
                if pass_handle is None:
                    pass_handle = _open('pass')
                SeqIO.write(seq, pass_handle, out_type)
            else:
                fail_count += 1
                if out_args['failed']:
                    # Write unconverted sequences
                    if fail_handle is None:
                        fail_handle = _open('fail')
                    SeqIO.write(seq, fail_handle, out_type)

        # Print counts
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
        log['SEQUENCES'] = seq_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'ConvertHeaders'
        printLog(log)
    finally:
        # Close file handles
        if pass_handle is not None:
            pass_handle.close()
        if fail_handle is not None:
            fail_handle.close()

    return pass_handle.name if pass_handle is not None else None
Esempio n. 11
0
def estimateBarcode(seq_file,
                    barcode_field=default_barcode_field,
                    distance_types=default_distance_types,
                    out_args=default_out_args):
    """
    Computes the barcode distance distribution and a distance threshold

    The distance table and the threshold table are both written as
    tab-delimited files.

    Arguments: 
      seq_file : the sample sequence file name
      barcode_field : the annotation field containing barcode sequences.
      distance_types : distance types to include.
      out_args : common output argument dictionary from parseCommonArgs
                        
    Returns: 
      tuple: names of the output files.
    """
    delimiter = out_args['delimiter']

    # Pull the barcode annotation value out of a record's header
    def _extract(record):
        annotation = parseAnnotation(record.description, delimiter=delimiter)
        return annotation[barcode_field]

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'EstimateError'
    log['COMMAND'] = 'barcode'
    log['FILE'] = os.path.basename(seq_file)
    log['BARCODE_FIELD'] = barcode_field
    printLog(log)

    # Count the records and set up a lazy stream of barcode values
    result_count = countSeqFile(seq_file)
    barcode_iter = (_extract(rec) for rec in readSeqFile(seq_file))

    # The number of bins is the length of the first barcode plus one; a
    # separate reader is used so the stream above is left untouched
    first_record = next(readSeqFile(seq_file))
    n_bins = len(_extract(first_record)) + 1
    mismatch = initializeMismatchDictionary(0,
                                            distance_types=distance_types,
                                            bin_count=n_bins)

    # Calculate distances and keep only the requested distance types
    distance_mismatch = calculateDistances(barcode_iter, bin_count=n_bins)
    selected = {}
    for dist_type in distance_types:
        selected[dist_type] = distance_mismatch[dist_type]
    mismatch['dist'] = selected

    # Build the distance table with the index rescaled by the number of rows
    dist_df = pd.DataFrame.from_dict(mismatch['dist'])
    dist_df.index = dist_df.index / len(dist_df.index)
    dist_df[['all']] = dist_df[['all']].astype(int)

    # Find the threshold (average minimum between 0 and 0.75)
    dist = mismatch['dist']['all']
    cutoff = int(len(dist) * 0.75)
    minima = [pos for pos in np.argsort(dist[:cutoff])
              if dist[pos] == np.min(dist)]
    thresh_pos = int(np.mean(minima))
    thresh_df = pd.DataFrame.from_dict({'thresh': {'ALL': dist_df.index[thresh_pos]}})

    file_args = dict(out_dir=out_args['out_dir'],
                     out_name=out_args['out_name'],
                     out_type='tab')
    # Settings shared by both tables
    table_args = dict(sep='\t', na_rep='NA', index=True, float_format='%.6f')

    # Output as tsv
    with getOutputHandle(seq_file, 'distance-barcode', **file_args) as dist_handle, \
        getOutputHandle(seq_file, 'threshold-barcode', **file_args) as thresh_handle:

        dist_df.to_csv(dist_handle,
                       index_label='DISTANCE',
                       columns=['all'],
                       header=['ALL'],
                       **table_args)
        thresh_df.to_csv(thresh_handle,
                         index_label='TYPE',
                         columns=['thresh'],
                         header=['THRESHOLD'],
                         **table_args)

    # Extend the parameter log with the results and print it again
    log['OUTPUT1'] = os.path.basename(dist_handle.name)
    log['OUTPUT2'] = os.path.basename(thresh_handle.name)
    log['SEQUENCES'] = result_count
    log['ALL_THRESHOLD'] = '%.6f' % thresh_df['thresh']['ALL']
    log['END'] = 'EstimateError'
    printLog(log)

    return (dist_handle.name, thresh_handle.name)
def assemblePairs(head_file, tail_file, assemble_func, assemble_args={}, 
                  coord_type=default_coord_type, rc=None, 
                  head_fields=None, tail_fields=None,  
                  out_args=default_out_args, nproc=None, queue_size=None):
    """
    Runs paired-end assembly over two sequence files through the process manager

    Arguments: 
    head_file = the head sequence file name
    tail_file = the tail sequence file name
    assemble_func = the function to use to assemble paired ends
    assemble_args = a dictionary of arguments to pass to the assembly function
    coord_type = the sequence header format
    rc = Defines which sequences ('head','tail','both') to reverse complement before assembly;
         if None do not reverse complement sequences
    head_fields = list of annotations in head_file records to copy to assembled record;
                  if None do not copy an annotation
    tail_fields = list of annotations in tail_file records to copy to assembled record;
                  if None do not copy an annotation
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                 
    Returns: 
    a list of successful output file names
    """
    # Subcommand labels for the known assembly functions
    cmd_dict = {alignAssembly:'align', joinSeqPair:'join', referenceAssembly:'reference'}

    # Optional assembly arguments and the log label each one is reported under,
    # in the order they appear in the log
    optional_log = (('ref_file', 'REFFILE'),
                    ('alpha', 'ALPHA'),
                    ('max_error', 'MAX_ERROR'),
                    ('min_len', 'MIN_LEN'),
                    ('max_len', 'MAX_LEN'),
                    ('scan_reverse', 'SCAN_REVERSE'),
                    ('gap', 'GAP'),
                    ('min_ident', 'MIN_IDENT'),
                    ('evalue', 'EVALUE'),
                    ('max_hits', 'MAX_HITS'),
                    ('fill', 'FILL'))

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AssemblePairs'
    log['COMMAND'] = cmd_dict.get(assemble_func, assemble_func.__name__)
    log['FILE1'] = os.path.basename(head_file)
    log['FILE2'] = os.path.basename(tail_file)
    log['COORD_TYPE'] = coord_type
    for arg_name, label in optional_log:
        if arg_name in assemble_args:
            log[label] = assemble_args[arg_name]
    log['NPROC'] = nproc
    printLog(log)

    # Both input files must hold the same number of records
    head_count = countSeqFile(head_file)
    tail_count = countSeqFile(tail_file)
    if head_count != tail_count:
        sys.exit('Error: FILE1 (n=%i) and FILE2 (n=%i) must have the same number of records' \
                 % (head_count, tail_count))

    delimiter = out_args['delimiter']

    # Feeder arguments
    feed_args = {'seq_file_1': head_file,
                 'seq_file_2': tail_file,
                 'coord_type': coord_type,
                 'delimiter': delimiter}
    # Worker arguments
    work_args = {'process_func': processAssembly,
                 'process_args': {'assemble_func': assemble_func,
                                  'assemble_args': assemble_args,
                                  'rc': rc,
                                  'fields_1': head_fields,
                                  'fields_2': tail_fields,
                                  'delimiter': delimiter}}
    # Collector arguments
    collect_args = {'result_count': head_count,
                    'seq_file_1': head_file,
                    'seq_file_2': tail_file,
                    'out_args': out_args}

    # Call process manager with the feeder, worker and collector functions
    result = manageProcesses(feedPairQueue, processSeqQueue, collectPairQueue,
                             feed_args, work_args, collect_args,
                             nproc, queue_size)

    # Print log with OUTPUT first, followed by the remaining collector entries
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for key, value in result['log'].items():
        log[key] = value
    log['END'] = 'AssemblePairs'
    printLog(log)

    return result['out_files']
Esempio n. 13
0
def assemblePairs(head_file,
                  tail_file,
                  assemble_func,
                  assemble_args={},
                  coord_type=default_coord,
                  rc='tail',
                  head_fields=None,
                  tail_fields=None,
                  out_file=None,
                  out_args=default_out_args,
                  nproc=None,
                  queue_size=None):
    """
    Assembles paired-end reads from two sequence files

    Arguments: 
      head_file : the head sequence file name
      tail_file : the tail sequence file name
      assemble_func : the function to use to assemble paired ends
      assemble_args : a dictionary of arguments to pass to the assembly function;
                      the dictionary passed in is not modified.
      coord_type : the sequence header format
      rc : Defines which sequences ('head', 'tail', 'both', 'none') to reverse complement before assembly;
           if 'none' do not reverse complement sequences
      head_fields : list of annotations in head_file records to copy to assembled record;
                    if None do not copy an annotation
      tail_fields : list of annotations in tail_file records to copy to assembled record;
                    if None do not copy an annotation
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc
                 
    Returns: 
      list: a list of successful output file names.
    """
    # Work on a copy: the reference setup below removes and adds keys, which
    # must not leak into the caller's dictionary or the shared default argument
    assemble_args = dict(assemble_args)

    # Define subcommand label dictionary
    cmd_dict = {
        alignAssembly: 'align',
        joinAssembly: 'join',
        referenceAssembly: 'reference',
        sequentialAssembly: 'sequential'
    }
    cmd_name = cmd_dict.get(assemble_func, assemble_func.__name__)
    uses_reference = cmd_name in ('reference', 'sequential')

    # Optional assembly arguments and the log label each one is reported under
    optional_log = (('ref_file', 'REFFILE'),
                    ('alpha', 'ALPHA'),
                    ('max_error', 'MAX_ERROR'),
                    ('min_len', 'MIN_LEN'),
                    ('max_len', 'MAX_LEN'),
                    ('scan_reverse', 'SCAN_REVERSE'),
                    ('gap', 'GAP'),
                    ('min_ident', 'MIN_IDENT'),
                    ('evalue', 'EVALUE'),
                    ('max_hits', 'MAX_HITS'),
                    ('fill', 'FILL'),
                    ('aligner', 'ALIGNER'))

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'AssemblePairs'
    log['COMMAND'] = cmd_name
    log['FILE1'] = os.path.basename(head_file)
    log['FILE2'] = os.path.basename(tail_file)
    log['COORD_TYPE'] = coord_type
    for arg_name, label in optional_log:
        if arg_name in assemble_args:
            log[label] = assemble_args[arg_name]
    log['NPROC'] = nproc
    printLog(log)

    # Count input files
    head_count = countSeqFile(head_file)
    tail_count = countSeqFile(tail_file)
    if head_count != tail_count:
        printError('FILE1 (n=%i) and FILE2 (n=%i) must have the same number of records.' \
                 % (head_count, tail_count))

    # Handle of the reference database; stays None when no database is built
    db_handle = None
    try:
        # Setup for reference alignment
        if uses_reference:
            ref_file = assemble_args.pop('ref_file')
            db_exec = assemble_args.pop('db_exec')

            # Build reference sequence dictionary
            assemble_args['ref_dict'] = readReferenceFile(ref_file)

            # Build reference database files
            aligner = assemble_args.get('aligner')
            try:
                db_func = {
                    'blastn': makeBlastnDb,
                    'usearch': makeUBlastDb
                }[aligner]
                ref_db, db_handle = db_func(ref_file, db_exec)
            except Exception:
                printError('Error building reference database for aligner %s with executable %s.' \
                           % (aligner, db_exec))
            else:
                assemble_args['ref_db'] = ref_db

        # Define feeder function and arguments
        feed_func = feedPairQueue
        feed_args = {
            'seq_file_1': head_file,
            'seq_file_2': tail_file,
            'coord_type': coord_type,
            'delimiter': out_args['delimiter']
        }
        # Define worker function and arguments
        process_args = {
            'assemble_func': assemble_func,
            'assemble_args': assemble_args,
            'rc': rc,
            'fields_1': head_fields,
            'fields_2': tail_fields,
            'delimiter': out_args['delimiter']
        }
        work_func = processSeqQueue
        work_args = {'process_func': assemblyWorker, 'process_args': process_args}
        # Define collector function and arguments
        collect_func = collectPairQueue
        collect_args = {
            'seq_file_1': head_file,
            'seq_file_2': tail_file,
            'label': 'assemble',
            'out_file': out_file,
            'out_args': out_args
        }

        # Call process manager
        result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                                 work_args, collect_args, nproc, queue_size)
    finally:
        # Release the reference database even if processing failed. The handle
        # exposes either close() or cleanup(), so fall back to the latter.
        if db_handle is not None:
            try:
                db_handle.close()
            except AttributeError:
                db_handle.cleanup()
            except Exception:
                printError('Cannot close reference database file.')

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = result['log'].pop('OUTPUT')
    for k, v in result['log'].items():
        log[k] = v
    log['END'] = 'AssemblePairs'
    printLog(log)

    return result['out_files']
Esempio n. 14
0
def clusterBarcodes(seq_file,
                    ident=default_cluster_ident,
                    length_ratio=default_length_ratio,
                    barcode_field=default_barcode_field,
                    cluster_field=default_cluster_field,
                    cluster_prefix=default_cluster_prefix,
                    cluster_tool=default_cluster_tool,
                    cluster_exec=default_cluster_exec,
                    out_file=None,
                    out_args=default_out_args,
                    nproc=None):
    """
    Clusters barcode sequences and annotates each record with its cluster

    Arguments:
      seq_file : the sample sequence file name.
      ident : the identity threshold for clustering sequences.
      length_ratio : minimum short/long length ratio allowed within a cluster.
      barcode_field : the annotation field containing barcode sequences.
      cluster_field : the name of the output cluster field.
      cluster_prefix : string defining a prefix for the cluster identifier.
      cluster_tool : the clustering tool to use; one of cd-hit or usearch.
      cluster_exec : the path to the executable for usearch.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : output arguments.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.

    Returns:
      str: the clustered output file name
    """

    # Function to modify SeqRecord header with cluster identifier
    def _header(seq,
                cluster,
                field=cluster_field,
                prefix=cluster_prefix,
                delimiter=out_args['delimiter']):
        label = '%s%i' % (prefix, cluster)
        header = parseAnnotation(seq.description, delimiter=delimiter)
        header = mergeAnnotation(header, {field: label}, delimiter=delimiter)
        seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
        seq.description = ''
        return seq

    # Function to make a SeqRecord object from a barcode annotation
    def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
        header = parseAnnotation(seq.description, delimiter=delimiter)
        return SeqRecord(Seq(header[field]), id=seq.id)

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ClusterSets'
    log['COMMAND'] = 'barcode'
    log['FILE'] = os.path.basename(seq_file)
    log['IDENTITY'] = ident
    log['BARCODE_FIELD'] = barcode_field
    log['CLUSTER_FIELD'] = cluster_field
    log['CLUSTER_PREFIX'] = cluster_prefix
    log['CLUSTER_TOOL'] = cluster_tool
    log['NPROC'] = nproc
    printLog(log)

    # Set cluster tool. dict.get() returns None for an unknown key instead of
    # raising, so the result has to be checked explicitly.
    cluster_func = map_cluster_tool.get(cluster_tool)
    if cluster_func is None:
        printError('Invalid clustering tool %s.' % cluster_tool)

    # Check the minimum identity
    if ident < min_cluster_ident[cluster_tool]:
        printError('Minimum identity %s too low for clustering tool %s.' %
                   (str(ident), cluster_tool))

    # Count sequence file and define a lazy stream of barcode SeqRecords
    result_count = countSeqFile(seq_file)
    barcode_iter = (_barcode(x) for x in readSeqFile(seq_file))

    # Perform clustering
    start_time = time()
    printMessage('Running %s' % cluster_tool, start_time=start_time, width=25)
    cluster_dict = cluster_func(barcode_iter,
                                ident=ident,
                                length_ratio=length_ratio,
                                seq_start=0,
                                seq_end=None,
                                threads=nproc,
                                cluster_exec=cluster_exec)
    printMessage('Done', start_time=start_time, end=True, width=25)

    # Determine file type locally instead of writing it back into the
    # (possibly shared) out_args dictionary
    out_type = out_args['out_type']
    if out_type is None:
        out_type = getFileType(seq_file)

    # Open output file handles
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file,
                                      'cluster-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)

    try:
        # Open indexed sequence file
        seq_dict = readSeqFile(seq_file, index=True)

        # Iterate over sequence records and update header with cluster annotation
        start_time = time()
        rec_count = pass_count = 0
        for cluster, id_list in cluster_dict.items():
            printProgress(rec_count, result_count, 0.05, start_time=start_time)
            rec_count += len(id_list)

            # TODO:  make a generator. Figure out how to get pass_count updated
            # Define output sequences
            seq_output = [_header(seq_dict[x], cluster) for x in id_list]

            # Write output
            pass_count += len(seq_output)
            SeqIO.write(seq_output, pass_handle, out_type)

        # Update progress
        printProgress(rec_count, result_count, 0.05, start_time=start_time)

        # Print log
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLUSTERS'] = len(cluster_dict)
        log['SEQUENCES'] = result_count
        log['PASS'] = pass_count
        # NOTE(review): rec_count and pass_count advance in lockstep above, so
        # this difference is always 0; confirm whether result_count was meant
        log['FAIL'] = rec_count - pass_count
        log['END'] = 'ClusterSets'
        printLog(log)
    finally:
        # Close handles
        pass_handle.close()

    return pass_handle.name
Esempio n. 15
0
def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True,
              extended=False, format=default_format, out_file=None, out_args=default_out_args):
    """
    Parses iHMMune-Align output and writes it as a database file.

    Arguments:
      aligner_file : iHMMune-Align output file to process.
      seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
      repo : folder with germline repertoire files.
      cellranger_file : supplementary annotation table to load; skipped if None.
      partial : If True put incomplete alignments in the pass file.
      asis_id : if ID is to be parsed for pRESTO output with default delimiters.
      extended : if True parse alignment scores, FWR and CDR region fields.
      format : output format. One of 'changeo' or 'airr'.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Print parameter info
    log = OrderedDict([('START', 'MakeDB'),
                       ('COMMAND', 'ihmm'),
                       ('ALIGNER_FILE', os.path.basename(aligner_file)),
                       ('SEQ_FILE', os.path.basename(seq_file)),
                       ('ASIS_ID', asis_id),
                       ('PARTIAL', partial),
                       ('EXTENDED', extended)])
    printLog(log)

    start_time = time()
    printMessage('Loading files', start_time=start_time, width=20)

    # Record count, input sequences and germline references
    total_count = countSeqFile(seq_file)
    seq_dict = getSeqDict(seq_file)
    references = readGermlines(repo)

    # Supplementary annotation table, loaded only when a file was given
    annotations = None
    if cellranger_file is not None:
        if extended:
            cellranger_fields = cellranger_extended
        else:
            cellranger_fields = cellranger_base
        annotations = readCellRanger(cellranger_file, fields=cellranger_fields)

    printMessage('Done', start_time=start_time, end=True, width=20)

    # Warn when no germline sequence contains an IMGT-gap spacer
    if not any('...' in germline for germline in references.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Look up the writer and schema for the requested format
    try:
        _reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)
    # The output type is stored in out_args, which is handed to writeDb below
    out_args['out_type'] = schema.out_type

    # Required fields first, then the custom fields when extended output is requested
    fields = list(schema.required)
    if extended:
        fields += IHMMuneReader.customFields(scores=True, regions=True, schema=schema)

    # Parse and write output
    with open(aligner_file, 'r') as aligner_handle:
        records = IHMMuneReader(aligner_handle, seq_dict, references)
        germ_iter = (addGermline(rec, references) for rec in records)
        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
                         annotations=annotations, asis_id=asis_id, partial=partial,
                         writer=writer, out_file=out_file, out_args=out_args)

    return output
def tableHeaders(seq_file, fields, out_args=default_out_args):
    """
    Builds a table of sequence header annotations

    Each sequence description is parsed into annotations and written as one
    tab-delimited row. Annotations not listed in fields are ignored and listed
    fields missing from a record are written as empty strings.

    Arguments:
      seq_file : the sequence file name.
      fields : the list of annotation fields to output as table columns.
      out_args : common output argument dictionary from parseCommonArgs;
                 the 'out_dir', 'out_name' and 'delimiter' entries are used.

    Returns:
      the output table file name (the name attribute of the output handle).
    """
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = 'table'
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open file handles
    seq_iter = readSeqFile(seq_file)
    out_handle = getOutputHandle(seq_file, out_label='headers', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')

    # Everything that touches the open handle runs inside try/finally so the
    # handle is closed (and its buffer flushed) even if a record fails to parse.
    try:
        # Count records
        result_count = countSeqFile(seq_file)

        # Open csv writer and write header
        out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='',
                                    delimiter='\t', fieldnames=fields)
        out_writer.writeheader()

        # Iterate over sequences
        start_time = time()
        seq_count = pass_count = fail_count = 0
        for seq in seq_iter:
            # Print progress for previous iteration
            printProgress(seq_count, result_count, 0.05, start_time)

            # Get annotations
            seq_count += 1
            ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter'])

            # Write records; an empty annotation result counts as a failure
            if ann:
                pass_count += 1
                out_writer.writerow(ann)
            else:
                fail_count += 1

        # Print counts
        printProgress(seq_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(out_handle.name)
        log['SEQUENCES'] = seq_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'ParseHeaders'
        printLog(log)
    finally:
        # Close file handles
        out_handle.close()

    return out_handle.name
Esempio n. 17
0
def tableHeaders(seq_file, fields, out_file=None, out_args=default_out_args):
    """
    Builds a table of sequence header annotations

    Each sequence description is parsed into annotations and written as one
    tab-delimited row. Annotations not listed in fields are ignored and listed
    fields missing from a record are written as empty strings.

    Arguments:
      seq_file : the sequence file name.
      fields : the list of fields to output.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: output table file name
    """
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = 'table'
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open file handles
    seq_iter = readSeqFile(seq_file)
    if out_file is not None:
        # newline='' lets the csv writer control line endings itself; without
        # it, platforms that translate '\n' emit an extra '\r' per row.
        out_handle = open(out_file, 'w', newline='')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'headers',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type='tab')

    # Everything that touches the open handle runs inside try/finally so the
    # handle is closed (and its buffer flushed) even if a record fails to parse.
    try:
        # Count records
        result_count = countSeqFile(seq_file)

        # Open csv writer and write header
        out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='',
                                    delimiter='\t', fieldnames=fields)
        out_writer.writeheader()

        # Iterate over sequences
        start_time = time()
        seq_count = pass_count = fail_count = 0
        for seq in seq_iter:
            # Print progress for previous iteration
            printProgress(seq_count, result_count, 0.05, start_time=start_time)

            # Get annotations
            seq_count += 1
            ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter'])

            # Write records; an empty annotation result counts as a failure
            if ann:
                pass_count += 1
                out_writer.writerow(ann)
            else:
                fail_count += 1

        # Print counts
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(out_handle.name)
        log['SEQUENCES'] = seq_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'ParseHeaders'
        printLog(log)
    finally:
        # Close file handles
        out_handle.close()

    return out_handle.name
Esempio n. 18
0
def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args):
    """
    Modifies sequence headers

    Every record's description is parsed into an annotation dictionary, passed
    through modify_func, flattened back into a header string and written out
    with an empty description.

    Arguments:
      seq_file : the sequence file name.
      modify_func : the function defining the modification operation; called as
                    modify_func(header, delimiter=..., **modify_args).
      modify_args : a dictionary of arguments to pass to modify_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                 This dictionary is not modified.

    Returns:
      str: output file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add',
                copyHeader: 'copy',
                collapseHeader: 'collapse',
                deleteHeader: 'delete',
                expandHeader: 'expand',
                renameHeader: 'rename'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    # Resolve the output type into a local instead of writing it back into
    # out_args: out_args is the shared default_out_args unless the caller
    # supplies one, and storing the first file's type there would force that
    # type onto every later call.
    out_type = in_type if out_args['out_type'] is None else out_args['out_type']
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file,
                                     'reheader',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_type)

    # The handle is closed in the finally clause so it is released and flushed
    # even when modify_func or the writer raises part way through the file.
    try:
        # Count records
        result_count = countSeqFile(seq_file)

        # Iterate over sequences
        start_time = time()
        seq_count = 0
        for seq in seq_iter:
            # Print progress for previous iteration
            printProgress(seq_count, result_count, 0.05, start_time=start_time)

            # Update counts
            seq_count += 1

            # Modify header
            header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
            header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)

            # Write new sequence; the description is blanked so the flattened
            # header is the only header text on the record
            seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
            seq.description = ''
            SeqIO.write(seq, out_handle, out_type)

        # Print counts
        printProgress(seq_count, result_count, 0.05, start_time=start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(out_handle.name)
        log['SEQUENCES'] = seq_count
        log['END'] = 'ParseHeaders'
        printLog(log)
    finally:
        # Close file handles
        out_handle.close()

    return out_handle.name
def convertHeaders(seq_file, convert_func, convert_args=None, out_args=default_out_args):
    """
    Converts sequence headers to the pRESTO format

    Records whose header converts successfully are written to the
    'convert-pass' file. Records for which convert_func returns None are
    counted as failures and, if out_args['failed'] is set, written unchanged
    to the 'convert-fail' file.

    Arguments:
      seq_file : the sequence file name.
      convert_func : the function used to convert sequence headers; called as
                     convert_func(description, **convert_args).
      convert_args : a dictionary of arguments to pass to convert_func;
                     None is treated as an empty dictionary. The dictionary
                     passed in is not modified.
      out_args : common output argument dictionary from parseCommonArgs.
                 This dictionary is not modified.

    Returns:
      the output sequence file name (the pass file).
    """
    # Define subcommand label dictionary
    cmd_dict = {convertGenericHeader: 'generic',
                convert454Header: '454',
                convertGenbankHeader: 'genbank',
                convertIlluminaHeader: 'illumina',
                convertIMGTHeader: 'imgt',
                convertSRAHeader: 'sra'}

    # Fall back to the function name for converters without a registered label
    command = cmd_dict.get(convert_func)
    if command is None:
        command = getattr(convert_func, '__name__', str(convert_func))

    log = OrderedDict()
    log['START'] = 'ConvertHeaders'
    log['COMMAND'] = command
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Work on a private copy of the conversion arguments. The delimiter added
    # below must not leak into the caller's dictionary (or a shared default),
    # where a later call with a converter that takes no delimiter would fail.
    convert_args = dict(convert_args) if convert_args is not None else {}

    # Open input file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    # Resolve the output type locally rather than storing it in the shared
    # out_args dictionary, so one file's type never carries over to the next.
    out_type = in_type if out_args['out_type'] is None else out_args['out_type']

    # Count records
    result_count = countSeqFile(seq_file)

    # Open output file handles
    pass_handle = getOutputHandle(seq_file,
                                  'convert-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_type)
    fail_handle = None

    # All open handles are closed in the finally clause, including on error
    try:
        if out_args['failed']:
            fail_handle = getOutputHandle(seq_file,
                                          'convert-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)

        # Set additional conversion arguments
        if convert_func in [convertGenericHeader, convertGenbankHeader]:
            convert_args['delimiter'] = out_args['delimiter']

        # Iterate over sequences
        start_time = time()
        seq_count = pass_count = fail_count = 0
        for seq in seq_iter:
            # Print progress for previous iteration and update count
            printProgress(seq_count, result_count, 0.05, start_time)
            seq_count += 1

            # Convert header
            header = convert_func(seq.description, **convert_args)

            if header is not None:
                # Write successfully converted sequences
                pass_count += 1
                seq.id = seq.name = flattenAnnotation(header, out_args['delimiter'])
                seq.description = ''
                SeqIO.write(seq, pass_handle, out_type)
            else:
                fail_count += 1
                if fail_handle is not None:
                    # Write sequences whose header could not be converted
                    SeqIO.write(seq, fail_handle, out_type)

        # Print counts
        printProgress(seq_count, result_count, 0.05, start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['SEQUENCES'] = seq_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'ConvertHeaders'
        printLog(log)
    finally:
        # Close file handles
        pass_handle.close()
        if fail_handle is not None:
            fail_handle.close()

    return pass_handle.name
Esempio n. 20
0
def pairSeq(seq_file_1,
            seq_file_2,
            fields_1=None,
            fields_2=None,
            action=None,
            coord_type=default_coord,
            out_args=default_out_args):
    """
    Synchronizes paired end files and copies annotations between them

    File 1 is indexed by coordinate key and file 2 is streamed; a file 2
    record is paired when its coordinate key is present in the file 1 index.
    Paired records are written to the 'pair-pass' files. If out_args['failed']
    is set, unpaired records from both files are written to 'pair-fail' files.

    Arguments:
      seq_file_1 : the file containing the grouped sequences and annotations.
      seq_file_2 : the file to assign annotations to from seq_file_1.
      fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records;
                 if None do not copy any annotations.
      fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records;
                 if None do not copy any annotations.
      action : the collapse action to take on all copied annotation if they already exist in the
               target header.
      coord_type : the sequence header format.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2).
    """

    # Define private functions
    def _key_func(x):
        return getCoordKey(x,
                           coord_type=coord_type,
                           delimiter=out_args['delimiter'])

    log = OrderedDict()
    log['START'] = 'PairSeq'
    log['FILE1'] = os.path.basename(seq_file_1)
    log['FILE2'] = os.path.basename(seq_file_2)
    log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None
    log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None
    log['COORD_TYPE'] = coord_type
    printLog(log)

    # Define output type
    if out_args['out_type'] is None:
        out_type_1 = getFileType(seq_file_1)
        out_type_2 = getFileType(seq_file_2)
    else:
        out_type_1 = out_type_2 = out_args['out_type']

    # Define output name
    if out_args['out_name'] is None:
        out_name_1 = out_name_2 = None
    else:
        out_name_1 = '%s-1' % out_args['out_name']
        out_name_2 = '%s-2' % out_args['out_name']

    # Open and count files
    start_time = time()
    printMessage("Indexing files", start_time=start_time)
    # Index file 1
    seq_count_1 = countSeqFile(seq_file_1)
    seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func)
    # Define file 2 iterator
    seq_count_2 = countSeqFile(seq_file_2)
    seq_iter_2 = readSeqFile(seq_file_2, index=False)
    printMessage("Done", start_time=start_time, end=True)

    write_failed = out_args['failed']
    pass_handle_1 = pass_handle_2 = None
    fail_handle_1 = fail_handle_2 = None

    # Every handle opened below is closed in the finally clause. This covers
    # the fail handles as well, so the pair-fail files are always flushed.
    try:
        # Open output file handles
        pass_handle_1 = getOutputHandle(seq_file_1,
                                        'pair-pass',
                                        out_args['out_dir'],
                                        out_name=out_name_1,
                                        out_type=out_type_1)
        pass_handle_2 = getOutputHandle(seq_file_2,
                                        'pair-pass',
                                        out_args['out_dir'],
                                        out_name=out_name_2,
                                        out_type=out_type_2)

        if write_failed:
            fail_handle_1 = getOutputHandle(seq_file_1,
                                            'pair-fail',
                                            out_dir=out_args['out_dir'],
                                            out_name=out_name_1,
                                            out_type=out_type_1)
            fail_handle_2 = getOutputHandle(seq_file_2,
                                            'pair-fail',
                                            out_dir=out_args['out_dir'],
                                            out_name=out_name_2,
                                            out_type=out_type_2)
            # Keys of file 1 records that found a mate in file 2
            pass_keys = set()

        # Iterate over pairs and write to output files
        start_time = time()
        rec_count = pair_count = 0
        for seq_2 in seq_iter_2:
            # Print progress for previous iteration
            printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)
            rec_count += 1

            # Check for file 2 mate pair in file 1
            coord_2 = getCoordKey(seq_2.id,
                                  coord_type=coord_type,
                                  delimiter=out_args['delimiter'])
            seq_1 = seq_dict_1.get(coord_2, None)

            if seq_1 is not None:
                # Record paired keys
                pair_count += 1

                if fields_1 is not None or fields_2 is not None:
                    # Both headers are parsed before either is rewritten, so
                    # each copy direction sees the other's original annotations
                    ann_1 = parseAnnotation(seq_1.description,
                                            delimiter=out_args['delimiter'])
                    ann_2 = parseAnnotation(seq_2.description,
                                            delimiter=out_args['delimiter'])

                    # Prepend annotations from seq_1 to seq_2
                    if fields_1 is not None:
                        copy_ann = OrderedDict([(k, v) for k, v in ann_1.items()
                                                if k in fields_1])
                        merge_ann = mergeAnnotation(
                            ann_2,
                            copy_ann,
                            prepend=True,
                            delimiter=out_args['delimiter'])
                        # Collapse if necessary
                        if action is not None:
                            merge_ann = collapseAnnotation(
                                merge_ann,
                                action,
                                fields=fields_1,
                                delimiter=out_args['delimiter'])
                        # Flatten
                        seq_2.id = flattenAnnotation(
                            merge_ann, delimiter=out_args['delimiter'])
                        seq_2.description = ''

                    # Append annotations from seq_2 to seq_1
                    if fields_2 is not None:
                        copy_ann = OrderedDict([(k, v) for k, v in ann_2.items()
                                                if k in fields_2])
                        merge_ann = mergeAnnotation(
                            ann_1,
                            copy_ann,
                            prepend=False,
                            delimiter=out_args['delimiter'])
                        # Collapse if necessary
                        if action is not None:
                            merge_ann = collapseAnnotation(
                                merge_ann,
                                action,
                                fields=fields_2,
                                delimiter=out_args['delimiter'])
                        # Flatten
                        seq_1.id = flattenAnnotation(
                            merge_ann, delimiter=out_args['delimiter'])
                        seq_1.description = ''

                # Write paired records
                SeqIO.write(seq_1, pass_handle_1, out_type_1)
                SeqIO.write(seq_2, pass_handle_2, out_type_2)

            # Write unpaired file 2 records and update the paired key set used
            # later to find unpaired file 1 records
            if write_failed:
                if seq_1 is not None:
                    pass_keys.add(coord_2)
                else:
                    SeqIO.write(seq_2, fail_handle_2, out_type_2)

        # Print final progress
        printProgress(rec_count, seq_count_2, 0.05, start_time=start_time)

        # Find and write unpaired file 1 records
        if write_failed:
            start_time = time()
            printMessage("Finding unpaired", start_time=start_time)

            # Find file 1 unpaired keys
            unpaired = set(seq_dict_1).difference(pass_keys)
            # Write unpaired file 1 records
            for k in unpaired:
                SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1)

            printMessage("Done", start_time=start_time, end=True)

        # Print log
        log = OrderedDict()
        log['OUTPUT1'] = os.path.basename(pass_handle_1.name)
        log['OUTPUT2'] = os.path.basename(pass_handle_2.name)
        log['SEQUENCES1'] = seq_count_1
        log['SEQUENCES2'] = seq_count_2
        log['PASS'] = pair_count
        log['END'] = 'PairSeq'
        printLog(log)
    finally:
        # Close file handles
        for handle in (pass_handle_1, pass_handle_2, fail_handle_1, fail_handle_2):
            if handle is not None:
                handle.close()

    return [(pass_handle_1.name, pass_handle_2.name)]