def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to add
    values = a list of values to assign to all rows of each field
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'add'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Define fields and values to append
    add_dict = {k: v for k, v in zip(fields, values) if k not in db_iter.fieldnames}

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write updated row
        rec.update(add_dict)
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
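A minimal usage sketch (the input file and the column names are hypothetical; addDbFile and default_out_args are defined in this module):

# Append constant SAMPLE and TISSUE columns to every row of db.tab
out_file = addDbFile('db.tab', fields=['SAMPLE', 'TISSUE'],
                     values=['S1', 'PBMC'])
print(out_file)  # e.g. 'db_parse-add.tab'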
Example #2
def indexDbFile(db_file, field=default_index_field, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
    db_file = the database file name
    field = the name of the index field to add
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'index'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file,
                                  out_label='parse-index',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=field)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Add count and write updated row
        rec.update({field: rec_count})
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
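A usage sketch on the same assumptions (hypothetical file name; the index column name is given explicitly instead of relying on default_index_field):

# Number the rows 1..N in a new ROW_ID column
out_file = indexDbFile('db.tab', field='ROW_ID')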
Example #3
def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to drop
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file,
                                  out_label='parse-drop',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write row
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
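A usage sketch (hypothetical file and field names; the dropped columns are simply excluded from the output writer):

# Remove bulky sequence columns before sharing the table
out_file = dropDbFile('db.tab', fields=['SEQUENCE_INPUT', 'SEQUENCE_IMGT'])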
def collectQueue(alive, result_queue, collect_queue, db_file, out_args, cluster_func=None, cluster_args={}):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing continues;
            if False the process exits
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func
    
    Returns: 
    None
    (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
        # Count records and define output format 
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
        result_count = countDbFile(db_file)
        
        # Define successful output handle
        pass_handle = getOutputHandle(db_file, 
                                      out_label='clone-pass', 
                                      out_dir=out_args['out_dir'], 
                                      out_name=out_args['out_name'], 
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')
        
        # Define failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail', 
                                          out_dir=out_args['out_dir'], 
                                          out_name=out_args['out_name'], 
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:  
            log_handle = None
        else:  
            log_handle = open(out_args['log_file'], 'w')
    except:
        #sys.stderr.write('Exception in collector file opening step\n')
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        #print 'START COLLECT', alive.value
        # Iterate over the results queue until the sentinel object is reached
        start_time = time()
        rec_count = clone_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break
            #print "COLLECT", alive.value, result['id']
            
            # Print progress for previous iteration and update record count
            if rec_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(rec_count, result_count, 0.05, start_time) 
            rec_count += len(result.data)
            
            # Write passed and failed records
            if result:
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone):
                        rec.annotations['CLONE'] = clone_count
                        pass_writer.writerow(rec.toDict())
                        pass_count += 1
                        result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
    
            else:
                for i, rec in enumerate(result.data):
                    if fail_writer is not None: fail_writer.writerow(rec.toDict())
                    fail_count += 1
                    result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)
                    
            # Write log
            printLog(result.log, handle=log_handle)
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
        
        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:  fail_handle.close()
        if log_handle is not None:  log_handle.close()
                
        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        #sys.stderr.write('Exception in collector result processing step\n')
        alive.value = False
        raise

    return None
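The control flow above hinges on Python's while/else: a None sentinel breaks out of the loop normally, and the else branch runs only when alive.value went False because a sibling process failed. A stripped-down, standard-library-only sketch of the same pattern:

import multiprocessing as mp

def collect(alive, queue):
    while alive.value:
        if queue.empty():
            continue
        item = queue.get()
        # The None sentinel means normal termination; break skips the else
        if item is None:
            break
        print('collected', item)
    else:
        # Reached only if alive.value became False before the sentinel
        print('sibling error detected, cleaning up')
        return None

if __name__ == '__main__':
    alive = mp.Value('b', True)  # shared boolean flag
    queue = mp.Queue()
    for item in (1, 2, None):    # two results, then the sentinel
        queue.put(item)
    collect(alive, queue)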
Example #7
def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates the values of a field in a database file

    Arguments:
    db_file = the database file name
    field = the field to update
    values = a list of values specifying which rows to update
    updates = a list of replacement values corresponding to values
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file,
                                  out_label='parse-update',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update the field value if a match is found
        for x, y in zip(values, updates):
            if rec[field] == x:
                rec[field] = y
                pass_count += 1

        # Write records
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['UPDATED'] = pass_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
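A usage sketch (hypothetical file, field, and values):

# Rows where SAMPLE equals 'S1' become 'donor1'; 'S2' becomes 'donor2'
out_file = updateDbFile('db.tab', field='SAMPLE',
                        values=['S1', 'S2'], updates=['donor1', 'donor2'])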
Example #8
def sortDbFile(db_file,
               field,
               numeric=False,
               descend=False,
               out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    Arguments:
    db_file = the database filename
    field = the field name to sort by
    numeric = if True sort field numerically;
              if False sort field alphabetically
    descend = if True sort in descending order;
              if False sort in ascending order

    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file,
                                  out_label='parse-sort',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)

    # Store all records in a dictionary
    start_time = time()
    printMessage("Indexing: Running", start_time=start_time)
    db_dict = {i: r for i, r in enumerate(db_iter)}
    result_count = len(db_dict)

    # Sort db_dict by field values
    tag_dict = {k: v[field] for k, v in db_dict.items()}
    if numeric: tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get, reverse=descend)
    printMessage("Indexing: Done", start_time=start_time, end=True)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Write records
        pass_writer.writerow(db_dict[key])

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
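The sort itself is the standard sort-keys-by-mapped-value idiom, with float(v or 0) coercing empty fields to zero in numeric mode. A self-contained illustration of how the two modes order the same values:

tag_dict = {0: '3', 1: '', 2: '10'}
print(sorted(tag_dict, key=tag_dict.get))   # [1, 2, 0]  alphabetic: '' < '10' < '3'
numeric = {k: float(v or 0) for k, v in tag_dict.items()}
print(sorted(numeric, key=numeric.get))     # [1, 0, 2]  numeric: 0.0 < 3.0 < 10.0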
Example #9
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags

    Arguments:
    db_file = filename of the tab-delimited database file to split
    field = the field name by which to split db_file
    num_split = the numerical threshold by which to group sequences;
                if None treat field as textual
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open IgRecord reader iter object
    reader = readDbFile(db_file, ig=False)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags
        tmp_iter = readDbFile(db_file, ig=False)
        tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden characters in filename and replacements
        noGood = {
            '/': 'f',
            '\\': 'b',
            '?': 'q',
            '%': 'p',
            '*': 's',
            ':': 'c',
            '|': 'pi',
            '"': 'dq',
            "'": 'sq',
            '<': 'lt',
            '>': 'gt',
            ' ': '_'
        }
        # Replace forbidden characters in tag_list
        tag_dict = {}
        for tag in tag_list:
            clean = tag
            for c, r in noGood.items():
                clean = clean.replace(c, r)
            tag_dict[tag] = clean

        # Create output handles
        handles_dict = {
            tag: getOutputHandle(db_file,
                                 '%s-%s' % (field, label),
                                 out_type=out_args['out_type'],
                                 out_name=out_args['out_name'],
                                 out_dir=out_args['out_dir'])
            for tag, label in tag_dict.items()
        }

        # Create Db writer instances
        writers_dict = {
            tag: getDbWriter(handles_dict[tag], db_file)
            for tag in tag_dict
        }

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writerow(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles
        handles_dict = {
            'under':
            getOutputHandle(db_file,
                            'under-%.1f' % num_split,
                            out_type=out_args['out_type'],
                            out_name=out_args['out_name'],
                            out_dir=out_args['out_dir']),
            'atleast':
            getOutputHandle(db_file,
                            'atleast-%.1f' % num_split,
                            out_type=out_args['out_type'],
                            out_name=out_args['out_name'],
                            out_dir=out_args['out_dir'])
        }

        # Create Db writer instances
        writers_dict = {
            'under': getDbWriter(handles_dict['under'], db_file),
            'atleast': getDbWriter(handles_dict['atleast'], db_file)
        }

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    for t in handles_dict:
        handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
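Two usage sketches (hypothetical file and field names):

# One output file per unique SAMPLE value
files_by_tag = splitDbFile('db.tab', field='SAMPLE')
# Two output files split at a numeric threshold on JUNCTION_LENGTH
files_by_length = splitDbFile('db.tab', field='JUNCTION_LENGTH', num_split=48)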
Example #10
def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args,
                      cluster_func, cluster_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing continues;
            if False the process exits
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func
    
    Returns: 
    None
    (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:

        # Iterate over Ig records to count and order by junction length
        result_count = 0
        records = {}
        # print 'Reading file...'
        db_iter = readDbFile(db_file)
        for rec in db_iter:
            records[rec.id] = rec
            result_count += 1
        records = OrderedDict(
            sorted(list(records.items()), key=lambda i: i[1].junction_length))

        # Define empty matrix to store assembled results
        dist_mat = np.zeros((result_count, result_count))

        # Count records and define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Define successful output handle
        pass_handle = getOutputHandle(db_file,
                                      out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Define failed cloning output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Open log file
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Iterate over the results queue until the sentinel object is reached
        start_time = time()
        row_count = rec_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty(): continue
            else: result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break

            # Print progress for previous iteration
            if row_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(row_count, result_count, 0.05, start_time)

            # Update counts for iteration
            row_count += 1
            rec_count += len(result)

            # Add result row to distance matrix
            if result:
                dist_mat[list(range(result_count - len(result), result_count)),
                         result_count - len(result)] = result.results

        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Calculate linkage and carry out clustering
        # print dist_mat
        clusters = (cluster_func(dist_mat, **cluster_args)
                    if dist_mat is not None else None)
        clones = {}
        # print clusters
        for rec_id, c in zip(records, clusters):
            clones.setdefault(c, []).append(records[rec_id])

        # Write passed and failed records
        clone_count = pass_count = fail_count = 0
        if clones:
            for clone in clones.values():
                clone_count += 1
                for i, rec in enumerate(clone):
                    rec.annotations['CLONE'] = clone_count
                    pass_writer.writerow(rec.toDict())
                    pass_count += 1
                    #result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)

        else:
            # No clusters were formed; result is the None sentinel at this
            # point, so route the stored input records to the fail file
            for rec in records.values():
                if fail_writer is not None: fail_writer.writerow(rec.toDict())
                fail_count += 1

        # Print final progress
        printProgress(row_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None: fail_handle.close()
        if log_handle is not None: log_handle.close()

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        alive.value = False
        raise

    return None
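Each worker result carries one column segment of the pairwise distance matrix, spanning from the diagonal down, and the assignment inside the result loop writes it in place with NumPy fancy indexing. A toy version of that fill with concrete numbers:

import numpy as np

n = 4                                       # result_count
dist_mat = np.zeros((n, n))
segment = np.array([0.0, 0.2, 0.7])         # distances for rows 1..3
rows = list(range(n - len(segment), n))     # [1, 2, 3]
dist_mat[rows, n - len(segment)] = segment  # fills column 1
print(dist_mat)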
Example #11
def collectDbQueue(alive,
                   result_queue,
                   collect_queue,
                   db_file,
                   task_label,
                   out_args,
                   add_fields=None):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      result_queue : multiprocessing.Queue holding worker results
      collect_queue : multiprocessing.Queue to store collector return values
      db_file : Database file name
      task_label : Task label used to tag the output files
      out_args : Common output argument dictionary from parseCommonArgs
      add_fields : List of fields added to the writer not present in the in_file;
                 if None do not add fields

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
            'log' defining a log object,
            'out_files' defining the output file names
    """
    try:
        result_count = countDbFile(db_file)

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Define valid alignment output handle
        pass_handle = getOutputHandle(db_file,
                                      '%s-pass' % task_label,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)
        # Define failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          '%s-fail' % task_label,
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Iterate over the results queue until the sentinel object is reached
        start_time = time()
        set_count = rec_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty(): continue
            else: result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break

            # Print progress for previous iteration
            printProgress(pass_count, result_count, 0.05, start_time)

            # Update counts for current iteration
            set_count += 1
            rec_count += result.data_count

            # Write log
            if result.log is not None:
                printLog(result.log, handle=log_handle)

            # Write alignments
            if result:
                pass_count += result.data_count
                if isinstance(result.results, IgRecord):
                    pass_writer.writerow(result.results.toDict())
                else:
                    for rec in result.results:
                        pass_writer.writerow(rec.toDict())
            else:
                fail_count += result.data_count
                if fail_handle is not None:
                    if isinstance(result.data, IgRecord):
                        fail_writer.writerow(result.data.toDict())
                    else:
                        for rec in result.data:
                            fail_writer.writerow(rec.toDict())
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(pass_count, result_count, 0.05, start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['GROUPS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None: fail_handle.close()
        if log_handle is not None: log_handle.close()
    except:
        alive.value = False
        raise

    return None
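The if result: branches above rely on the worker result object defining its own truthiness; a falsy result routes records to the fail file. A hypothetical mock of that convention (the real class lives in the pRESTO/Change-O support code):

class MockResult:
    """Hypothetical stand-in for the worker result object."""
    def __init__(self, data, results=None, log=None):
        self.data = data            # input records
        self.results = results      # processed records, or None on failure
        self.log = log
        self.data_count = len(data)

    def __bool__(self):
        return self.results is not None

print(bool(MockResult(data=[1, 2], results=[1, 2])))  # True  -> pass file
print(bool(MockResult(data=[1, 2])))                  # False -> fail file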
Example #12
def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to add
    values = a list of values to assign to all rows of each field
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'add'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file,
                                  out_label='parse-add',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Define fields and values to append
    add_dict = {
        k: v
        for k, v in zip(fields, values) if k not in db_iter.fieldnames
    }

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write updated row
        rec.update(add_dict)
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
Example #13
def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
            score_fields=False, region_fields=False, out_args=default_out_args):
    """
    Writes tab-delimited database file in output directory
    
    Arguments:
    db_gen = a generator of IgRecord objects containing alignment data
    file_prefix = directory and prefix for CLIP tab-delim file
    total_count = number of records (for progress bar)
    id_dict = a dictionary of {IMGT ID: full seq description}
    no_parse = if True, do not parse the sequence ID for pRESTO annotations
    score_fields = if True add alignment score fields to output file
    region_fields = if True add FWR and CDR region fields to output file
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    None
    """
    pass_file = "%s_db-pass.tab" % file_prefix
    fail_file = "%s_db-fail.tab" % file_prefix
    ordered_fields = ['SEQUENCE_ID',
                      'SEQUENCE_INPUT',
                      'FUNCTIONAL',
                      'IN_FRAME',
                      'STOP',
                      'MUTATED_INVARIANT',
                      'INDELS',
                      'V_CALL',
                      'D_CALL',
                      'J_CALL',
                      'SEQUENCE_VDJ',
                      'SEQUENCE_IMGT',
                      'V_SEQ_START',
                      'V_SEQ_LENGTH',
                      'V_GERM_START_VDJ',
                      'V_GERM_LENGTH_VDJ',
                      'V_GERM_START_IMGT',
                      'V_GERM_LENGTH_IMGT',
                      'N1_LENGTH',
                      'D_SEQ_START',
                      'D_SEQ_LENGTH',
                      'D_GERM_START',
                      'D_GERM_LENGTH',
                      'N2_LENGTH',
                      'J_SEQ_START',
                      'J_SEQ_LENGTH',
                      'J_GERM_START',
                      'J_GERM_LENGTH',
                      'JUNCTION_LENGTH',
                      'JUNCTION']

    if score_fields:
        ordered_fields.extend(['V_SCORE',
                               'V_IDENTITY',
                               'V_EVALUE',
                               'V_BTOP',
                               'J_SCORE',
                               'J_IDENTITY',
                               'J_EVALUE',
                               'J_BTOP'])

    if region_fields:
        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])


    # TODO: This is not the best approach; should pass in output fields.
    # Initiate passed handle
    pass_handle = None

    # Open failed file
    if out_args['failed']:
        fail_handle = open(fail_file, 'wt')
        fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
    else:
        fail_handle = None
        fail_writer = None

    # Initialize counters and file
    pass_writer = None
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for record in db_gen:
        #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1

        # Count pass or fail
        if (record.v_call == 'None' and record.j_call == 'None') or \
                record.functional is None or \
                not record.seq_vdj or \
                not record.junction:
            # print(record.v_call, record.j_call, record.functional, record.junction)
            fail_count += 1
            if fail_writer is not None: fail_writer.writerow(record.toDict())
            continue
        else: 
            pass_count += 1
            
        # Build sample sequence description
        if record.id in id_dict:
            record.id = id_dict[record.id]

        # Parse sequence description into new columns
        if not no_parse:
            record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
            record.id = record.annotations['ID']
            del record.annotations['ID']

        # TODO: This is not the best approach; should pass in output fields.
        # If first sequence, use parsed description to create new columns and initialize writer
        if pass_writer is None:
            if not no_parse:  ordered_fields.extend(list(record.annotations.keys()))
            pass_handle = open(pass_file, 'wt')
            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)

        # Write row to tab-delim CLIP file
        pass_writer.writerow(record.toDict())
    
    # Print log
    #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
    printProgress(rec_count, total_count, 0.05, start_time)

    log = OrderedDict()
    log['OUTPUT'] = pass_file
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'MakeDb'
    printLog(log)
    
    if pass_handle is not None: pass_handle.close()
    if fail_handle is not None: fail_handle.close()
Example #14
def assembleCloneGermline(db_file,
                          repo,
                          seq_field=default_seq_field,
                          v_field=default_v_field,
                          germ_types=default_germ_types,
                          out_args=default_out_args):
    """
    Assemble one germline sequence for each clone in a tab-delimited database file

    Arguments:
    db_file = input tab-delimited database file
    repo = folder with germline repertoire files
    germ_types = types of germline sequences to be output
                 (full germline, D-region masked, only V-region germline)
    v_field = field in which to look for V call
    seq_field = field in which to look for sequence
    out_args = arguments for output preferences

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'True'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)

    # Get repertoire and open Db reader
    references = readRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
    if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS']

    add_fields += ['GERMLINE_V_CALL']
    add_fields += ['GERMLINE_D_CALL']
    add_fields += ['GERMLINE_J_CALL']

    # Create output file handle and Db writer
    writers = {}
    pass_handle = getOutputHandle(db_file,
                                  'germ-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file,
                                      'germ-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
        writers['fail'] = getDbWriter(fail_handle,
                                      db_file,
                                      add_fields=add_fields)
    else:
        fail_handle = None
        writers['fail'] = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    counts = {}
    clone_count = counts['pass'] = counts['fail'] = 0
    # Iterate over rows
    clone = 'initial'
    clone_dict = OrderedDict()
    for i, row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)

        # Clone isn't over yet
        if row.get('CLONE', '') == clone:
            clone_dict[i] = row
        # Clone just finished
        elif clone_dict:
            clone_count += 1
            result_log = makeCloneGermline(clone, clone_dict, references,
                                           germ_types, v_field, seq_field,
                                           counts, writers, out_args)
            printLog(result_log, handle=log_handle)
            # Now deal with current row (first of next clone)
            clone = row['CLONE']
            clone_dict = OrderedDict([(i, row)])
        # Last case is only for first row of file
        else:
            clone = row['CLONE']
            clone_dict = OrderedDict([(i, row)])

    clone_count += 1
    result_log = makeCloneGermline(clone, clone_dict, references, germ_types,
                                   v_field, seq_field, counts, writers,
                                   out_args)
    printLog(result_log, handle=log_handle)

    # Print log
    printProgress(i + 1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['CLONES'] = clone_count
    log['RECORDS'] = rec_count
    log['PASS'] = counts['pass']
    log['FAIL'] = counts['fail']
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None: fail_handle.close()
    if log_handle is not None: log_handle.close()
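A hedged usage sketch (hypothetical paths; the input needs a CLONE column, for example from DefineClones, and repo must point at a directory of germline repertoire FASTA files):

assembleCloneGermline('db_clone-pass.tab', repo='germlines/',
                      germ_types=['dmask'])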
Example #15
def assembleEachGermline(db_file,
                         repo,
                         germ_types,
                         v_field,
                         seq_field,
                         out_args=default_out_args):
    """
    Write germline sequences to tab-delimited database file

    Arguments:
    db_file = input tab-delimited database file
    repo = folder with germline repertoire files
    germ_types = types of germline sequences to be output
                     (full germline, D-region masked, only V-region germline)
    v_field = field in which to look for V call
    seq_field = field in which to look for sequence
    out_args = arguments for output preferences

    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'False'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)

    # Get repertoire and open Db reader
    references = readRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
    if 'regions' in germ_types: add_fields += ['GERMLINE_REGIONS']

    # Create output file handle and Db writer
    pass_handle = getOutputHandle(db_file,
                                  'germ-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file,
                                      'germ-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
        fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        fail_writer = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    pass_count = fail_count = 0
    # Iterate over rows
    for i, row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)

        result_log, germlines = joinGermline(row,
                                             references,
                                             seq_field=seq_field,
                                             v_field=v_field,
                                             germ_types=germ_types)

        # Add germline field(s) to dictionary
        if 'full' in germ_types:
            row['GERMLINE_' + seq_type] = germlines['full']
        if 'dmask' in germ_types:
            row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
        if 'vonly' in germ_types:
            row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
        if 'regions' in germ_types:
            row['GERMLINE_REGIONS'] = germlines['regions']

        # Write row to pass or fail file
        if 'ERROR' in result_log:
            fail_count += 1
            if fail_writer is not None: fail_writer.writerow(row)
        else:
            result_log['SEQUENCE'] = row[seq_field]
            result_log['GERMLINE'] = germlines['full']
            result_log['REGIONS'] = germlines['regions']

            pass_count += 1
            pass_writer.writerow(row)
        printLog(result_log, handle=log_handle)

    # Print log
    printProgress(i + 1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None: fail_handle.close()
    if log_handle is not None: log_handle.close()
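The per-record variant skips clonal grouping entirely; a usage sketch on the same assumptions (hypothetical paths, standard Change-O column names):

assembleEachGermline('db.tab', repo='germlines/',
                     germ_types=['full', 'dmask'],
                     v_field='V_CALL', seq_field='SEQUENCE_IMGT')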
def collectDbQueue(alive, result_queue, collect_queue, db_file, task_label, out_args,
                   add_fields=None):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      result_queue : multiprocessing.Queue holding worker results
      collect_queue : multiprocessing.Queue to store collector return values
      db_file : Database file name
      task_label : Task label used to tag the output files
      out_args : Common output argument dictionary from parseCommonArgs
      add_fields : List of fields added to the writer not present in the in_file;
                 if None do not add fields

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
            'log' defining a log object,
            'out_files' defining the output file names
    """
    try:
        result_count = countDbFile(db_file)

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Define valid alignment output handle
        pass_handle = getOutputHandle(db_file,
                                      '%s-pass' % task_label,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)
        # Define failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          '%s-fail'  % task_label,
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise

    try:
        # Iterate over the results queue until the sentinel object is reached
        start_time = time()
        set_count = rec_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break

            # Print progress for previous iteration
            printProgress(pass_count, result_count, 0.05, start_time)

            # Update counts for current iteration
            set_count += 1
            rec_count += result.data_count

            # Write log
            printLog(result.log, handle=log_handle)

            # Write alignments
            if result:
                pass_count += result.data_count
                for rec in result.results:
                    pass_writer.writerow(rec.toDict())
            else:
                fail_count += result.data_count
                if fail_handle is not None:
                    for rec in result.data:
                        fail_writer.writerow(rec.toDict())
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(pass_count, result_count, 0.05, start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['GROUPS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:  fail_handle.close()
        if log_handle is not None:  log_handle.close()
    except:
        alive.value = False
        raise

    return None
Example #17
def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False,
            out_args=default_out_args):
    """
    Writes tab-delimited database file in output directory.
    
    Arguments:
      db : an iterator of IgRecord objects containing alignment data.
      fields : a list of ordered field names to write.
      file_prefix : directory and prefix for CLIP tab-delim file.
      total_count : number of records (for progress bar).
      id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
      no_parse : if True, do not parse the sequence ID for pRESTO annotations.
      partial : if True put incomplete alignments in the pass file.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      None
    """
    # Function to check for valid records strictly
    def _pass_strict(rec):
        valid = [rec.v_call and rec.v_call != 'None',
                 rec.j_call and rec.j_call != 'None',
                 rec.functional is not None,
                 rec.seq_vdj,
                 rec.junction]
        return all(valid)

    # Function to check for valid records loosely
    def _pass_gentle(rec):
        valid = [rec.v_call and rec.v_call != 'None',
                 rec.d_call and rec.d_call != 'None',
                 rec.j_call and rec.j_call != 'None']
        return any(valid)

    # Set pass criteria
    _pass = _pass_gentle if partial else _pass_strict

    # Define output file names
    pass_file = '%s_db-pass.tab' % file_prefix
    fail_file = '%s_db-fail.tab' % file_prefix

    # Initiate handles, writers and counters
    pass_handle = None
    fail_handle = None
    pass_writer = None
    fail_writer = None
    start_time = time()
    rec_count = pass_count = fail_count = 0

    # Validate and write output
    printProgress(0, total_count, 0.05, start_time)
    for i, record in enumerate(db, start=1):

        # Replace sequence description with full string, if required
        if id_dict is not None and record.id in id_dict:
            record.id = id_dict[record.id]

        # Parse sequence description into new columns
        if not no_parse:
            try:
                record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
                record.id = record.annotations['ID']
                del record.annotations['ID']

                # TODO: This is not the best approach; should pass in output fields.
                # If first record, use parsed description to define extra columns
                if i == 1:  fields.extend(list(record.annotations.keys()))
            except IndexError:
                # Could not parse pRESTO-style annotations so fall back to no parse
                no_parse = True
                sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n')

        # Count pass or fail and write to appropriate file
        if _pass(record):
            # Open pass file
            if pass_writer is None:
                pass_handle = open(pass_file, 'wt')
                pass_writer = getDbWriter(pass_handle, add_fields=fields)

            # Write row to pass file
            pass_count += 1
            pass_writer.writerow(record.toDict())
        else:
            # Open failed file
            if out_args['failed'] and fail_writer is None:
                fail_handle = open(fail_file, 'wt')
                fail_writer = getDbWriter(fail_handle, add_fields=fields)

            # Write row to fail file if specified
            fail_count += 1
            if fail_writer is not None:
                fail_writer.writerow(record.toDict())

        # Print progress
        printProgress(i, total_count, 0.05, start_time)

    # Print console log
    log = OrderedDict()
    log['OUTPUT'] = pass_file
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'MakeDb'
    printLog(log)
    
    if pass_handle is not None: pass_handle.close()
    if fail_handle is not None: fail_handle.close()
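
# Usage sketch for writeDb (illustrative only): 'sample_db.tab' and the field list
# are hypothetical, while readDbFile and countDbFile are the toolkit helpers used
# throughout this file.
db = readDbFile('sample_db.tab')
writeDb(db,
        fields=['SEQUENCE_ID', 'V_CALL', 'J_CALL', 'JUNCTION'],
        file_prefix='sample',
        total_count=countDbFile('sample_db.tab'),
        no_parse=True,
        partial=False)
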
def collectQueueClust(alive, result_queue, collect_queue, db_file, out_args, cluster_func, cluster_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func
    
    Returns: 
    None
    (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
        # Iterate over Ig records to count and order by junction length
        result_count = 0
        records = {}
        db_iter = readDbFile(db_file)
        for rec in db_iter:
            records[rec.id] = rec
            result_count += 1
        records = OrderedDict(sorted(records.items(), key=lambda i: i[1].junction_length))
                
        # Define empty matrix to store assembled results
        dist_mat = np.zeros((result_count,result_count))
        
        # Count records and define output format 
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
                   
        # Define successful output handle
        pass_handle = getOutputHandle(db_file, 
                                      out_label='clone-pass', 
                                      out_dir=out_args['out_dir'], 
                                      out_name=out_args['out_name'], 
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')
        
        # Define failed cloning output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail', 
                                          out_dir=out_args['out_dir'], 
                                          out_name=out_args['out_name'], 
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Open log file
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        alive.value = False
        raise
    
    try:
        # Iterate over the results queue until the sentinel object is reached
        start_time = time()
        row_count = rec_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break

            # Print progress for previous iteration
            if row_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(row_count, result_count, 0.05, start_time)
            
            # Update counts for iteration
            row_count += 1
            rec_count += len(result)
            
            # Add this row's distances to one column of the matrix
            if result:
                n = len(result)
                dist_mat[list(range(result_count - n, result_count)), result_count - n] = result.results
                
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None    
        
        # Calculate linkage and carry out clustering
        clusters = cluster_func(dist_mat, **cluster_args) if dist_mat is not None else None
        clones = {}
        if clusters is not None:
            for c, rec in zip(clusters, records.values()):
                clones.setdefault(c, []).append(rec)
        
        # Write passed and failed records
        clone_count = pass_count = fail_count = 0
        if clones:
            for clone in clones.values():
                clone_count += 1
                for i, rec in enumerate(clone):
                    rec.annotations['CLONE'] = clone_count
                    pass_writer.writerow(rec.toDict())
                    pass_count += 1

        else:
            # No clusters were assigned; write all records to the fail file
            for rec in records.values():
                fail_count += 1
                if fail_writer is not None:
                    fail_writer.writerow(rec.toDict())
        
        # Print final progress
        printProgress(row_count, result_count, 0.05, start_time)
    
        # Close file handles
        pass_handle.close()
        if fail_handle is not None:  fail_handle.close()
        if log_handle is not None:  log_handle.close()
                
        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        alive.value = False
        raise
    
    return None
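
# A minimal sketch of a cluster_func compatible with collectQueueClust above. It
# assumes SciPy hierarchical clustering, which the real pipeline may not use:
# the dendrogram is cut at a fixed distance and one cluster label is returned
# per record, matching what the collector expects from cluster_func(dist_mat).
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

def hierClust(dist_mat, method='complete', distance=0.32):
    # Symmetrize the lower-triangular matrix filled in by the collector
    dist_mat = np.maximum(dist_mat, dist_mat.T)
    # linkage() expects a condensed distance vector, not a square matrix
    links = linkage(squareform(dist_mat, checks=False), method=method)
    # Cut the tree at the given cophenetic distance
    return fcluster(links, distance, criterion='distance')
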
def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
    """
    Write germline sequences to tab-delimited database file
    
    Arguments:
    db_file = input tab-delimited database file
    repo = folder with germline repertoire files
    germ_types = types of germline sequences to be output
                     (full germline, D-region masked, only V-region germline)
    v_field = field in which to look for V call
    seq_field = field in which to look for sequence
    out_args = arguments for output preferences
    
    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'False'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)
    
    # Get repertoire and open Db reader
    repo_dict = getRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')
    
    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Define germline fields to add
    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']

    # Create output file handle and Db writer
    pass_handle = getOutputHandle(db_file, 'germ-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file, 'germ-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
        fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        fail_writer = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    pass_count = fail_count = 0
    # Iterate over rows
    for i,row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)
        
        result_log, germlines = joinGermline(row, repo_dict, germ_types, v_field, seq_field)
        
        # Add germline field(s) to dictionary
        if 'full' in germ_types: row['GERMLINE_' + seq_type] = germlines['full']
        if 'dmask' in germ_types: row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
        if 'vonly' in germ_types: row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']

        # Write row to pass or fail file
        if 'ERROR' in result_log:
            fail_count += 1
            if fail_writer is not None: fail_writer.writerow(row)
        else:
            result_log['SEQUENCE'] = row[seq_field]
            result_log['GERMLINE'] = germlines['full']
            result_log['REGIONS'] = germlines['regions']
            
            pass_count += 1
            pass_writer.writerow(row)
        printLog(result_log, handle=log_handle)
    
    # Print log
    printProgress(i+1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'CreateGermlines'
    printLog(log)
        
    # Close file handles
    pass_handle.close()
    if fail_handle is not None: fail_handle.close()
    if log_handle is not None:  log_handle.close()
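
# Usage sketch for assembleEachGermline (paths and field names are hypothetical):
# builds full and D-masked germlines from the V_CALL and SEQUENCE_IMGT columns.
assembleEachGermline('sample_db.tab',
                     repo='germlines/',
                     germ_types=['full', 'dmask'],
                     v_field='V_CALL',
                     seq_field='SEQUENCE_IMGT')
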
def selectDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to check for selection criteria
    values = a list of values defining selection targets
    logic = one of 'any' or 'all' defining whether one or all fields must have a match.
    regex = if False do exact full string matches; if True allow partial regex matches.
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns):  return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns):  return x in patterns

    # Define logic function
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-select', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for selection values in the specified fields
        select = _logic_func([_match_func(rec.get(f, ''), values) for f in fields])

        # Write sequences
        if select:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
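
# Standalone sketch of the two matching modes selectDbFile builds: exact
# membership versus partial regex search (the values shown are made up).
import re
values = ['IGHV1', 'IGHV3']
print('IGHV1-69*01' in values)                           # exact match: False
print(any(re.search(p, 'IGHV1-69*01') for p in values))  # regex match: True
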
def sortDbFile(db_file, field, numeric=False, descend=False,
               out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    Arguments:
    db_file = the database filename
    field = the field name to sort by
    numeric = if True sort field numerically;
              if False sort field alphabetically
    descend = if True sort in descending order;
              if False sort in ascending order
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Store all records in a dictionary
    start_time = time()
    printMessage("Indexing: Running", start_time=start_time)
    db_dict = {i:r for i, r in enumerate(db_iter)}
    result_count = len(db_dict)

    # Sort db_dict by field values
    tag_dict = {k:v[field] for k, v in db_dict.items()}
    if numeric:  tag_dict = {k:float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get, reverse=descend)
    printMessage("Indexing: Done", start_time=start_time, end=True)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Write records
        pass_writer.writerow(db_dict[key])

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
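
# Usage sketch for sortDbFile (file and field are hypothetical): numeric=True
# converts values with float() before ordering, so '10' sorts after '9'.
sortDbFile('sample_db.tab', field='DUPCOUNT', numeric=True, descend=True)
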
def assembleCloneGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
    """
    Assemble one germline sequence for each clone in a tab-delimited database file
    
    Arguments:
    db_file = input tab-delimited database file
    repo = folder with germline repertoire files
    germ_types = types of germline sequences to be output
                     (full germline, D-region masked, only V-region germline)
    v_field = field in which to look for V call
    seq_field = field in which to look for sequence
    out_args = arguments for output preferences
    
    Returns:
    None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'True'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)
    
    # Get repertoire and open Db reader
    repo_dict = getRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')
    
    # Define log handle
    if out_args['log_file'] is None:  
        log_handle = None
    else:  
        log_handle = open(out_args['log_file'], 'w')

    # Define germline fields to add
    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types: add_fields += ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types: add_fields += ['GERMLINE_' + seq_type + '_V_REGION']

    # Create output file handle and Db writer
    writers = {}
    pass_handle = getOutputHandle(db_file, 'germ-pass', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type=out_args['out_type'])
    writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file, 'germ-fail', out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'], out_type=out_args['out_type'])
        writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        writers['fail'] = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    counts = {}
    clone_count = counts['pass'] = counts['fail'] = 0
    # Iterate over rows
    clone = 'initial'
    clone_dict = OrderedDict()
    for i,row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)
        
        # Clone isn't over yet
        if row.get('CLONE','') == clone: 
            clone_dict[row["SEQUENCE_ID"]] = row
        # Clone just finished
        elif clone_dict:
            clone_count += 1
            result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types,
                                           v_field, seq_field, counts, writers, out_args)
            printLog(result_log, handle=log_handle)
            # Now deal with current row (first of next clone)
            clone = row['CLONE']
            clone_dict = OrderedDict([(row['SEQUENCE_ID'],row)])
        # Last case is only for first row of file
        else:
            clone = row['CLONE']
            clone_dict = OrderedDict([(row['SEQUENCE_ID'],row)])
    # Process the final clone in the file
    if clone_dict:
        clone_count += 1
        result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types, v_field,
                                       seq_field, counts, writers, out_args)
        printLog(result_log, handle=log_handle)
    
    # Print log
    printProgress(i+1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['CLONES'] = clone_count
    log['RECORDS'] = rec_count
    log['PASS'] = counts['pass']
    log['FAIL'] = counts['fail']
    log['END'] = 'CreateGermlines'
    printLog(log)
        
    # Close file handles
    pass_handle.close()
    if fail_handle is not None: fail_handle.close()
    if log_handle is not None:  log_handle.close()
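
# The clone-by-clone accumulation above assumes input sorted by CLONE; the same
# streaming pass can be expressed with itertools.groupby (toy rows for illustration):
from collections import OrderedDict
from itertools import groupby

rows = [{'SEQUENCE_ID': 'a', 'CLONE': '1'},
        {'SEQUENCE_ID': 'b', 'CLONE': '1'},
        {'SEQUENCE_ID': 'c', 'CLONE': '2'}]
for clone, group in groupby(rows, key=lambda r: r['CLONE']):
    clone_dict = OrderedDict((r['SEQUENCE_ID'], r) for r in group)
    print(clone, list(clone_dict))  # 1 ['a', 'b'] then 2 ['c']
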
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags

    Arguments:
    db_file = filename of the tab-delimited database file to split
    field = the field name by which to split db_file
    num_split = the numerical threshold by which to group sequences;
                if None treat field as textual
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open database row reader
    reader = readDbFile(db_file, ig=False)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags
        tmp_iter = readDbFile(db_file, ig=False)
        tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden filename characters and their replacements
        no_good = {'/': 'f', '\\': 'b', '?': 'q', '%': 'p', '*': 's', ':': 'c',
                   '|': 'pi', '"': 'dq', "'": 'sq', '<': 'lt', '>': 'gt', ' ': '_'}
        # Replace forbidden characters in each tag to build a safe file label
        tag_dict = {}
        for tag in tag_list:
            label = tag
            for c, r in no_good.items():
                label = label.replace(c, r)
            tag_dict[tag] = label

        # Create output handles
        handles_dict = {tag:getOutputHandle(db_file,
                                            '%s-%s' % (field, label),
                                            out_type = out_args['out_type'],
                                            out_name = out_args['out_name'],
                                            out_dir = out_args['out_dir'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag:getDbWriter(handles_dict[tag], db_file)
                        for tag in tag_dict}

        # Iterate over database rows
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writerow(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles
        handles_dict = {'under':getOutputHandle(db_file,
                                                'under-%.1f' % num_split,
                                                out_type = out_args['out_type'],
                                                out_name = out_args['out_name'],
                                                out_dir = out_args['out_dir']),
                        'atleast':getOutputHandle(db_file,
                                                  'atleast-%.1f' % num_split,
                                                out_type = out_args['out_type'],
                                                out_name = out_args['out_name'],
                                                out_dir = out_args['out_dir'])}

        # Create Db writer instances
        writers_dict = {'under':getDbWriter(handles_dict['under'], db_file),
                        'atleast':getDbWriter(handles_dict['atleast'], db_file)}

        # Iterate over database rows
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    for t in handles_dict: handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
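
# The tag sanitization in splitDbFile can also be done in one pass with
# str.translate (sketch only; same replacement table as above):
safe = str.maketrans({'/': 'f', '\\': 'b', '?': 'q', '%': 'p', '*': 's', ':': 'c',
                      '|': 'pi', '"': 'dq', "'": 'sq', '<': 'lt', '>': 'gt', ' ': '_'})
print('IGHV1-69*01'.translate(safe))  # -> IGHV1-69s01
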
Example #24
def selectDbFile(db_file,
                 fields,
                 values,
                 logic='any',
                 regex=False,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to check for selection criteria
    values = a list of values defining selection targets
    logic = one of 'any' or 'all' defining whether one or all fields must have a match.
    regex = if False do exact full string matches; if True allow partial regex matches.
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    # Define string match function
    if regex:

        def _match_func(x, patterns):
            return any([re.search(p, x) for p in patterns])
    else:

        def _match_func(x, patterns):
            return x in patterns

    # Define logic function
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file,
                                  out_label='parse-select',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for selection values in the specified fields
        select = _logic_func(
            [_match_func(rec.get(f, ''), values) for f in fields])

        # Write sequences
        if select:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
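
# Usage sketch for selectDbFile (inputs are hypothetical): keep rows whose
# V_CALL or J_CALL contains either pattern.
selectDbFile('sample_db.tab',
             fields=['V_CALL', 'J_CALL'],
             values=['IGHV1-69', 'IGHJ6'],
             logic='any',
             regex=True)
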
def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates the value of a field for matching records in a database file

    Arguments:
    db_file = the database file name
    field = the field to update
    values = a list of values specifying which rows to update
    updates = a list of values to update each value with
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update the value if a match is found
        for x, y in zip(values, updates):
            if rec[field] == x:
                rec[field] = y
                pass_count += 1

        # Write records
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['UPDATED'] = pass_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
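
# Usage sketch for updateDbFile (inputs are hypothetical): rows whose SAMPLE
# field equals 'S1' become 'donor1', and 'S2' becomes 'donor2'.
updateDbFile('sample_db.tab',
             field='SAMPLE',
             values=['S1', 'S2'],
             updates=['donor1', 'donor2'])
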
Example #26
def collectQueue(alive,
                 result_queue,
                 collect_queue,
                 db_file,
                 out_args,
                 cluster_func=None,
                 cluster_args={}):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments: 
    alive = a multiprocessing.Value boolean controlling whether processing continues
            if False exit process
    result_queue = a multiprocessing.Queue holding processQueue results
    collect_queue = a multiprocessing.Queue to store collector return values
    db_file = the input database file name
    out_args = common output argument dictionary from parseCommonArgs
    cluster_func = the function to call for carrying out clustering on distance matrix
    cluster_args = a dictionary of arguments to pass to cluster_func
    
    Returns: 
    None
    (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
        # Count records and define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
        result_count = countDbFile(db_file)

        # Define successful output handle
        pass_handle = getOutputHandle(db_file,
                                      out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Define failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        #sys.stderr.write('Exception in collector file opening step\n')
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        # Iterate over the results queue until the sentinel object is reached
        start_time = time()
        rec_count = clone_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue
            if result_queue.empty(): continue
            else: result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None: break
            #print "COLLECT", alive.value, result['id']

            # Print progress for previous iteration and update record count
            if rec_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += len(result.data)

            # Write passed and failed records
            if result:
                # Write passing sequences
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone, start=1):
                        rec.annotations['CLONE'] = clone_count
                        pass_writer.writerow(rec.toDict())
                        pass_count += 1
                        result.log['CLONE%i-%i' % (clone_count, i)] = str(
                            rec.junction)
                # Write failed sequences from passing sets
                if result.failed:
                    for i, rec in enumerate(result.failed, start=1):
                        fail_count += 1
                        if fail_writer is not None:
                            fail_writer.writerow(rec.toDict())
                        result.log['FAIL%i-%i' % (clone_count, i)] = str(
                            rec.junction)
            else:
                # Write failed sets
                for i, rec in enumerate(result.data, start=1):
                    fail_count += 1
                    if fail_writer is not None:
                        fail_writer.writerow(rec.toDict())
                    result.log['CLONE0-%i' % (i)] = str(rec.junction)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None: fail_handle.close()
        if log_handle is not None: log_handle.close()

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        #sys.stderr.write('Exception in collector result processing step\n')
        alive.value = False
        raise

    return None
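
# Toy sketch of the sentinel protocol both collectors rely on: a feeder puts
# results followed by None, and the consumer loop breaks cleanly at the sentinel.
from multiprocessing import Queue

q = Queue()
for r in ['result_1', 'result_2', None]:
    q.put(r)
while True:
    item = q.get()
    if item is None:
        break  # sentinel reached
    print('collected', item)
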