def alignAcross(data, field_map, muscle_exec=default_muscle_exec):
    """
    Multiple aligns sequence fields column wise

    Arguments:
      data : DbData object with Receptor objects to process.
      field_map : a dictionary of {input sequence : output sequence} field names to multiple align.
      muscle_exec : the MUSCLE executable.

    Returns:
      changeo.Multiprocessing.DbResult : object containing Receptor objects with multiple aligned sequence fields.
    """
    # Define return object
    result = DbResult(data.id, data.data)
    result.results = data.data
    result.valid = True

    # Fail invalid groups
    if result.id is None:
        result.log = None
        result.valid = False
        return result

    for f in field_map:
        # One record per row; spaces in ids are replaced so the id survives
        # the round trip through the aligner output
        seq_list = [SeqRecord(r.getSeq(f), id=r.sequence_id.replace(' ', '_'))
                    for r in data.data]
        seq_aln = runMuscle(seq_list, aligner_exec=muscle_exec)
        if seq_aln is None:
            # Alignment failed for this field; mark the whole group invalid
            result.valid = False
            continue
        # Map aligned record ids back to their positions in the alignment
        aln_map = {x.id: i for i, x in enumerate(seq_aln)}
        for r in result.results:
            seq = str(seq_aln[aln_map[r.sequence_id.replace(' ', '_')]].seq)
            r.annotations[field_map[f]] = seq
            result.log['%s-%s' % (f, r.sequence_id)] = seq

    return result
# Example #2
# 0
def alignBlocks(data, seq_fields, muscle_exec=default_muscle_exec):
    """
    Multiple aligns blocks of sequence fields together

    Arguments:
      data : a DbData object with IgRecords to process.
      seq_fields : the sequence fields to multiple align.
      muscle_exec : the MUSCLE executable.

    Returns:
      changeo.Multiprocessing.DbResult : object containing IgRecords with multiple aligned sequence fields.
    """
    # Define return object
    result = DbResult(data.id, data.data)
    result.results = data.data
    result.valid = True

    # Fail invalid groups
    if result.id is None:
        result.log = None
        result.valid = False
        return result

    # All fields of all records go into a single MUSCLE run;
    # each id encodes both the record and its source field
    seq_list = [SeqRecord(r.getSeqField(f), id='%s_%s' % (r.id, f))
                for f in seq_fields for r in data.data]
    seq_aln = runMuscle(seq_list, aligner_exec=muscle_exec)
    if seq_aln is not None:
        # Map aligned ids back to their positions in the alignment
        aln_map = {x.id: i for i, x in enumerate(seq_aln)}
        for r in result.results:
            for f in seq_fields:
                seq = str(seq_aln[aln_map['%s_%s' % (r.id, f)]].seq)
                r.annotations['%s_ALIGN' % f] = seq
                result.log['%s-%s' % (f, r.id)] = seq
    else:
        result.valid = False

    return result
def alignWithin(data, field_map, muscle_exec=default_muscle_exec):
    """
    Multiple aligns sequence fields within a row

    Arguments:
      data : DbData object with Receptor objects to process.
      field_map : a dictionary of {input sequence : output sequence} field names to multiple align.
      muscle_exec : the MUSCLE executable.

    Returns:
      changeo.Multiprocessing.DbResult : object containing Receptor objects with multiple aligned sequence fields.
    """
    # Build the result container up front
    result = DbResult(data.id, data.data)
    result.results = data.data
    result.valid = True

    # A missing id marks an invalid group; fail it immediately
    if result.id is None:
        result.log = None
        result.valid = False
        return result

    record = data.data
    fields = list(field_map.keys())
    # One SeqRecord per input field, keyed by the field name
    to_align = [SeqRecord(record.getSeq(name), id=name) for name in fields]
    alignment = runMuscle(to_align, aligner_exec=muscle_exec)
    if alignment is None:
        result.valid = False
    else:
        # Locate each field's row in the alignment by its id
        index_of = {entry.id: pos for pos, entry in enumerate(alignment)}
        for name in fields:
            aligned_seq = str(alignment[index_of[name]].seq)
            record.annotations[field_map[name]] = aligned_seq
            result.log[name] = aligned_seq

    return result
# Example #4
# 0
def alignWithin(data, seq_fields, muscle_exec=default_muscle_exec):
    """
    Multiple aligns sequence fields within a row

    Arguments:
      data : a DbData object with an IgRecords to process.
      seq_fields : the sequence fields to multiple align.
      muscle_exec : the MUSCLE executable.

    Returns:
      changeo.Multiprocessing.DbResult : object containing IgRecords with multiple aligned sequence fields.
    """
    # Prepare the result object
    result = DbResult(data.id, data.data)
    result.results = data.data
    result.valid = True

    # Groups without an id are invalid; short-circuit with a failed result
    if result.id is None:
        result.log = None
        result.valid = False
        return result

    record = data.data
    # Collect each requested field as its own sequence record
    inputs = [SeqRecord(record.getSeqField(name), id=name) for name in seq_fields]
    alignment = runMuscle(inputs, aligner_exec=muscle_exec)
    if alignment is None:
        result.valid = False
    else:
        # Recover each field's aligned row via its id
        row_index = {entry.id: pos for pos, entry in enumerate(alignment)}
        for name in seq_fields:
            aligned_seq = str(alignment[row_index[name]].seq)
            record.annotations['%s_ALIGN' % name] = aligned_seq
            result.log[name] = aligned_seq

    return result
def processQueueClust(alive, data_queue, result_queue, clone_func, clone_args):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing continues;
              if False exit process.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      clone_func : the function to call for calculating pairwise distances between sequences.
      clone_args : a dictionary of keyword arguments to pass to clone_func.

    Returns:
      None
    """
    
    try:
        # print 'START WORK', alive.value
        # Iterator over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            # NOTE(review): busy-waits while the queue is empty; assumes the
            # producer eventually enqueues a None sentinel for this worker
            if data_queue.empty():  continue
            else:  data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:  break
            # print "WORK", alive.value, data['id']

            # Define result object for iteration and get data records
            records = data.data
            result = DbResult(data.id, records)
             
            # Create row of distance matrix; result.valid stays False
            # when clone_func returns None (error indicator)
            dist_row = clone_func(records, **clone_args) if data else None
            if dist_row is not None:
                result.results = dist_row
                result.valid = True
  
            # Feed results to result queue
            result_queue.put(result)
        else:
            # while/else: runs only when alive.value went False (a sibling
            # process raised), never after the sentinel break above
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        #sys.stderr.write('Exception in worker\n')
        # Signal sibling workers to stop before propagating the exception
        alive.value = False
        raise
    
    return None
# Example #6
# 0
def processQueueClust(alive, data_queue, result_queue, clone_func, clone_args):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing continues;
              if False exit process.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      clone_func : the function to call for calculating pairwise distances between sequences.
      clone_args : a dictionary of keyword arguments to pass to clone_func.

    Returns:
      None
    """

    try:
        # print 'START WORK', alive.value
        # Iterator over data queue until sentinel object reached
        while alive.value:
            # Get data from queue
            # NOTE(review): busy-waits while the queue is empty; assumes the
            # producer eventually enqueues a None sentinel for this worker
            if data_queue.empty(): continue
            else: data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None: break
            # print "WORK", alive.value, data['id']

            # Define result object for iteration and get data records
            records = data.data
            result = DbResult(data.id, records)

            # Create row of distance matrix; result.valid stays False
            # when clone_func returns None (error indicator)
            dist_row = clone_func(records, **clone_args) if data else None
            if dist_row is not None:
                result.results = dist_row
                result.valid = True

            # Feed results to result queue
            result_queue.put(result)
        else:
            # while/else: runs only when alive.value went False (a sibling
            # process raised), never after the sentinel break above
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        #sys.stderr.write('Exception in worker\n')
        # Signal sibling workers to stop before propagating the exception
        alive.value = False
        raise

    return None
def processQueue(alive, data_queue, result_queue, clone_func, clone_args):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing continues;
              if False exit process.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      clone_func : the function to call for clonal assignment.
      clone_args : a dictionary of keyword arguments to pass to clone_func.

    Returns:
      None
    """
    try:
        # Iterate over the data queue until the None sentinel is reached
        while alive.value:
            # Get data from queue (busy-wait while empty)
            if data_queue.empty():  continue
            else:  data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:  break

            # Define result object for iteration and get data records
            records = data.data
            result = DbResult(data.id, records)

            # Check for invalid data (due to failed indexing) and add failed result
            if not data:
                result_queue.put(result)
                continue

            # Add V(D)J annotation summary to the log
            result.log['ID'] = ','.join([str(x) for x in data.id])
            result.log['VALLELE'] = ','.join(set([(r.getVAllele() or '') for r in records]))
            result.log['DALLELE'] = ','.join(set([(r.getDAllele() or '') for r in records]))
            result.log['JALLELE'] = ','.join(set([(r.getJAllele() or '') for r in records]))
            result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in records]))
            result.log['SEQUENCES'] = len(records)

            # Assign clones; data is guaranteed truthy here by the guard above,
            # so the former "if data else None" conditional was unreachable
            clones = clone_func(records, **clone_args)

            # clone_func returns None on failure; result.valid stays False then
            if clones is not None:
                result.results = clones
                result.valid = True
                result.log['CLONES'] = len(clones)
            else:
                result.log['CLONES'] = 0

            # Feed results to result queue
            result_queue.put(result)
        else:
            # while/else: runs only when alive.value went False (a sibling
            # process raised), never after the sentinel break above
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        # Signal sibling workers to stop before propagating the exception
        alive.value = False
        raise

    return None
# Example #8
# 0
def processQueue(alive,
                 data_queue,
                 result_queue,
                 max_missing=default_max_missing,
                 clone_func=distanceClones,
                 clone_args=None):
    """
    Pulls from data queue, performs calculations, and feeds results queue

    Arguments: 
      alive : a multiprocessing.Value boolean controlling whether processing continues;
              if False exit process.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      max_missing : maximum number of non-ACGT characters to allow in the junction sequence.
      clone_func : the function to call for clonal assignment.
      clone_args : a dictionary of keyword arguments to pass to clone_func.
                   Defaults to an empty dictionary.

    Returns: 
      None
    """
    # Avoid the shared mutable default argument anti-pattern
    if clone_args is None:
        clone_args = {}

    try:
        # Iterate over the data queue until the None sentinel is reached
        while alive.value:
            # Get data from queue (busy-wait while empty)
            if data_queue.empty(): continue
            else: data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None: break

            # Define result object for iteration and get data records
            result = DbResult(data.id, data.data)

            # Check for invalid data (due to failed indexing) and add failed result
            if not data:
                result_queue.put(result)
                continue

            # Filter records based on missing content in the sequence field
            seq_field = clone_args.get('seq_field', 'JUNCTION')
            filtered = filterMissing(data.data,
                                     field=seq_field,
                                     max_missing=max_missing)
            records = filtered['pass']
            result.failed = filtered['fail']

            # Add V(D)J annotation summary to the log
            result.log['ID'] = ','.join([str(x) for x in data.id])
            result.log['VALLELE'] = ','.join(
                set([(r.getVAllele() or '') for r in data.data]))
            result.log['DALLELE'] = ','.join(
                set([(r.getDAllele() or '') for r in data.data]))
            result.log['JALLELE'] = ','.join(
                set([(r.getJAllele() or '') for r in data.data]))
            result.log['JUNCLEN'] = ','.join(
                set([(str(len(r.junction)) or '0') for r in data.data]))
            result.log['PASSCOUNT'] = len(records)
            result.log['FAILCOUNT'] = len(result.failed)

            # Assign clones only when some records survived the filter;
            # clone_func returns None on failure, so result.valid stays False
            clones = clone_func(records, **clone_args) if records else None

            if clones is not None:
                result.results = clones
                result.valid = True
                result.log['CLONES'] = len(clones)
            else:
                result.log['CLONES'] = 0

            # Feed results to result queue
            result_queue.put(result)
        else:
            # while/else: runs only when alive.value went False (a sibling
            # process raised), never after the sentinel break above
            sys.stderr.write('PID %s:  Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None
    except:
        # Signal sibling workers to stop before propagating the exception
        alive.value = False
        raise

    return None