Esempio n. 1
0
def distAdemokun2011(records):
    """
    Calculate pairwise distances as defined in Ademokun 2011

    The query is removed from `records` (the argument is modified in place)
    and every remaining record is scored against it using the junction with
    its first and last three characters trimmed.

    Arguments:
    records = list of IgRecords where first is query to be compared to others in list

    Returns:
    list of distances, one per remaining record; 1 when the trimmed junction
    lengths differ by more than 10 or the V families differ, otherwise the
    alignment result divided by the shorter trimmed junction length
    """
    # Pull out query sequence and V family information
    # NOTE(review): popitem(last=False) is the OrderedDict method and returns a
    # (key, value) pair there, while the docstring and the records[i] indexing
    # below describe a list -- confirm the container type callers pass.
    query = records.popitem(last=False)
    query_cdr3 = query.junction[3:-3]
    query_len = len(query_cdr3)
    query_v_family = query.getVFamily()
    # Create alignment scoring dictionary
    score_dict = getDNAScoreDict()

    scores = [0] * len(records)
    for i in range(len(records)):
        record = records[i]
        cdr3 = record.junction[3:-3]

        if abs(query_len - len(cdr3)) > 10:
            scores[i] = 1
        elif query_v_family != record.getVFamily():
            scores[i] = 1
        else:
            # NOTE(review): Biopython documents globalds as returning a list of
            # alignments unless score_only=True -- confirm `ld` is numeric
            # before it is divided below.
            ld = pairwise2.align.globalds(query_cdr3,
                                          cdr3,
                                          score_dict,
                                          -1,
                                          -1,
                                          one_alignment_only=True)
            # Both operands of min() must be lengths: comparing an int with
            # the query string itself raises TypeError on Python 3.
            scores[i] = ld / min(len(cdr3), query_len)

    return scores
def distAdemokun2011(records):
    """
    Calculate pairwise distances as defined in Ademokun 2011

    The query is removed from `records` (the argument is modified in place)
    and every remaining record is scored against it using the junction with
    its first and last three characters trimmed.

    Arguments:
    records = list of IgRecords where first is query to be compared to others in list

    Returns:
    list of distances, one per remaining record; 1 when the trimmed junction
    lengths differ by more than 10 or the V families differ, otherwise the
    alignment result divided by the shorter trimmed junction length
    """
    # Pull out query sequence and V family information
    # NOTE(review): popitem(last=False) is the OrderedDict method and returns a
    # (key, value) pair there, while the docstring and the records[i] indexing
    # below describe a list -- confirm the container type callers pass.
    query = records.popitem(last=False)
    query_cdr3 = query.junction[3:-3]
    query_len = len(query_cdr3)
    query_v_family = query.getVFamily()
    # Create alignment scoring dictionary
    score_dict = getDNAScoreDict()

    scores = [0] * len(records)
    for i in range(len(records)):
        record = records[i]
        cdr3 = record.junction[3:-3]

        if abs(query_len - len(cdr3)) > 10:
            scores[i] = 1
        elif query_v_family != record.getVFamily():
            scores[i] = 1
        else:
            # NOTE(review): Biopython documents globalds as returning a list of
            # alignments unless score_only=True -- confirm `ld` is numeric
            # before it is divided below.
            ld = pairwise2.align.globalds(query_cdr3, cdr3,
                                          score_dict, -1, -1, one_alignment_only=True)
            # Both operands of min() must be lengths: comparing an int with
            # the query string itself raises TypeError on Python 3.
            scores[i] = ld / min(len(cdr3), query_len)

    return scores
def countMismatches(seq_list, ref_seq, ignore_chars=default_missing_chars, 
                    score_dict=getDNAScoreDict(mask_score=(1, 1), gap_score=(1, 1))):
    """
    Counts the occurrence of nucleotide mismatches in a set of sequences

    Arguments: 
    seq_list = a list of SeqRecord objects with aligned sequences; each must
               carry a 'phred_quality' entry in letter_annotations
    ref_seq = a SeqRecord object containing the reference sequence to match against
    ignore_chars = list of characters to exclude from the total and quality counts
    score_dict = optional dictionary of alignment scores as {(char1, char2): score};
                 a score of 0 is counted as a mismatch

    Returns: 
    a dictionary of pandas.DataFrame objects containing [mismatch, q_sum, total] counts  
    for {pos:sequence position, nuc:nucleotide pairs, qual:quality score, set:sequence set} 
    """
    count_cols = ['mismatch', 'q_sum', 'total']

    # Define position mismatch DataFrame (empty when no sequences are given)
    pos_max = max((len(s) for s in seq_list), default=0)
    pos_df = pd.DataFrame(0, index=list(range(pos_max)),
                          columns=count_cols, dtype=float)
    # Define nucleotide mismatch DataFrame
    nuc_pairs = list(permutations(['A', 'C', 'G', 'T'], 2))
    nuc_df = pd.DataFrame(0, index=pd.MultiIndex.from_tuples(nuc_pairs, names=['obs', 'ref']),
                          columns=count_cols, dtype=float)
    # Define quality mismatch DataFrame, one row per quality value 0-93
    qual_df = pd.DataFrame(0, index=list(range(94)),
                           columns=count_cols, dtype=float)

    # Iterate over seq_list and count mismatches
    for seq in seq_list:
        qual = seq.letter_annotations['phred_quality']
        for i, b in enumerate(seq):
            a = ref_seq[i]
            q = qual[i]
            # Update total counts and qualities
            if {a, b}.isdisjoint(ignore_chars):
                pos_df.loc[i, 'total'] += 1
                pos_df.loc[i, 'q_sum'] += q
                # Select every pair whose first index level equals b in one
                # .loc step; chained indexing (df[b]['total'] += 1) may
                # modify a temporary copy instead of nuc_df itself.
                first_level_b = (b, slice(None))
                nuc_df.loc[first_level_b, 'total'] += 1
                nuc_df.loc[first_level_b, 'q_sum'] += q
                qual_df.loc[q, 'total'] += 1
                qual_df.loc[q, 'q_sum'] += q
            # Update mismatch counts
            if score_dict[(a, b)] == 0:
                pos_df.loc[i, 'mismatch'] += 1
                # NOTE(review): the index levels are named ('obs', 'ref') but
                # this key is (reference char, observed char), whereas the
                # totals above use the observed char on the first level --
                # confirm the intended orientation.
                nuc_df.loc[(a, b), 'mismatch'] += 1
                qual_df.loc[q, 'mismatch'] += 1

    # Define set total mismatch DataFrame, indexed by the number of sequences
    set_df = pd.DataFrame([pos_df.sum(axis=0)], index=[len(seq_list)],
                          columns=count_cols, dtype=float)

    return {'pos':pos_df, 'nuc':nuc_df, 'qual':qual_df, 'set':set_df}
Esempio n. 4
0
    def test_localAlignment(self):
        """
        Check primer and rounded error returned by localAlignment for the
        N-character fixtures and for the indel fixtures.
        """
        def _report(hits):
            # Dump every field of each alignment for manual inspection
            for hit in hits:
                print('  %s>' % hit.seq.id)
                print('      SEQ> %s' % hit.seq.seq)
                print('  ALN-SEQ> %s' % hit.align_seq)
                print('   ALN-PR> %s' % hit.align_primer)
                print('   PRIMER> %s' % hit.primer)
                print('    START> %s' % hit.start)
                print('      END> %s' % hit.end)
                print('     GAPS> %i' % hit.gaps)
                print('    ERROR> %f\n' % hit.error)

        score_dict = getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 0))

        # N character tests
        print('TEST Ns>')
        n_hits = []
        for rec in self.records_n:
            n_hits.append(localAlignment(rec,
                                         self.primers_n,
                                         max_error=0.2,
                                         score_dict=score_dict))
        _report(n_hits)

        expected_n = [(name, round(err, 4)) for name, err in self.align_n]
        observed_n = [(hit.primer, round(hit.error, 4)) for hit in n_hits]
        self.assertListEqual(expected_n, observed_n)

        # Indel tests
        print('TEST INDELS>')
        indel_hits = []
        for rec in self.records_indel:
            indel_hits.append(localAlignment(rec,
                                             self.primers_indel,
                                             max_error=0.2,
                                             gap_penalty=(1, 1)))
        _report(indel_hits)

        expected_indel = [(name, round(err, 4)) for name, err in self.align_indel]
        observed_indel = [(hit.primer, round(hit.error, 4)) for hit in indel_hits]
        self.assertListEqual(expected_indel, observed_indel)
Esempio n. 5
0
    def test_scoreSeqPair(self):
        """
        Check the rounded error value returned by scoreSeqPair under default,
        asymmetric and N-masked scoring.
        """
        def _show(title, results):
            # Print both sequences of every pair followed by its score tuple
            print(title)
            for (first, second), res in zip(self.seq_pairs, results):
                print('    %s> %s' % (first.id, first.seq))
                print('    %s> %s' % (second.id, second.seq))
                print('   SCORE> %i' % res[0])
                print('  WEIGHT> %i' % res[1])
                print('   ERROR> %f\n' % res[2])

        def _errors(results):
            # Third element of each result is the error rate
            return [round(res[2], 4) for res in results]

        # Default scoring
        scores = []
        for first, second in self.seq_pairs:
            scores.append(scoreSeqPair(first, second))
        _show('Default DNA Scores>', scores)
        self.assertSequenceEqual(_errors(scores),
                                 [round(e, 4) for e in self.error_dna_def])

        # Asymmetric scoring without position masking
        score_dict = getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 1))
        scores = []
        for first, second in self.seq_pairs:
            scores.append(scoreSeqPair(first, second, score_dict=score_dict))
        _show('Asymmetric DNA Scores>', scores)
        self.assertSequenceEqual(_errors(scores),
                                 [round(e, 4) for e in self.error_dna_asym])

        # Symmetric scoring with N positions excluded
        ignore_chars = {'n', 'N'}
        scores = []
        for first, second in self.seq_pairs:
            scores.append(scoreSeqPair(first, second, ignore_chars=ignore_chars))
        _show('Masked DNA Scores>', scores)
        self.assertSequenceEqual(_errors(scores),
                                 [round(e, 4) for e in self.error_dna_mask])
Esempio n. 6
0
    def test_scoreAlignment(self):
        """
        Check primer and rounded error returned by scoreAlignment (start=1)
        for the N-character fixtures.
        """
        score_dict = getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 0))

        hits = []
        for rec in self.records_n:
            hits.append(scoreAlignment(rec, self.primers_n, start=1,
                                       score_dict=score_dict))

        # Dump every field of each alignment for manual inspection
        for hit in hits:
            print('  %s>' % hit.seq.id)
            print('      SEQ> %s' % hit.seq.seq)
            print('  ALN-SEQ> %s' % hit.align_seq)
            print('   ALN-PR> %s' % hit.align_primer)
            print('   PRIMER> %s' % hit.primer)
            print('    START> %s' % hit.start)
            print('      END> %s' % hit.end)
            print('     GAPS> %i' % hit.gaps)
            print('    ERROR> %f\n' % hit.error)

        expected = [(name, round(err, 4)) for name, err in self.score_n]
        observed = [(hit.primer, round(hit.error, 4)) for hit in hits]
        self.assertListEqual(expected, observed)
Esempio n. 7
0
def distChen2010(records):
    """
    Calculate pairwise distances as defined in Chen 2010

    The query is removed from `records` (the argument is modified in place)
    and every remaining record is scored against it. The alignment result for
    the junctions (first and last three characters trimmed) is increased by a
    V penalty (0 same allele, 1 same gene, 3 same family, 5 otherwise) and a
    J penalty (0 same allele, 1 same gene, 3 otherwise), then divided by the
    longer trimmed junction length.

    Arguments:
    records = list of IgRecords where first is query to be compared to others in list

    Returns:
    list of distances, one per remaining record
    """
    # Pull out query sequence and V/J information
    # NOTE(review): popitem(last=False) is the OrderedDict method and returns a
    # (key, value) pair there, while the docstring and the records[i] indexing
    # below describe a list -- confirm the container type callers pass.
    query = records.popitem(last=False)
    query_cdr3 = query.junction[3:-3]
    query_len = len(query_cdr3)
    query_v_allele = query.getVAllele()
    query_v_gene = query.getVGene()
    query_v_family = query.getVFamily()
    query_j_allele = query.getJAllele()
    query_j_gene = query.getJGene()
    # Create alignment scoring dictionary
    score_dict = getDNAScoreDict()

    scores = [0] * len(records)
    for i in range(len(records)):
        record = records[i]
        cdr3 = record.junction[3:-3]
        # NOTE(review): Biopython documents globalds as returning a list of
        # alignments unless score_only=True -- confirm `ld` is numeric before
        # the penalties are added below.
        ld = pairwise2.align.globalds(query_cdr3,
                                      cdr3,
                                      score_dict,
                                      -1,
                                      -1,
                                      one_alignment_only=True)
        # Check V similarity
        if record.getVAllele() == query_v_allele:
            ld += 0
        elif record.getVGene() == query_v_gene:
            ld += 1
        elif record.getVFamily() == query_v_family:
            ld += 3
        else:
            ld += 5
        # Check J similarity
        if record.getJAllele() == query_j_allele:
            ld += 0
        elif record.getJGene() == query_j_gene:
            ld += 1
        else:
            ld += 3
        # Divide by length. Both operands of max() must be lengths: comparing
        # an int with the query string itself raises TypeError on Python 3.
        scores[i] = ld / max(len(cdr3), query_len)

    return scores
def distChen2010(records):
    """
    Calculate pairwise distances as defined in Chen 2010

    The query is removed from `records` (the argument is modified in place)
    and every remaining record is scored against it. The alignment result for
    the junctions (first and last three characters trimmed) is increased by a
    V penalty (0 same allele, 1 same gene, 3 same family, 5 otherwise) and a
    J penalty (0 same allele, 1 same gene, 3 otherwise), then divided by the
    longer trimmed junction length.

    Arguments:
    records = list of IgRecords where first is query to be compared to others in list

    Returns:
    list of distances, one per remaining record
    """
    # Pull out query sequence and V/J information
    # NOTE(review): popitem(last=False) is the OrderedDict method and returns a
    # (key, value) pair there, while the docstring and the records[i] indexing
    # below describe a list -- confirm the container type callers pass.
    query = records.popitem(last=False)
    query_cdr3 = query.junction[3:-3]
    query_len = len(query_cdr3)
    query_v_allele = query.getVAllele()
    query_v_gene = query.getVGene()
    query_v_family = query.getVFamily()
    query_j_allele = query.getJAllele()
    query_j_gene = query.getJGene()
    # Create alignment scoring dictionary
    score_dict = getDNAScoreDict()

    scores = [0] * len(records)
    for i in range(len(records)):
        record = records[i]
        cdr3 = record.junction[3:-3]
        # NOTE(review): Biopython documents globalds as returning a list of
        # alignments unless score_only=True -- confirm `ld` is numeric before
        # the penalties are added below.
        ld = pairwise2.align.globalds(query_cdr3, cdr3,
                                      score_dict, -1, -1, one_alignment_only=True)
        # Check V similarity
        if record.getVAllele() == query_v_allele:
            ld += 0
        elif record.getVGene() == query_v_gene:
            ld += 1
        elif record.getVFamily() == query_v_family:
            ld += 3
        else:
            ld += 5
        # Check J similarity
        if record.getJAllele() == query_j_allele:
            ld += 0
        elif record.getJGene() == query_j_gene:
            ld += 1
        else:
            ld += 3
        # Divide by length. Both operands of max() must be lengths: comparing
        # an int with the query string itself raises TypeError on Python 3.
        scores[i] = ld / max(len(cdr3), query_len)

    return scores
Esempio n. 9
0
def alignPrimers(data,
                 primers,
                 primers_regex=None,
                 max_error=default_primer_max_error,
                 max_len=default_primer_max_len,
                 rev_primer=False,
                 skip_rc=False,
                 mode='mask',
                 barcode=False,
                 barcode_field=default_barcode_field,
                 primer_field=default_primer_field,
                 gap_penalty=default_primer_gap_penalty,
                 score_dict=getDNAScoreDict(mask_score=(0, 1),
                                            gap_score=(0, 0)),
                 delimiter=default_delimiter):
    """
    Locally aligns a set of primers against one sequence and masks the match

    The sequence in `data` is passed to localAlignment; when an alignment is
    returned, maskSeq builds the output sequence and the alignment details
    are recorded in the result log.

    Arguments:
      data : SeqData object containing a single SeqRecord object to process.
      primers : dictionary of {names: short IUPAC ambiguous sequence strings}.
      primers_regex : optional dictionary of {names: compiled primer regular expressions}.
      max_error : maximum acceptable error rate for a valid alignment.
      max_len : maximum length of sample sequence to align.
      rev_primer : if True align with the tail end of the sequence.
      skip_rc : if True do not check reverse complement sequences.
      mode : defines the action taken; one of 'cut', 'mask', 'tag' or 'trim'.
      barcode : if True add sequence preceding primer to description.
      barcode_field : name of the output barcode annotation.
      primer_field : name of the output primer annotation.
      gap_penalty : a tuple of positive (gap open, gap extend) penalties.
      score_dict : optional dictionary of {(char1, char2): score} alignment scores
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      presto.Multiprocessing.SeqResult: result object; when no alignment is
      found only log['ALIGN'] = None is set on it.
    """
    res = SeqResult(data.id, data.data)

    # Run the local alignment
    hit = localAlignment(data.data,
                         primers,
                         primers_regex=primers_regex,
                         max_error=max_error,
                         max_len=max_len,
                         rev_primer=rev_primer,
                         skip_rc=skip_rc,
                         gap_penalty=gap_penalty,
                         score_dict=score_dict)
    if not hit:
        # Nothing aligned: record that in the log and stop here
        res.log['ALIGN'] = None
        return res

    # Build the masked/cut/tagged/trimmed output sequence
    masked = maskSeq(hit,
                     mode=mode,
                     barcode=barcode,
                     barcode_field=barcode_field,
                     primer_field=primer_field,
                     delimiter=delimiter)
    res.results = masked
    # Valid only for a non-empty output whose error is within the limit
    res.valid = len(masked) > 0 and bool(hit.error <= max_error)

    # Log the alignment details
    res.log['SEQORIENT'] = masked.annotations['seqorient']
    res.log['PRIMER'] = hit.primer
    res.log['PRORIENT'] = 'RC' if hit.rev_primer else 'F'
    res.log['PRSTART'] = hit.start
    if 'barcode' in masked.annotations:
        res.log['BARCODE'] = masked.annotations['barcode']

    if hit.rev_primer:
        # Primer at the tail: unaligned head of the input, then the aligned part
        cut = len(hit.seq) - len(hit.align_seq) + hit.gaps
        res.log['INSEQ'] = str(hit.seq.seq[:cut]) + hit.align_seq
        width = len(res.data.seq) + hit.gaps
        res.log['ALIGN'] = hit.align_primer.rjust(width)
        res.log['OUTSEQ'] = str(masked.seq)
    else:
        # Primer at the head: aligned part, then the unaligned rest of the input
        cut = len(hit.align_seq) - hit.gaps
        res.log['INSEQ'] = hit.align_seq + str(hit.seq.seq[cut:])
        res.log['ALIGN'] = hit.align_primer
        width = len(res.data.seq) + hit.gaps
        res.log['OUTSEQ'] = str(masked.seq).rjust(width)
    res.log['ERROR'] = hit.error

    return res
Esempio n. 10
0
def maskPrimers(seq_file,
                primer_file,
                align_func,
                align_args=None,
                out_file=None,
                out_args=default_out_args,
                nproc=None,
                queue_size=None):
    """
    Masks or cuts primers from sample sequences using local alignment

    Arguments: 
      seq_file : name of file containing sample sequences.
      primer_file : name of the file containing primer sequences;
                    may be None, in which case no primer file is read.
      align_func : the function to call for alignment.
      align_args : a dictionary of arguments to pass to align_func;
                   it is copied, so the caller's dictionary is not modified.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
      nproc : the number of processQueue processes;
              if None defaults to the number of CPUs.
      queue_size : maximum size of the argument queue;
                   if None defaults to 2*nproc.
                 
    Returns:
      list: a list of successful output file names.
    """
    # Work on a private copy: entries are added below, and a shared default
    # or the caller's own dictionary must not accumulate them across calls.
    align_args = dict(align_args) if align_args is not None else {}

    # Define subcommand label dictionary
    cmd_dict = {
        alignPrimers: 'align',
        scorePrimers: 'score',
        extractPrimers: 'extract'
    }

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MaskPrimers'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    if primer_file is not None:
        log['PRIMER_FILE'] = os.path.basename(primer_file)
    # Optional arguments are logged only when supplied, in this fixed order
    optional_fields = (('mode', 'MODE'),
                       ('max_error', 'MAX_ERROR'),
                       ('start', 'START_POS'),
                       ('length', 'LENGTH'),
                       ('max_len', 'MAX_LEN'),
                       ('rev_primer', 'REV_PRIMER'),
                       ('skip_rc', 'SKIP_RC'))
    for arg_name, label in optional_fields:
        if arg_name in align_args:
            log[label] = align_args[arg_name]
    if 'gap_penalty' in align_args:
        log['GAP_PENALTY'] = ', '.join(
            str(x) for x in align_args['gap_penalty'])
    if 'barcode' in align_args:
        log['BARCODE'] = align_args['barcode']
    # The field names are optional for align_func, so logging must not
    # require them to be present.
    if align_args.get('barcode') and 'barcode_field' in align_args:
        log['BARCODE_FIELD'] = align_args['barcode_field']
    if 'primer_field' in align_args:
        log['PRIMER_FIELD'] = align_args['primer_field']
    log['NPROC'] = nproc
    printLog(log)

    # Define alignment arguments and compile primers for align mode
    if primer_file is not None:
        primers = readPrimerFile(primer_file)
        if align_args.get('rev_primer'):
            primers = {k: reverseComplement(v) for k, v in primers.items()}
        align_args['primers'] = primers
        align_args['score_dict'] = getDNAScoreDict(mask_score=(0, 1),
                                                   gap_score=(0, 0))
    # Read the primers back from align_args so this step does not depend on
    # a local that only exists when a primer file was given.
    if align_func is alignPrimers and 'primers' in align_args:
        align_args['primers_regex'] = compilePrimers(align_args['primers'])
    align_args['delimiter'] = out_args['delimiter']

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processSeqQueue
    work_args = {'process_func': align_func, 'process_args': align_args}
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {
        'seq_file': seq_file,
        'label': 'primers',
        'out_file': out_file,
        'out_args': out_args
    }

    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, feed_args,
                             work_args, collect_args, nproc, queue_size)

    # Print log
    result['log']['END'] = 'MaskPrimers'
    printLog(result['log'])

    return result['out_files']
Esempio n. 11
0
def scorePrimers(data,
                 primers,
                 max_error=default_primer_max_error,
                 start=default_primer_start,
                 rev_primer=False,
                 mode='mask',
                 barcode=False,
                 barcode_field=default_barcode_field,
                 primer_field=default_primer_field,
                 score_dict=getDNAScoreDict(mask_score=(0, 1),
                                            gap_score=(0, 0)),
                 delimiter=default_delimiter):
    """
    Scores a set of primers at a fixed position of one sequence and masks the match

    The sequence in `data` is passed to scoreAlignment; when an alignment is
    returned, maskSeq builds the output sequence and the alignment details
    are recorded in the result log.

    Arguments:
      data : SeqData object containing a single SeqRecord object to process.
      primers : dictionary of {names: short IUPAC ambiguous sequence strings}.
      max_error : maximum acceptable error rate for a valid alignment
      start : position where primer alignment starts.
      rev_primer : if True align with the tail end of the sequence.
      mode : defines the action taken; one of 'cut', 'mask', 'tag' or 'trim'.
      barcode : if True add sequence preceding primer to description.
      barcode_field : name of the output barcode annotation.
      primer_field : name of the output primer annotation.
      score_dict : optional dictionary of {(char1, char2): score} alignment scores
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      presto.Multiprocessing.SeqResult: result object; when no alignment is
      found only log['ALIGN'] = None is set on it.
    """
    res = SeqResult(data.id, data.data)

    # Run the fixed-position alignment
    hit = scoreAlignment(data.data,
                         primers,
                         start=start,
                         rev_primer=rev_primer,
                         score_dict=score_dict)
    if not hit:
        # Nothing aligned: record that in the log and stop here
        res.log['ALIGN'] = None
        return res

    # Build the masked/cut/tagged/trimmed output sequence
    masked = maskSeq(hit,
                     mode=mode,
                     barcode=barcode,
                     barcode_field=barcode_field,
                     primer_field=primer_field,
                     delimiter=delimiter)
    res.results = masked
    # Valid only for a non-empty output whose error is within the limit
    res.valid = len(masked) > 0 and bool(hit.error <= max_error)

    # Log the alignment details
    res.log['PRIMER'] = hit.primer
    res.log['PRORIENT'] = 'RC' if hit.rev_primer else 'F'
    res.log['PRSTART'] = hit.start
    if 'barcode' in masked.annotations:
        res.log['BARCODE'] = masked.annotations['barcode']

    if hit.rev_primer:
        # Primer at the tail: unaligned head of the input, then the aligned part
        cut = len(hit.seq) - len(hit.align_seq) + hit.gaps
        res.log['INSEQ'] = str(hit.seq.seq[:cut]) + hit.align_seq
        width = len(res.data.seq) + hit.gaps
        res.log['ALIGN'] = hit.align_primer.rjust(width)
        res.log['OUTSEQ'] = str(masked.seq)
    else:
        # Primer at the head: aligned part, then the unaligned rest of the input
        cut = len(hit.align_seq) - hit.gaps
        res.log['INSEQ'] = hit.align_seq + str(hit.seq.seq[cut:])
        res.log['ALIGN'] = hit.align_primer
        width = len(res.data.seq) + hit.gaps
        res.log['OUTSEQ'] = str(masked.seq).rjust(width)
    res.log['ERROR'] = hit.error

    return res
Esempio n. 12
0
def countMismatches(seq_list,
                    ref_seq,
                    ignore_chars=default_missing_chars,
                    score_dict=getDNAScoreDict(mask_score=(1, 1),
                                               gap_score=(1, 1)),
                    headers=default_headers,
                    distance_types=default_distance_types,
                    bin_count=default_bin_count):
    """
    Tallies nucleotide mismatches of a set of sequences against a reference

    Arguments: 
      seq_list : a list of SeqRecord objects with aligned sequences; each must
                 carry a 'phred_quality' entry in letter_annotations
      ref_seq : a SeqRecord object containing the reference sequence to match against
      ignore_chars : list of characters to exclude from the total and quality counts
      score_dict : optional dictionary of alignment scores as {(char1, char2): score};
                   a score of 0 is counted as a mismatch
      headers : distance DataFrame headers.
      distance_types : distance types to include.
      bin_count : histogram bin count.

    Returns: 
      dict: dictionaries containing [mismatch, q_sum, total] counts
            for {pos:sequence position, nuc:nucleotide pairs, qual:quality score, set:sequence set, dist:sequence distances}
    """
    # Start from the counter structure sized to the reference length
    counts = initializeMismatchDictionary(len(ref_seq),
                                          headers=headers,
                                          distance_types=distance_types,
                                          bin_count=bin_count)

    for record in seq_list:
        phred = record.letter_annotations['phred_quality']
        for idx, obs in enumerate(record):
            ref = ref_seq[idx]
            q = phred[idx]

            # Totals and quality sums skip positions with an ignored character
            if not (ref in ignore_chars or obs in ignore_chars):
                pos = counts['pos']
                pos['total'][idx] += 1
                pos['q_sum'][idx] += q

                # Count the observed nucleotide against itself first; this is
                # taken back below if the position turns out to be a mismatch
                nuc = counts['nuc']
                nuc['mismatch'][obs][obs] += 1
                obs_totals = nuc['total'][obs]
                for partner in obs_totals:
                    obs_totals[partner] += 1
                obs_qsums = nuc['q_sum'][obs]
                for partner in obs_qsums:
                    obs_qsums[partner] += q

                by_qual = counts['qual']
                by_qual['total'][q] += 1
                by_qual['q_sum'][q] += q

            # A zero score marks a mismatch
            if score_dict[(ref, obs)] == 0:
                counts['pos']['mismatch'][idx] += 1
                nuc_mismatch = counts['nuc']['mismatch']
                nuc_mismatch[ref][obs] += 1
                # Undo the self-count added for this observed nucleotide
                nuc_mismatch[obs][obs] -= 1
                counts['qual']['mismatch'][q] += 1

    # Per-set totals: for each header, the positional sum keyed by the
    # number of sequences in the set
    set_counts = {}
    for header in headers:
        set_counts[header] = {
            len(seq_list): sum(counts['pos'][header].values())
        }
    counts['set'] = set_counts

    # Distances, restricted to the requested distance types
    all_distances = calculateDistances(seq_list, bin_count=bin_count)
    dist_counts = {}
    for header in distance_types:
        dist_counts[header] = all_distances[header]
    counts['dist'] = dist_counts

    return counts
def alignAssembly(head_seq, tail_seq, alpha=default_alpha, max_error=default_max_error,
                  min_len=default_min_len, max_len=default_max_len, scan_reverse=False,
                  assembly_stats=None, score_dict=getDNAScoreDict(mask_score=(1, 1), gap_score=(0, 0))):
    """
    Joins two sequences by scanning for the best-scoring end overlap

    Every overlap length in the scan range is scored with scoreSeqPair and the
    overlap with the highest z-score is kept. The stitched sequence is the
    non-overlapping flanks plus the overlap region.

    Arguments:
    head_seq = the head SeqRecord
    tail_seq = the tail SeqRecord
    alpha = the minimum p-value for a valid assembly
    max_error = the maximum error rate for a valid assembly
    min_len = minimum length of overlap to test
    max_len = maximum length of overlap to test
    scan_reverse = if True allow the head sequence to overhang the end of the tail sequence
                   if False end alignment scan at end of tail sequence or start of head sequence
    assembly_stats = optional successes by trials numpy.array of p-values;
                     AssemblyStats(max_len + 1) is built when None
    score_dict = optional dictionary of character scores in the 
                 form {(char1, char2): score}

    Returns: 
    an AssemblyRecord object
    """
    # Set alignment parameters
    if assembly_stats is None:
        assembly_stats = AssemblyStats(max_len + 1)

    # Plain strings and lengths of both reads
    head_str, tail_str = str(head_seq.seq), str(tail_seq.seq)
    head_len, tail_len = len(head_str), len(tail_str)

    # Quality consensus is only possible when both reads carry phred scores
    pair = (head_seq, tail_seq)
    has_quality = all(hasattr(rec, 'letter_annotations') for rec in pair) and \
                  all('phred_quality' in rec.letter_annotations for rec in pair)

    # Upper bound of the scan: extended past the read ends only when
    # sub-sequences are allowed and max_len covers the shorter read
    scan_len = (head_len + tail_len - min_len) \
        if (scan_reverse and max_len >= min(head_len, tail_len)) \
        else min(max(head_len, tail_len), max_len)

    # Score every candidate overlap and keep the one with the best z-score
    stitch = AssemblyRecord()
    for overlap in range(min_len, scan_len + 1):
        head_start = max(0, head_len - overlap)
        head_end = head_len - max(0, overlap - tail_len)
        tail_start = max(0, overlap - head_len)
        tail_end = min(tail_len, overlap)
        score, weight, error = scoreSeqPair(head_str[head_start:head_end],
                                            tail_str[tail_start:tail_end],
                                            score_dict=score_dict)
        z = assembly_stats.z[score, weight]
        if z > stitch.zscore:
            stitch.head_pos = (head_start, head_end)
            stitch.tail_pos = (tail_start, tail_end)
            stitch.zscore = z
            stitch.pvalue = assembly_stats.p[score, weight]
            stitch.error = error

    # Build the stitched sequence when an overlap was selected
    if stitch.head_pos is not None:
        h0, h1 = stitch.head_pos
        t0, t1 = stitch.tail_pos
        # With qualities build a consensus; without them the head read wins
        overlap_seq = overlapConsensus(head_seq[h0:h1], tail_seq[t0:t1]) \
            if has_quality else head_seq[h0:h1]

        if h0 > 0 and t1 < tail_len:
            # Tail overlaps end of head
            stitch.seq = head_seq[:h0] + overlap_seq + tail_seq[t1:]
        elif h1 < head_len and t0 > 0:
            # Head overlaps end of tail
            stitch.seq = tail_seq[:t0] + overlap_seq + head_seq[h1:]
        elif h0 == 0 and h1 == head_len:
            # Head is a subsequence of tail
            stitch.seq = tail_seq[:t0] + overlap_seq + tail_seq[t1:]
        elif t0 == 0 and t1 == tail_len:
            # Tail is a subsequence of head
            stitch.seq = head_seq[:h0] + overlap_seq + head_seq[h1:]
        else:
            sys.stderr.write('ERROR:  Invalid overlap condition for %s\n' % head_seq.id)

        # Shared ID is kept as is; differing IDs are joined with '+'
        if head_seq.id == tail_seq.id:
            stitch.seq.id = head_seq.id
        else:
            stitch.seq.id = '+'.join([head_seq.id, tail_seq.id])
        stitch.seq.name = stitch.seq.id
        stitch.seq.description = ''

    # Valid only when both the p-value and the error rate pass their limits
    within_limits = stitch.pvalue <= alpha and stitch.error <= max_error
    stitch.valid = bool(within_limits)

    return stitch
def referenceAssembly(head_seq, tail_seq, ref_dict, ref_file, min_ident=default_min_ident,
                      evalue=default_evalue, max_hits=default_max_hits, fill=False,
                      usearch_exec=default_usearch_exec,
                      score_dict=getDNAScoreDict(mask_score=(1, 1), gap_score=(0, 0))):
    """
    Stitches two sequences together by aligning against a reference database

    Both reads are aligned against the reference with runUBlastAlignment, the
    hits are restricted to reference targets shared by head and tail, and the
    first shared hit is used to position the reads relative to each other.

    Arguments:
    head_seq = the head SeqRecord
    tail_seq = the tail SeqRecord
    ref_dict = a dictionary of reference SeqRecord objects
    ref_file = the path to the reference database file
    min_ident = the minimum identity for a valid assembly
    evalue = the E-value cut-off for ublast
    max_hits = the maxhits output limit for ublast
    fill = if False non-overlapping regions will be assigned Ns;
           if True non-overlapping regions will be filled with the reference sequence.
    usearch_exec = the path to the usearch executable
    score_dict = optional dictionary of character scores in the
                 form {(char1, char2): score}

    Returns:
    an AssemblyRecord object; a default-constructed AssemblyRecord is returned
    when head and tail share no reference target
    """
    # Define general parameters
    head_len = len(head_seq)
    tail_len = len(tail_seq)

    # Determine if quality scores are present
    has_quality = hasattr(head_seq, 'letter_annotations') and \
                  hasattr(tail_seq, 'letter_annotations') and \
                  'phred_quality' in head_seq.letter_annotations and \
                  'phred_quality' in tail_seq.letter_annotations

    # Align against reference
    head_df = runUBlastAlignment(head_seq, ref_file, evalue=evalue, max_hits=max_hits,
                                 usearch_exec=usearch_exec)
    tail_df = runUBlastAlignment(tail_seq, ref_file, evalue=evalue, max_hits=max_hits,
                                 usearch_exec=usearch_exec)

    # Subset results to matching reference assignments
    align_df = pd.merge(head_df, tail_df, on='target', how='inner', suffixes=('_head', '_tail'))

    # If no matching targets return failed results
    if len(align_df) < 1:
        return AssemblyRecord()

    # Select top alignment.  The merge result carries a fresh integer index, so
    # positional selection picks the same row the removed .ix indexer did.
    align_top = align_df.iloc[0]
    ref_id = align_top['target']
    ref_seq = ref_dict[ref_id]

    # Get offset of target and reference positions
    head_shift = align_top['target_start_head'] - align_top['query_start_head']
    tail_shift = align_top['target_start_tail'] - align_top['query_start_tail']

    # Get positions of outer reference match in head (a, b) and tail (x, y) sequences
    outer_start = align_top[['target_start_head', 'target_start_tail']].min()
    outer_end = align_top[['target_end_head', 'target_end_tail']].max()
    a_outer = outer_start - head_shift
    b_outer = outer_end - head_shift
    x_outer = outer_start - tail_shift
    y_outer = outer_end - tail_shift

    # Get positions of inner reference match in head (a,b) and tail (x,y) sequences
    inner_start = align_top[['target_start_head', 'target_start_tail']].max()
    inner_end = align_top[['target_end_head', 'target_end_tail']].min()
    a_inner = inner_start - head_shift
    b_inner = inner_end - head_shift
    x_inner = inner_start - tail_shift
    y_inner = inner_end - tail_shift

    # Determine head (a, b) and tail (x, y) overlap positions
    a = max(0, a_inner - x_inner)
    b = min(b_inner + (tail_len - y_inner), head_len)
    x = max(0, x_inner - a_inner)
    y = min(y_inner + (head_len - b_inner), tail_len)

    # Join sequences if head and tail do not overlap, otherwise assemble
    if a > b and x > y:
        stitch = joinSeqPair(head_seq, tail_seq, gap=(a - b), insert_seq=None)
    else:
        stitch = AssemblyRecord()
        stitch.gap = 0

        # Define overlap sequence
        if has_quality:
            # Build quality consensus
            overlap_seq = overlapConsensus(head_seq[a:b], tail_seq[x:y])
        else:
            # Assign head sequence to conflicts when no quality information is available
            overlap_seq = head_seq[a:b]

        # Assemble sequence
        if a > 0 and y < tail_len:
            # Tail overlaps end of head
            stitch.seq = head_seq[:a] + overlap_seq + tail_seq[y:]
        elif b < head_len and x > 0:
            # Head overlaps end of tail
            stitch.seq = tail_seq[:x] + overlap_seq + head_seq[b:]
        elif a == 0 and b == head_len:
            # Head is a subsequence of tail
            stitch.seq = tail_seq[:x] + overlap_seq + tail_seq[y:]
        elif x == 0 and y == tail_len:
            # Tail is a subsequence of head
            stitch.seq = head_seq[:a] + overlap_seq + head_seq[b:]
        else:
            # NOTE(review): stitch.seq is not assigned on this path, yet it is
            # dereferenced immediately below -- confirm the intended handling
            sys.stderr.write('ERROR:  Invalid overlap condition for %s\n' % head_seq.id)

        # Define stitch ID
        stitch.seq.id = head_seq.id if head_seq.id == tail_seq.id \
                                    else '+'.join([head_seq.id, tail_seq.id])
        stitch.seq.name = stitch.seq.id
        stitch.seq.description = ''

    # Assign position info
    stitch.head_pos = (a, b)
    stitch.tail_pos = (x, y)

    # Assign reference info
    stitch.ref_seq = ref_seq[outer_start:outer_end]
    stitch.ref_pos = (max(a_outer, x_outer), max(b_outer, y_outer))
    stitch.evalue = tuple(align_top[['evalue_head', 'evalue_tail']])

    # Calculate assembly error against the outer reference window
    score, weight, error = scoreSeqPair(stitch.seq.seq[stitch.ref_pos[0]:stitch.ref_pos[1]],
                                        ref_seq.seq[outer_start:outer_end],
                                        score_dict=score_dict)
    stitch.ident = 1 - error
    stitch.valid = bool(stitch.ident >= min_ident)

    # Fill gap with reference if required; done after scoring so the identity
    # is computed on the sequence assigned before the reference insert
    if a > b and x > y and fill:
        insert_seq = ref_seq.seq[(b + head_shift):(a + head_shift)]
        insert_rec = joinSeqPair(head_seq, tail_seq, gap=(a - b), insert_seq=insert_seq)
        stitch.seq = insert_rec.seq

    return stitch
def alignPrimers(seq_record, primers, primers_regex=None, max_error=default_max_error,
                 max_len=default_max_len, rev_primer=False, skip_rc=False,
                 gap_penalty=default_gap_penalty,
                 score_dict=getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 0))):
    """
    Locates the best matching primer within one end of a sequence

    An exact regular expression match is tried first on every candidate
    orientation; only if none matches is pairwise local alignment used.

    Arguments: 
    seq_record = a SeqRecord object to align primers against
    primers = dictionary of {names: short IUPAC ambiguous sequence strings}
    primers_regex = optional dictionary of {names: compiled primer regular expressions}
    max_error = maximum acceptable error rate before aligning reverse complement
    max_len = maximum length of sample sequence to align
    rev_primer = if True align with the tail end of the sequence
    skip_rc = if True do not check reverse complement sequences
    gap_penalty = a tuple of positive (gap open, gap extend) penalties
    score_dict = optional dictionary of alignment scores as {(char1, char2): score}

    Returns:
    A PrimerAlignment object
    """
    # Compile the primer patterns when the caller did not supply them
    if primers_regex is None:
        primers_regex = compilePrimers(primers)

    seq_record = seq_record.upper()
    rec_len = len(seq_record)
    max_len = min(rec_len, max_len)

    def _window(rec):
        # Slice of the record that is searched: head or tail, max_len wide
        full_seq = str(rec.seq)
        return full_seq[-max_len:] if rev_primer else full_seq[:max_len]

    # Result object, returned untouched when nothing aligns
    align = PrimerAlignment(seq_record)
    align.rev_primer = rev_primer

    # Candidate orientations: forward first, reverse complement second
    candidates = [seq_record]
    if not skip_rc:
        candidates.append(reverseComplement(seq_record))
    for rec, seq_orient in zip(candidates, ('F', 'RC')):
        rec.annotations['seqorient'] = seq_orient

    # Primer orientation tag is the same for every candidate
    primer_orient = 'RC' if rev_primer else 'F'
    for rec in candidates:
        rec.annotations['prorient'] = primer_orient

    # Stage 1: exact regular expression match, first hit wins
    for rec in candidates:
        window = _window(rec)
        for primer_id, pattern in primers_regex.items():
            hit = pattern.search(window)
            if not hit:
                continue

            hit_start, hit_end = hit.start(0), hit.end(0)
            align.seq = rec
            align.seq.annotations['primer'] = primer_id
            align.primer = primer_id
            align.align_seq = window
            align.align_primer = '-' * hit_start + \
                                 primers[primer_id] + \
                                 '-' * (max_len - hit_end)
            align.gaps = 0
            align.error = 0
            align.valid = True

            # Tail windows are offset from the start of the full record
            offset = rec_len - max_len if rev_primer else 0
            align.start = hit_start + offset
            align.end = hit_end + offset

            return align

    # Stage 2: local alignment of every primer against every candidate
    best_hit, best_rec, best_adpt, best_error = None, None, None, None
    for rec in candidates:
        window = _window(rec)
        hits = {}
        for primer_id, primer_seq in primers.items():
            found = pairwise2.align.localds(window, primer_seq, score_dict,
                                            -gap_penalty[0], -gap_penalty[1],
                                            one_alignment_only=True)
            if found:
                hits[primer_id] = found[0]
        if not hits:
            continue

        # Keep the alignment with the lowest error seen so far
        for primer_id, hit in hits.items():
            hit_error = 1.0 - hit[2] / len(primers[primer_id])
            if best_error is None or hit_error < best_error:
                best_hit, best_rec = hit, rec
                best_adpt, best_error = primer_id, hit_error

        # No need to try the reverse complement once the error is acceptable
        if best_error <= max_error:
            break

    # Nothing aligned at all
    if best_hit is None:
        return align

    # Aligned primer string and the number of gap columns it introduced
    align_primer = best_hit[1]
    align_len = len(align_primer)
    align_gaps = align_len - max_len

    align.seq = best_rec
    align.primer = best_adpt
    align.align_seq = str(best_hit[0])
    align.align_primer = align_primer
    align.gaps = align_gaps
    align.error = best_error
    align.valid = True

    if rev_primer:
        # Count position from tail and end gaps
        rev_pos = rec_len - align_len
        align.start = rev_pos + best_hit[3] + align_gaps
        align.end = rev_pos + len(align_primer.rstrip('-'))
    else:
        # TODO:  need to switch to an aligner that outputs start/end for both sequences in alignment
        align.start = align_len - len(align_primer.lstrip('-'))
        align.end = best_hit[4] - align_gaps

    return align
def maskPrimers(seq_file, primer_file, mode, align_func, align_args=None,
                max_error=default_max_error, barcode=False,
                out_args=default_out_args, nproc=None, queue_size=None):
    """
    Masks or cuts primers from sample sequences using local alignment

    Arguments: 
    seq_file = name of file containing sample sequences
    primer_file = name of the file containing primer sequences
    mode = defines the action taken; one of 'cut','mask','tag'
    align_func = the function to call for alignment
    align_args = a dictionary of arguments to pass to align_func;
                 the dictionary is copied and never modified in place
    max_error = maximum acceptable error rate for a valid alignment
    barcode = if True add sequence preceding primer to description
    out_args = common output argument dictionary from parseCommonArgs
    nproc = the number of processQueue processes;
            if None defaults to the number of CPUs
    queue_size = maximum size of the argument queue;
                 if None defaults to 2*nproc
                 
    Returns:
    a list of successful output file names
    """
    # Work on a private copy: keys are added below, and writing them into a
    # shared default or the caller's dictionary would leak state between calls
    align_args = dict(align_args) if align_args else {}

    # Define subcommand label dictionary
    cmd_dict = {alignPrimers:'align', scorePrimers:'score'}
    
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MaskPrimers'
    log['COMMAND'] = cmd_dict.get(align_func, align_func.__name__)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['PRIMER_FILE'] = os.path.basename(primer_file)
    log['MODE'] = mode
    log['BARCODE'] = barcode
    log['MAX_ERROR'] = max_error
    if 'start' in align_args: log['START_POS'] = align_args['start']
    if 'max_len' in align_args: log['MAX_LEN'] = align_args['max_len']
    if 'rev_primer' in align_args: log['REV_PRIMER'] = align_args['rev_primer']
    if 'skip_rc' in align_args: log['SKIP_RC'] = align_args['skip_rc']
    if 'gap_penalty' in align_args:
        log['GAP_PENALTY'] = ', '.join([str(x) for x in align_args['gap_penalty']])
    log['NPROC'] = nproc
    printLog(log)

    # Create dictionary of primer sequences to pass to align_func
    primers = readPrimerFile(primer_file)
    if 'rev_primer' in align_args and align_args['rev_primer']:
        primers = {k: reverseComplement(v) for k, v in primers.items()}

    # Define alignment arguments and compile primers for align mode
    align_args['primers'] = primers 
    align_args['score_dict'] = getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 0))
    if align_func is alignPrimers:
        align_args['max_error'] = max_error
        align_args['primers_regex'] = compilePrimers(primers)
    
    # Define sequence masking arguments
    mask_args = {'mode': mode, 
                 'barcode': barcode, 
                 'delimiter': out_args['delimiter']}

    # Define feeder function and arguments
    feed_func = feedSeqQueue
    feed_args = {'seq_file': seq_file}
    # Define worker function and arguments
    work_func = processMPQueue
    work_args = {'align_func': align_func, 
                 'align_args': align_args,
                 'mask_args': mask_args,
                 'max_error': max_error}
    
    # Define collector function and arguments
    collect_func = collectSeqQueue
    collect_args = {'seq_file': seq_file,
                    'task_label': 'primers',
                    'out_args': out_args}
    
    # Call process manager
    result = manageProcesses(feed_func, work_func, collect_func, 
                             feed_args, work_args, collect_args, 
                             nproc, queue_size)

    # Print log
    result['log']['END'] = 'MaskPrimers'
    printLog(result['log'])
        
    return result['out_files']
def scorePrimers(seq_record, primers, start=default_start, rev_primer=False, 
                 score_dict=getDNAScoreDict(mask_score=(0, 1), gap_score=(0, 0))):
    """
    Scores every primer against the sequence at a fixed position, without
    gaps and without checking the reverse complement

    Arguments: 
    seq_record = a SeqRecord object to align primers against
    primers = dictionary of {names: short IUPAC ambiguous sequence strings}
    start = position where primer alignment starts
    rev_primer = if True the position is counted back from the tail end of
                 the sequence and marks where the primer ends
    score_dict = optional dictionary of {(char1, char2): score} alignment scores
    
    Returns:
    A PrimerAlignment object
    """
    # Result object for the upper-cased record
    seq_record = seq_record.upper()
    align = PrimerAlignment(seq_record)
    align.rev_primer = rev_primer

    # Orientation tags
    seq_record.annotations['seqorient'] = 'F'
    seq_record.annotations['prorient'] = 'RC' if rev_primer else 'F'

    # Fixed edge of every primer window: its end for tail primers, its start otherwise
    rec_len = len(seq_record)
    anchor = rec_len - start if rev_primer else start

    # Score each primer position by position over its window
    scored = {}
    for primer_id, primer_seq in primers.items():
        if rev_primer:
            lo, hi = anchor - len(primer_seq), anchor
        else:
            lo, hi = anchor, anchor + len(primer_seq)
        pairs = zip(seq_record[lo:hi], primer_seq)
        total = sum(score_dict[(seq_char, primer_char)] for seq_char, primer_char in pairs)
        scored[primer_id] = (total, lo, hi)

    # Pick the primer with the lowest error rate; the first one wins ties
    best_hit, best_id, best_err = None, None, None
    for primer_id, hit in scored.items():
        err = 1.0 - float(hit[0]) / len(primers[primer_id])
        if best_err is None or err < best_err:
            best_hit, best_id, best_err = hit, primer_id, err

    # No primers were supplied
    if best_hit is None:
        return align

    lo, hi = best_hit[1], best_hit[2]
    # A primer with an error of 1.0 or more is not reported by name
    align.primer = best_id if best_err < 1.0 else None
    align.start = lo
    align.end = hi
    align.error = best_err
    align.valid = True

    # Build the display strings, padding the primer with gap characters
    if rev_primer:
        align.align_seq = str(seq_record.seq[lo:])
        align.align_primer = primers[best_id] + '-' * (rec_len - hi)
    else:
        align.align_seq = str(seq_record.seq[:hi])
        align.align_primer = '-' * lo + primers[best_id]

    return align