Beispiel #1
0
def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False
                            ):
    """ Checks for valid DNA characters in primer fields

    Looks up the "ReversePrimer" column (always) and the
    "LinkerPrimerSequence" column (unless disable_primer_check is True) by
    header name, then reports every cell in those columns that is empty or
    that contains a character other than an IUPAC DNA character or a comma.

    Each problem is appended to errors as a message followed by a tab and
    "row,column", where row is the index into mapping_data plus one (the
    header counts as row 0) and column is the index into header.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place
    disable_primer_check:  If True, the LinkerPrimerSequence column is not
     checked. The ReversePrimer column is still checked if present.

    Returns the same errors list that was passed in.
    """

    # Work on a copy so that adding the comma can never modify a set that
    # DNASequence might share between calls.
    valid_dna_chars = set(DNASequence.iupac_characters())
    # Commas are accepted inside primer fields (presumably to separate
    # several primers in one cell -- confirm against the mapping file spec).
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [curr_ix for curr_ix, curr_field in enumerate(header)
                     if curr_field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data.  Kept as a separate pass so that all "missing"
    # errors precede all "invalid" errors in the returned list.
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            if len(row[curr_ix]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))

    # Check for non-DNA characters.  Each offending cell is reported once,
    # no matter how many invalid characters it contains.
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            curr_seq = row[curr_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in curr_seq):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (curr_seq, row_ix + correction_ix, curr_ix))

    return errors
Beispiel #2
0
def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    Looks up the "ReversePrimer" column (always) and the
    "LinkerPrimerSequence" column (unless disable_primer_check is True) by
    header name, then reports every cell in those columns that is empty or
    that contains a character other than an IUPAC DNA character or a comma.

    Each problem is appended to errors as a message followed by a tab and
    "row,column", where row is the index into mapping_data plus one (the
    header counts as row 0) and column is the index into header.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place
    disable_primer_check:  If True, the LinkerPrimerSequence column is not
     checked. The ReversePrimer column is still checked if present.

    Returns the same errors list that was passed in.
    """

    # Work on a copy so that adding the comma can never modify a set that
    # DNASequence might share between calls.
    valid_dna_chars = set(DNASequence.iupac_characters())
    # Commas are accepted inside primer fields (presumably to separate
    # several primers in one cell -- confirm against the mapping file spec).
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [curr_ix for curr_ix, curr_field in enumerate(header)
                     if curr_field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data.  Kept as a separate pass so that all "missing"
    # errors precede all "invalid" errors in the returned list.
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            if len(row[curr_ix]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))

    # Check for non-DNA characters.  Each offending cell is reported once,
    # no matter how many invalid characters it contains.
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            curr_seq = row[curr_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in curr_seq):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (curr_seq, row_ix + correction_ix, curr_ix))

    return errors
def get_consensus(fasta_tempfile, min_consensus):
    """
    Returns consensus sequence from a set of sequences
    input: fasta file, min_consensus
    fasta_file should be in the following format:
    >random_bc|number
    seq
    >random_bc|number
    seq
    ....

    number = number of times this seq has appeared with this random_barcode

    At every position the base with the highest summed count wins.  When
    several bases share that highest summed count, the one carried by the
    single most abundant sequence at that position is preferred.

    Parameters
    ----------
    fasta_tempfile: fasta input handed straight to parse_fasta
        (presumably an open file or a path -- confirm against parse_fasta)
    min_consensus: float
        lowest acceptable per-position score; the score is computed as
        10.0 * (summed count of the winning base) / (number of records)
    Returns
    ----------
    consensus_seq: string
        consensus sequence for the given list of sequences
    Raises
    ----------
    SeqLengthMismatchError
        if the sequences do not all have the same length
    LowConsensusScoreError
        if the score at any position is below min_consensus
    """
    seqs = list()
    counts = list()

    for label, seq in parse_fasta(fasta_tempfile):
        # The abundance is the integer after the '|' in the label.
        RE_output = search(r'\w+\|(\d+)', label)
        counts.append(int(RE_output.group(1)))
        seqs.append(seq)

    length = len(seqs[0])
    number_of_seqs = len(seqs)

    for seq in seqs:
        if len(seq) != length:
            raise SeqLengthMismatchError()

    iupac_chars = DNASequence.iupac_characters()

    # Per position: summed count of every base, and the largest count of any
    # single sequence carrying that base (used only to break ties).
    freq_this_pos_this_base = list()
    count_of_seq_with_max_count = list()

    for x in range(length):
        freq_this_base = dict.fromkeys(iupac_chars, 0)
        max_count_this_base = dict.fromkeys(iupac_chars, 0)

        for seq, this_seq_count in zip(seqs, counts):
            base = seq[x]
            freq_this_base[base] += this_seq_count
            if this_seq_count > max_count_this_base[base]:
                max_count_this_base[base] = this_seq_count

        freq_this_pos_this_base.append(freq_this_base)
        count_of_seq_with_max_count.append(max_count_this_base)

    consensus = list()
    for index in range(length):
        # .items() rather than .iteritems() so this runs on Python 2 and 3.
        sorted_bases = sorted(
            freq_this_pos_this_base[index].items(),
            key=lambda x: x[1])
        max_base, max_freq = sorted_bases[-1]

        # Tie-break among bases sharing the top summed count.  The lookup
        # must use the current sequence position (index); both dicts hold
        # every IUPAC character, so no KeyError can occur here.
        max_count_this_base = count_of_seq_with_max_count[index]
        for b, n in sorted_bases:
            if (n == max_freq and
                    max_count_this_base[b] > max_count_this_base[max_base]):
                max_base = b

        # NOTE(review): the denominator is the number of fasta records, not
        # the total abundance (sum of counts), so the score can exceed 10.
        # Left unchanged because callers' thresholds depend on it -- confirm.
        score = 10.0 * max_freq / number_of_seqs
        if score < min_consensus:
            raise LowConsensusScoreError()
        consensus.append(max_base)

    consensus_seq = ''.join(map(str, consensus))
    return consensus_seq