def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    The "ReversePrimer" column is always inspected; the
    "LinkerPrimerSequence" column is inspected as well unless
    disable_primer_check is True.  For every inspected cell an empty value
    is reported as a missing sequence, and a value containing any character
    outside DNASequence.iupac_characters() plus ',' is reported as an
    invalid sequence (',' is presumably accepted as a separator between
    several primers in one cell -- confirm against the mapping file spec).

    Each error is a string of the form "<message>\\t<row>,<column>", where
    row is the index into mapping_data plus one (the header is counted as
    row 0) and column is the index into header.  All missing-sequence errors
    are appended before any invalid-sequence errors.

    header: list of header strings
    mapping_data: list of lists of raw metadata mapping file data
    errors: list of errors; new errors are appended in place
    disable_primer_check: If True, disables tests for valid primer sequences
     in the LinkerPrimerSequence column.

    Returns the same errors list that was passed in.
    """

    # Work on a copy so that adding ',' can never leak into a set that
    # DNASequence might share between calls.
    valid_dna_chars = set(DNASequence.iupac_characters())
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [field_ix for field_ix, field in enumerate(header)
                     if field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data
    for row_ix, row in enumerate(mapping_data):
        for col_ix in check_indices:
            if not row[col_ix]:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, col_ix))

    # Check for non-DNA characters.  A cell is reported once, no matter how
    # many invalid characters it holds; the message carries the whole cell,
    # so repeating it per character would only add identical duplicates.
    for row_ix, row in enumerate(mapping_data):
        for col_ix in check_indices:
            cell = row[col_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in cell):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (cell, row_ix + correction_ix, col_ix))

    return errors
def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    The "ReversePrimer" column is always inspected; the
    "LinkerPrimerSequence" column is inspected as well unless
    disable_primer_check is True.  For every inspected cell an empty value
    is reported as a missing sequence, and a value containing any character
    outside DNASequence.iupac_characters() plus ',' is reported as an
    invalid sequence (',' is presumably accepted as a separator between
    several primers in one cell -- confirm against the mapping file spec).

    Each error is a string of the form "<message>\\t<row>,<column>", where
    row is the index into mapping_data plus one (the header is counted as
    row 0) and column is the index into header.  All missing-sequence errors
    are appended before any invalid-sequence errors.

    header: list of header strings
    mapping_data: list of lists of raw metadata mapping file data
    errors: list of errors; new errors are appended in place
    disable_primer_check: If True, disables tests for valid primer sequences
     in the LinkerPrimerSequence column.

    Returns the same errors list that was passed in.
    """

    # Work on a copy so that adding ',' can never leak into a set that
    # DNASequence might share between calls.
    valid_dna_chars = set(DNASequence.iupac_characters())
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [field_ix for field_ix, field in enumerate(header)
                     if field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data
    for row_ix, row in enumerate(mapping_data):
        for col_ix in check_indices:
            if not row[col_ix]:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, col_ix))

    # Check for non-DNA characters.  A cell is reported once, no matter how
    # many invalid characters it holds; the message carries the whole cell,
    # so repeating it per character would only add identical duplicates.
    for row_ix, row in enumerate(mapping_data):
        for col_ix in check_indices:
            cell = row[col_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in cell):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (cell, row_ix + correction_ix, col_ix))

    return errors
def get_consensus(fasta_tempfile, min_consensus):
    """
    Returns consensus sequence from a set of sequences

    input: fasta file, min_consensus
    fasta_file should be in the following format:
    >random_bc|number
    seq
    >random_bc|number
    seq
    ....
    number = number of times this seq has appeared with this random_barcode

    At every position the base with the largest summed count wins.  When
    several bases share that largest sum, the one that occurs in the single
    most abundant sequence at that position is preferred.

    Parameters
    ----------
    fasta_tempfile:
        fasta input, passed unchanged to parse_fasta
    min_consensus: float
        lowest acceptable per-position score; the score is computed as
        10.0 * (summed count of the winning base) / (number of records)

    Returns
    ----------
    consensus_seq: string
        consensus sequence for the given list of sequences

    Raises
    ----------
    SeqLengthMismatchError
        if the sequences do not all have the length of the first one
    LowConsensusScoreError
        if the score at any position is below min_consensus
    IndexError
        if the input yields no sequences at all
    """
    seqs = []
    counts = []

    for label, seq in parse_fasta(fasta_tempfile):
        # The digits after '|' in the label are this sequence's count.
        RE_output = search(r'\w+\|(\d+)', label)
        counts.append(int(RE_output.group(1)))
        seqs.append(seq)

    length = len(seqs[0])
    number_of_seqs = len(seqs)

    for curr_seq in seqs:
        if len(curr_seq) != length:
            raise SeqLengthMismatchError()

    # Loop-invariant: fetch the alphabet once instead of once per position.
    iupac_chars = DNASequence.iupac_characters()

    # Per position: summed count of every base, and the largest count of
    # any single sequence that carries that base.
    freq_this_pos_this_base = {}
    count_of_seq_with_max_count = {}

    for pos in range(length):
        freq_this_pos_this_base[pos] = dict.fromkeys(iupac_chars, 0)
        count_of_seq_with_max_count[pos] = dict.fromkeys(iupac_chars, 0)

        for curr_seq, curr_count in zip(seqs, counts):
            base = curr_seq[pos]
            freq_this_pos_this_base[pos][base] += curr_count
            if curr_count > count_of_seq_with_max_count[pos][base]:
                count_of_seq_with_max_count[pos][base] = curr_count

    consensus = []
    for index in range(length):
        # .items() rather than .iteritems() so this runs on Python 2 and 3.
        sorted_bases = sorted(
            freq_this_pos_this_base[index].items(),
            key=lambda x: x[1])
        max_base, max_freq = sorted_bases[-1]

        # Tie-break among bases sharing the top frequency.  The lookup must
        # use this position (index); indexing by the rank inside
        # sorted_bases compared counts from an unrelated position.
        max_counts_here = count_of_seq_with_max_count[index]
        for b, n in sorted_bases:
            if max_freq == n and max_counts_here[b] > max_counts_here[max_base]:
                max_base = b

        # NOTE(review): max_freq is a sum of per-record counts while
        # number_of_seqs is the number of records, so the score is not
        # bounded by 10 -- confirm this is the scale min_consensus expects.
        score = 10.0 * max_freq / number_of_seqs
        if score < min_consensus:
            raise LowConsensusScoreError()
        consensus.append(max_base)

    consensus_seq = ''.join(map(str, consensus))
    return consensus_seq