def process_mapping_file(map_f, barcode_len, barcode_type, BARCODE_COLUMN,
                         REVERSE_PRIMER_COLUMN):
    """Build barcode lookup tables from a metadata mapping file.

    The mapping file is first passed to ``check_map``; it is then rewound
    and re-parsed with ``parse_mapping_file_to_dict`` so the reverse primers
    can be collected per barcode. Finally the barcodes are handed to
    ``check_barcodes`` together with the expected length and type.

    Parameters
    ----------
    map_f: file
        metadata mapping file (must support ``seek``)
    barcode_len: int
        barcode length
    barcode_type: string
        barcode type, can be either integer or golay_12
    BARCODE_COLUMN: string
        header of barcode column
    REVERSE_PRIMER_COLUMN: string
        header of the reverse primer column

    Returns
    ----------
    bc_to_sid: dict
        as returned by ``check_map``
    bc_to_fwd_primers: dict
        as returned by ``check_map``
    bc_to_rev_primers: dict
        barcode column value -> result of ``expand_degeneracies`` on the
        upper-cased, comma-separated reverse primer entry

    Raises
    ----------
    Exception
        if a sample's metadata has no REVERSE_PRIMER_COLUMN entry
    """
    # Only the third and sixth values returned by check_map are used here.
    _, _, bc_to_sid, _, _, bc_to_fwd_primers, _ = check_map(map_f, False)

    # check_map has consumed the file; rewind before parsing it again.
    map_f.seek(0)
    metadata_map = parse_mapping_file_to_dict(map_f)[0]

    bc_to_rev_primers = {}
    for sample_md in metadata_map.values():
        if REVERSE_PRIMER_COLUMN not in sample_md:
            raise Exception(
                "The %s column does not exist in the "
                "mapping file. %s is required." %
                (REVERSE_PRIMER_COLUMN, REVERSE_PRIMER_COLUMN))

        raw_rev_primers = sample_md[REVERSE_PRIMER_COLUMN].upper().split(',')
        bc_to_rev_primers[sample_md[BARCODE_COLUMN]] = expand_degeneracies(
            raw_rev_primers)

    check_barcodes(bc_to_sid, barcode_len, barcode_type)

    return (bc_to_sid, bc_to_fwd_primers, bc_to_rev_primers)
def get_mapping_details(mapping_fp):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file

    Returns a 3-tuple of sets: the sample IDs (first column of every data
    row), the values of the 'BarcodeSequence' column, and the
    'LinkerPrimerSequence' values after being passed through
    expand_degeneracies.

    Raises ValueError if process_id_map reports any error for the file.
    """
    # NOTE: "U" (universal newlines) is kept for Python 2 behaviour; the
    # mode is deprecated on Python 3 and rejected from 3.11 onwards.
    # The with-block guarantees the handle is closed even if parsing raises.
    with open(mapping_fp, "U") as mapping_f:
        # Only the headers, the data rows and the errors are used.
        hds, mapping_data, _, errors, _ = process_id_map(mapping_f)

    # Errors means problems with SampleIDs or headers
    if errors:
        raise ValueError('Error in mapping file, please validate '
                         'mapping file with check_id_map.py')

    # create dict of dicts with SampleID:{each header:mapping data}
    # Indexing (rather than zip) keeps the IndexError for rows that are
    # shorter than the header line.
    id_map = {}
    for row in mapping_data:
        id_map[row[0]] = {header: row[idx] for idx, header in enumerate(hds)}

    barcode_seqs = set(md['BarcodeSequence'] for md in id_map.values())
    # A set removes duplicate primers before they are expanded.
    raw_linkerprimer_seqs = set(
        md['LinkerPrimerSequence'] for md in id_map.values())

    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
def get_mapping_details(mapping_fp, suppress_barcode_checks=False,
                        suppress_primer_checks=False):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file
    suppress_barcode_checks=If True, will skip getting barcodes from mapping
     file and searching for these in sequences.
    suppress_primer_checks=If True, will skip getting primers from mapping
     file and searching for these in sequences

    Returns a 3-tuple of sets: the sample IDs (first column of every data
    row), the 'BarcodeSequence' values (empty when barcode checks are
    suppressed), and the 'LinkerPrimerSequence' values after being passed
    through expand_degeneracies (no primers are collected when primer
    checks are suppressed).

    Raises ValueError for header errors, duplicate SampleID errors, and
    for barcode/primer errors that have not been suppressed.
    """
    # NOTE: "U" (universal newlines) is kept for Python 2 behaviour; the
    # mode is deprecated on Python 3 and rejected from 3.11 onwards.
    # The with-block guarantees the handle is closed even if parsing raises.
    with open(mapping_fp, "U") as mapping_f:
        # Only the headers, the data rows and the errors are used.
        hds, mapping_data, _, errors, _ = process_id_map(mapping_f)

    # Should raise errors for barcodes or primers unless suppressed, and
    # should raise errors for headers or duplicate SampleIDs in any case.
    # Error strings ending in these suffixes are treated as barcode and
    # primer errors respectively (presumably a trailing column index --
    # confirm against process_id_map).
    loc_bcs = ",1"
    loc_primers = ",2"
    invalid_map_msg = ('Error in mapping file, please validate '
                       'mapping file with validate_mapping_file.py')

    for curr_error in errors or ():
        # The order of these tests matters: the first matching category
        # decides whether the error is fatal.
        if curr_error.startswith("Found header field"):
            fatal = True
        elif curr_error.endswith(loc_bcs):
            fatal = not suppress_barcode_checks
        elif curr_error.endswith(loc_primers):
            fatal = not suppress_primer_checks
        else:
            fatal = curr_error.startswith("Duplicate SampleID")
        if fatal:
            raise ValueError(invalid_map_msg)

    # create dict of dicts with SampleID:{each header:mapping data}
    # Indexing (rather than zip) keeps the IndexError for rows that are
    # shorter than the header line.
    id_map = {}
    for row in mapping_data:
        id_map[row[0]] = {header: row[idx] for idx, header in enumerate(hds)}

    barcode_seqs = set()
    raw_linkerprimer_seqs = set()
    for sample_md in id_map.values():
        if not suppress_barcode_checks:
            barcode_seqs.add(sample_md['BarcodeSequence'])
        if not suppress_primer_checks:
            # Collected in a set so duplicates are expanded only once.
            raw_linkerprimer_seqs.add(sample_md['LinkerPrimerSequence'])

    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
def get_mapping_details(mapping_fp, suppress_barcode_checks=False,
                        suppress_primer_checks=False):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file
    suppress_barcode_checks=If True, will skip getting barcodes from mapping
     file and searching for these in sequences.
    suppress_primer_checks=If True, will skip getting primers from mapping
     file and searching for these in sequences

    Returns a tuple (sample_ids, barcode_seqs, linker_primer_seqs), each a
    set. sample_ids holds the first column of every data row; barcode_seqs
    holds the 'BarcodeSequence' values and is empty when barcode checks are
    suppressed; linker_primer_seqs holds the output of expand_degeneracies
    for the 'LinkerPrimerSequence' values, which are not collected when
    primer checks are suppressed.

    Raises ValueError on header errors and duplicate SampleID errors, and
    on barcode or primer errors unless the matching suppress flag is set.
    """
    # NOTE: "U" (universal newlines) is kept for Python 2 behaviour; the
    # mode is deprecated on Python 3 and rejected from 3.11 onwards.
    # Using a context manager closes the file even when parsing fails.
    with open(mapping_fp, "U") as mapping_f:
        # Only the headers, the data rows and the errors are used.
        hds, mapping_data, _, errors, _ = process_id_map(mapping_f)

    # Suffixes used to recognise barcode and primer errors (presumably a
    # trailing column index -- confirm against process_id_map).
    loc_bcs = ",1"
    loc_primers = ",2"

    def halts_processing(error_msg):
        """Return True if error_msg must abort, honouring suppressions."""
        # First matching category wins, so the test order is significant.
        if error_msg.startswith("Found header field"):
            return True
        if error_msg.endswith(loc_bcs):
            return not suppress_barcode_checks
        if error_msg.endswith(loc_primers):
            return not suppress_primer_checks
        return error_msg.startswith("Duplicate SampleID")

    # Should raise errors for barcodes or primers unless suppressed, and
    # should raise errors for headers or duplicate SampleIDs in any case.
    for curr_error in errors or ():
        if halts_processing(curr_error):
            raise ValueError('Error in mapping file, please validate '
                             'mapping file with validate_mapping_file.py')

    # create dict of dicts with SampleID:{each header:mapping data}
    # Rows are indexed by position so a row shorter than the header line
    # still raises IndexError.
    id_map = {}
    for row in mapping_data:
        id_map[row[0]] = {header: row[idx] for idx, header in enumerate(hds)}

    barcode_seqs = set()
    raw_linkerprimer_seqs = set()
    for sample_md in id_map.values():
        if not suppress_barcode_checks:
            barcode_seqs.add(sample_md['BarcodeSequence'])
        if not suppress_primer_checks:
            # A set removes duplicate primers before they are expanded.
            raw_linkerprimer_seqs.add(sample_md['LinkerPrimerSequence'])

    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(id_map), barcode_seqs, set(linker_primer_seqs)
def test_expand_degeneracies(self):
    """expand_degeneracies should return the expected list of strings"""
    # (input, expected output) pairs, checked in this order.
    cases = (
        ('ACG', ['ACG']),
        ('RGY', ['AGT', 'AGC', 'GGT', 'GGC']),
    )
    for degenerate_seq, expected in cases:
        self.assertEqual(expand_degeneracies(degenerate_seq), expected)