def test_read_denoiser_mapping(self):
    """read_denoiser_mapping reads correctly"""
    mapping = """1:\t2\t3
4:\t5\t6
7:""".split("\n")

    expected = {'1': ['2', '3'], '4': ['5', '6'], '7': []}
    self.assertEqual(read_denoiser_mapping(mapping), expected)

    # empty mapping gives empty result
    self.assertEqual(read_denoiser_mapping([]), {})
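
# Illustrative sketch (not part of the original module): a minimal
# read_denoiser_mapping-style helper that satisfies the expectations encoded
# in the test above. It assumes each mapping line is a centroid id with a
# trailing colon followed by tab-separated member ids ("1:\t2\t3") and
# returns {centroid_id: [member_id, ...]}; the real QIIME implementation may
# differ in details such as error handling.
def _read_denoiser_mapping_sketch(mapping_fh):
    """Read a denoiser cluster mapping into a dict (hypothetical helper)."""
    result = {}
    for line in mapping_fh:
        line = line.strip()
        if not line:
            continue
        fields = line.split("\t")
        # first field is the centroid id with a trailing colon, e.g. "1:"
        centroid = fields[0].rstrip(":")
        result[centroid] = fields[1:]
    return result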
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's
              split_libraries.py, used to replace each flowgram id with its
              unique sample_id-prefixed sequence id

    mapping_fh: the cluster mapping from denoiser.py

    denoised_seqs_fh: the fasta output file from denoiser.py

    otu_picker_otu_map_fh: cluster map from the OTU picker run on
              denoised_seqs_fh

    out_dir: output directory
    """
    # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)

    denoiser_mapping = read_denoiser_mapping(mapping_fh)

    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    otu_fh = open(out_dir + "/denoised_otu_map.txt", "w")
    for otu_line in otu_picker_otu_map_fh:
        otu_split = otu_line.split()

        otu = otu_split[0]
        ids = otu_split[1:]

        get_sample_id = sample_id_mapping.get
        # concat lists
        # make sure the biggest one is first for pick_repr
        all_ids = sort_ids(ids, denoiser_mapping)
        all_ids.extend(sum([denoiser_mapping[id] for id in ids], []))
        try:
            otu_fh.write("%s\t" % otu +
                         "\t".join(map(get_sample_id, all_ids)) + "\n")
        except TypeError:
            # get returns None if a denoiser_mapping id is not present in
            # sample_id_mapping
            print "Found id in denoiser output, which was not found in split_libraries " +\
                "output FASTA file. Wrong file?"
            exit()

    fasta_out_fh = open(out_dir + "/denoised_all.fasta", "w")
    for label, seq in parse_fasta(denoised_seqs_fh):
        id = label.split()[0]
        newlabel = "%s %s" % (sample_id_mapping[id], id)
        fasta_out_fh.write(BiologicalSequence(seq, id=newlabel).to_fasta())
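
# Usage sketch (hypothetical file names, not from the original source): wire
# combine_mappings up with the split_libraries fasta, the denoiser cluster
# map, the denoised sequences, and an OTU picker map. Assumes the usual QIIME
# denoiser file layout and that out_dir already exists.
def _combine_mappings_example():
    fasta_fh = open("seqs.fna")                   # split_libraries.py output
    mapping_fh = open("denoiser_mapping.txt")     # denoiser.py cluster map
    denoised_seqs_fh = open("denoised.fasta")     # denoiser.py fasta output
    otu_map_fh = open("otu_picker_otu_map.txt")   # OTU picker cluster map
    combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh, otu_map_fh,
                     out_dir="combined_out")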
def read_preprocessed_data(out_fp="/tmp/"):
    """Read data of a previous preprocessing run.

    out_fp: output directory of previous preprocess run.
            Supposed to contain two files:
              - prefix_dereplicated.fasta
              - prefix_mapping.txt
    """
    # read mapping, and extract seqs
    # mapping has fasta_header like this:
    # > id: count
    seqs = dict([(a.split(":")[0], b) for (a, b) in
                 (parse_fasta(open(out_fp + "/prefix_dereplicated.fasta")))])
    mapping = read_denoiser_mapping(open(out_fp + "/prefix_mapping.txt"))

    return (out_fp + "/prefix_dereplicated.sff.txt",
            len(mapping), mapping, seqs)
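
# Usage sketch (hypothetical directory, not from the original source): reuse
# the artifacts of an earlier preprocessing run instead of recomputing them.
# The directory is assumed to hold prefix_dereplicated.fasta and
# prefix_mapping.txt as described in the docstring above.
def _read_preprocessed_data_example():
    (deprefixed_sff_fp, num_clusters, mapping, seqs) = \
        read_preprocessed_data("/tmp/my_previous_run/")
    # mapping: prefix centroid id -> list of reads it represents
    # seqs:    prefix centroid id -> dereplicated sequence string
    return deprefixed_sff_fp, num_clusters, mapping, seqs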
def denoise_per_sample(sff_fps, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iterations: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)

    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(
            parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)

    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(
            parse_fasta(open(singleton_file, "r")), fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
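
# Usage sketch (hypothetical paths, not from the original source): run the
# per-sample denoiser on two flowgram files against the corresponding
# split_libraries fasta file. Assumes the output directory already exists
# and is writable.
def _denoise_per_sample_example():
    out_dir = denoise_per_sample(
        ["run1.sff.txt", "run2.sff.txt"],    # one or more flowgram files
        "seqs.fna",                          # split_libraries.py output
        "/tmp/denoiser_out",                 # tmpoutdir for all results
        num_cpus=2,
        verbose=True)
    return out_dir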