Example #1
    def test_read_denoiser_mapping(self):
        """read_denoiser_mapping reads correctly"""

        mapping = """1:\t2\t3
4:\t5\t6
7:""".split("\n")
        expected = {'1': ['2', '3'], '4': ['5', '6'], '7': []}
        self.assertEqual(read_denoiser_mapping(mapping), expected)

        # empty mapping gives empty result
        self.assertEqual(read_denoiser_mapping([]), {})
Example #2
    def test_read_denoiser_mapping(self):
        """read_denoiser_mapping reads correctly"""

        mapping = """1:\t2\t3
4:\t5\t6
7:""".split(
            "\n"
        )
        expected = {"1": ["2", "3"], "4": ["5", "6"], "7": []}
        self.assertEqual(read_denoiser_mapping(mapping), expected)

        # empty mapping gives empty result
        self.assertEqual(read_denoiser_mapping([]), {})
Example #3
    def test_read_denoiser_mapping(self):
        """read_denoiser_mapping reads correctly"""

        mapping = """1:\t2\t3
4:\t5\t6
7:""".split("\n")
        expected = {'1': ['2', '3'],
                    '4': ['5', '6'],
                    '7': []}
        self.assertEqual(read_denoiser_mapping(mapping),
                         expected)

        # empty mapping gives empty result
        self.assertEqual(read_denoiser_mapping([]), {})
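For orientation, here is a minimal sketch of what read_denoiser_mapping has to do to satisfy the tests above. It is inferred from the expected dictionary, not taken from the QIIME source, so the real implementation may differ in detail.

def read_denoiser_mapping(mapping_fh):
    """Build {centroid_id: [member_ids]} from lines like '1:\t2\t3'."""
    result = {}
    for line in mapping_fh:
        line = line.strip()
        if not line:
            continue
        # centroid id sits before the colon, cluster members after it
        centroid, _, members = line.partition(":")
        result[centroid] = members.split()
    return result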
Example #4
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's split_libraries.py,
              used to replace the flowgram id with the unique se_sample_id

    mapping_fh: the cluster mapping from denoiser.py

    denoised_seqs_fh: the fasta output file from denoiser.py

    otu_picker_otu_map_fh: cluster map from the OTU picker run on denoised_seqs_fh

    out_dir: output directory
    """

    # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], MinimalFastaParser(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    otu_fh = open(out_dir + "/denoised_otu_map.txt", "w")
    for otu_line in otu_picker_otu_map_fh:
        otu_split = otu_line.split()

        otu = otu_split[0]
        ids = otu_split[1:]

        get_sample_id = sample_id_mapping.get
        # concat lists
        # make sure the biggest one is first for pick_repr
        all_ids = sort_ids(ids, denoiser_mapping)
        all_ids.extend(sum([denoiser_mapping[id] for id in ids], []))
        try:
            otu_fh.write("%s\t" % otu +
                         "\t".join(map(get_sample_id, all_ids)) + "\n")
        except TypeError:
            # get returns None if a denoiser_mapping id is not present in
            # sample_id_mapping
            print "Found id in denoiser output, which was not found in split_libraries " +\
                "output FASTA file. Wrong file?"
            exit()

    fasta_out_fh = open(out_dir + "/denoised_all.fasta", "w")
    for label, seq in MinimalFastaParser(denoised_seqs_fh):
        id = label.split()[0]
        newlabel = "%s %s" % (sample_id_mapping[id], id)
        fasta_out_fh.write(Sequence(name=newlabel, seq=seq).toFasta() + "\n")
Example #5
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's split_libraries.py,
              used to replace the flowgram id with the unique se_sample_id

    mapping_fh: the cluster mapping from denoiser.py

    denoised_seqs_fh: the fasta output file from denoiser.py

    otu_picker_otu_map_fh: cluster map from the OTU picker run on denoised_seqs_fh

    out_dir: output directory
    """

    # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    otu_fh = open(out_dir + "/denoised_otu_map.txt", "w")
    for otu_line in otu_picker_otu_map_fh:
        otu_split = otu_line.split()

        otu = otu_split[0]
        ids = otu_split[1:]

        get_sample_id = sample_id_mapping.get
        # concat lists
        # make sure the biggest one is first for pick_repr
        all_ids = sort_ids(ids, denoiser_mapping)
        all_ids.extend(sum([denoiser_mapping[id] for id in ids], []))
        try:
            otu_fh.write("%s\t" % otu +
                         "\t".join(map(get_sample_id, all_ids)) + "\n")
        except TypeError:
            # get returns None if a denoiser_mapping id is not present in
            # sample_id_mapping
            print "Found id in denoiser output, which was not found in split_libraries " +\
                "output FASTA file. Wrong file?"
            exit()

    fasta_out_fh = open(out_dir + "/denoised_all.fasta", "w")
    for label, seq in parse_fasta(denoised_seqs_fh):
        id = label.split()[0]
        newlabel = "%s %s" % (sample_id_mapping[id], id)
        fasta_out_fh.write(BiologicalSequence(seq, id=newlabel).to_fasta())
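A hedged usage sketch for combine_mappings: the file names below are placeholders that do not appear in the examples, and the function expects open file handles plus an output directory, as described in the docstring above.

# hypothetical input paths, named here only for illustration
fasta_fh = open("seqs.fna")                  # labels from split_libraries.py
mapping_fh = open("denoiser_mapping.txt")    # denoiser cluster mapping
denoised_seqs_fh = open("denoised.fasta")    # fasta output of denoiser.py
otu_map_fh = open("otu_map.txt")             # OTU picker map on the denoised seqs

# combined_out/ must already exist; combine_mappings writes
# denoised_otu_map.txt and denoised_all.fasta into it
combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh, otu_map_fh, "combined_out")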
Example #6
def read_preprocessed_data(out_fp="/tmp/"):
    """Read data of a previous preprocessing run.

    out_fp: output directory of previous preprocess run.
            Supposed to contain two files:
              - prefix_dereplicated.fasta
              - prefix_mapping.txt
    """
    # read mapping, and extract seqs
    # mapping has fasta_header like this:
    #  > id:   count

    seqs = dict([(a.split(":")[0], b) for (a, b) in (parse_fasta(open(out_fp + "/prefix_dereplicated.fasta")))])

    mapping = read_denoiser_mapping(open(out_fp + "/prefix_mapping.txt"))

    return (out_fp + "/prefix_dereplicated.sff.txt", len(mapping), mapping, seqs)
Example #7
def read_preprocessed_data(out_fp="/tmp/"):
    """Read data of a previous preprocessing run.

    out_fp: output directory of previous preprocess run.
            Supposed to contain two files:
              - prefix_dereplicated.fasta
              - prefix_mapping.txt
    """
    # read mapping, and extract seqs
    # mapping has fasta_header like this:
    #  > id:   count

    seqs = dict([(a.split(':')[0], b) for (a, b) in
                 (MinimalFastaParser(open(out_fp + "/prefix_dereplicated.fasta")))])

    mapping = read_denoiser_mapping(open(out_fp + "/prefix_mapping.txt"))

    return (out_fp + "/prefix_dereplicated.sff.txt", len(mapping), mapping, seqs)
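A short, hedged call sketch for read_preprocessed_data: the directory name is a placeholder and must contain the two files named in the docstring (plus the prefix_dereplicated.sff.txt referenced in the return value).

# hypothetical directory from an earlier preprocessing run
sff_fp, n_clusters, mapping, seqs = read_preprocessed_data("/tmp/preprocess_run")
# sff_fp     -> path to prefix_dereplicated.sff.txt
# n_clusters -> number of entries in the denoiser mapping
# mapping    -> {centroid_id: [member_ids]}
# seqs       -> sequences from prefix_dereplicated.fasta, keyed by id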
Example #8
def denoise_per_sample(sff_fps, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for the global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(
            parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping),
        fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(
            parse_fasta(open(singleton_file, "r")),
            fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
Example #9
def denoise_per_sample(sff_fps,
                       fasta_fp,
                       tmpoutdir,
                       cluster=False,
                       num_cpus=1,
                       squeeze=True,
                       percent_id=0.97,
                       bail=1,
                       primer="",
                       low_cutoff=3.75,
                       high_cutoff=4.5,
                       log_fp="denoiser.log",
                       low_memory=False,
                       verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None,
                       titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for the global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster, num_cpus,
                     squeeze, percent_id, bail, primer, low_cutoff,
                     high_cutoff, log_fp, low_memory, verbose, error_profile,
                     max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(parse_fasta(open(singleton_file, "r")),
                                        fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
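Finally, a hedged invocation sketch for denoise_per_sample: the SFF and FASTA paths are placeholders, and all keyword arguments keep the defaults shown in the signature unless overridden.

# hypothetical input files; one flowgram file per 454 run plus the
# split_libraries.py FASTA used to assign reads to samples
tmpoutdir = denoise_per_sample(["run_1.sff.txt", "run_2.sff.txt"],
                               "seqs.fna",
                               "/tmp/denoiser_per_sample",
                               num_cpus=2,
                               verbose=True)
# per-sample results land in /tmp/denoiser_per_sample/0, /1, ... and the
# combined denoised.fasta and denoiser mapping are written to the top level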