Example #1
0
   def test_sort_ids(self):
        """sort_ids sorts by abundance"""

        mapping = {"1":["0","2","5","6"],
                   "3":[],
                   "4":[],
                   "11":[1,2,3,4,5,6,7,8,9],
                   "8":["7"]}

        self.assertEqual(sort_ids(["1","3","4","8","11"], mapping),\
                            ["11","1","8","4","3"])
Example #2
0
   def test_sort_ids(self):
        """sort_ids sorts by abundance"""

        mapping = {"1":["0","2","5","6"],
                   "3":[],
                   "4":[],
                   "11":[1,2,3,4,5,6,7,8,9],
                   "8":["7"]}

        self.assertEqual(sort_ids(["1","3","4","8","11"], mapping),\
                            ["11","1","8","4","3"])
Example #3
0
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's split_libraries.py
             used to replace flowgram id with the unique se_sample_id

    mapping_fh: The cluster mapping from the denoiser.py

    denoised_seqs_fh: the Fasta output files from denoiser.py

    otu_picker_map_fh: cluster map from otu picker on denoised_seqs_fh

    out_dir: output directory
    """

    # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], MinimalFastaParser(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    otu_fh = open(out_dir + "/denoised_otu_map.txt", "w")
    for otu_line in otu_picker_otu_map_fh:
        otu_split = otu_line.split()

        otu = otu_split[0]
        ids = otu_split[1:]

        get_sample_id = sample_id_mapping.get
        # concat lists
        # make sure the biggest one is first for pick_repr
        all_ids = sort_ids(ids, denoiser_mapping)
        all_ids.extend(sum([denoiser_mapping[id] for id in ids], []))
        try:
            otu_fh.write("%s\t" % otu +
                         "\t".join(map(get_sample_id, all_ids)) + "\n")
        except TypeError:
            # get returns Null if denoiser_mapping id not present in
            # sample_id_mapping
            print "Found id in denoiser output, which was not found in split_libraries " +\
                "output FASTA file. Wrong file?"
            exit()

    fasta_out_fh = open(out_dir + "/denoised_all.fasta", "w")
    for label, seq in MinimalFastaParser(denoised_seqs_fh):
        id = label.split()[0]
        newlabel = "%s %s" % (sample_id_mapping[id], id)
        fasta_out_fh.write(Sequence(name=newlabel, seq=seq).toFasta() + "\n")
def combine_mappings(fasta_fh, mapping_fh, denoised_seqs_fh,
                     otu_picker_otu_map_fh, out_dir):
    """Combine denoiser and OTU picker mapping file, replace flowgram IDs.

    fasta_fh: a fasta file with labels as produced by Qiime's split_libraries.py
             used to replace flowgram id with the unique se_sample_id

    mapping_fh: The cluster mapping from the denoiser.py

    denoised_seqs_fh: the Fasta output files from denoiser.py

    otu_picker_map_fh: cluster map from otu picker on denoised_seqs_fh

    out_dir: output directory
    """

     # read in mapping from split_library file
    labels = imap(lambda a_b: a_b[0], parse_fasta(fasta_fh))
    # mapping from seq_id to sample_id
    sample_id_mapping = extract_read_to_sample_mapping(labels)

    denoiser_mapping = read_denoiser_mapping(mapping_fh)
    # read in cd_hit otu map
    # and write out combined otu_picker+denoiser map
    otu_fh = open(out_dir + "/denoised_otu_map.txt", "w")
    for otu_line in otu_picker_otu_map_fh:
        otu_split = otu_line.split()

        otu = otu_split[0]
        ids = otu_split[1:]

        get_sample_id = sample_id_mapping.get
        # concat lists
        # make sure the biggest one is first for pick_repr
        all_ids = sort_ids(ids, denoiser_mapping)
        all_ids.extend(sum([denoiser_mapping[id] for id in ids], []))
        try:
            otu_fh.write("%s\t" % otu +
                         "\t".join(map(get_sample_id, all_ids)) + "\n")
        except TypeError:
            # get returns Null if denoiser_mapping id not present in
            # sample_id_mapping
            print "Found id in denoiser output, which was not found in split_libraries " +\
                "output FASTA file. Wrong file?"
            exit()

    fasta_out_fh = open(out_dir + "/denoised_all.fasta", "w")
    for label, seq in parse_fasta(denoised_seqs_fh):
        id = label.split()[0]
        newlabel = "%s %s" % (sample_id_mapping[id], id)
        fasta_out_fh.write(BiologicalSequence(seq, id=newlabel).to_fasta())