Example #1
def main():
    """run denoiser on input flowgrams"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps

    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)
    outdir = opts.output_dir

    create_dir(outdir, fail_on_exist=not opts.force)

    log_fh = None

    if not (opts.primer or opts.map_fname):
        raise ApplicationError("Either mapping file or primer required")
    # Read primer from metadata file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        if len(all_primers) != 1:
            raise ValueError(
                "Currently only data sets with one primer are allowed.\n" +
                "Make separate mapping files with only one primer, re-run split_libraries and\n"
                + "denoise with each split_library output separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError("We currently do not support a primer with " +
                             "degenerate bases at its 3' end.")

    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(opts.sff_fps,
                                               opts.fasta_fp,
                                               outdir,
                                               opts.num_cpus,
                                               primer,
                                               titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    for i, cluster in cluster_mapping.iteritems():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
    oh.close()
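
Every example on this page funnels its output through write_Fasta_from_name_seq_pairs, whose body is never shown here. Below is a minimal sketch of what such a writer presumably looks like, inferred from the call sites and from the test in Example #3; the exact error message and formatting are assumptions, not the real QIIME implementation.

def write_Fasta_from_name_seq_pairs(name_seq_pairs, fh):
    """Write (name, sequence) pairs to fh as FASTA records.

    Sketch only: the tuple order and the ValueError on a missing
    handle are inferred from the test in Example #3.
    """
    if fh is None:
        raise ValueError("Need a file handle to write to, got None")
    for name, seq in name_seq_pairs:
        fh.write(">%s\n%s\n" % (name, seq))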
Example #2
def main():
    """run denoiser on input flowgrams"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps

    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)
    outdir = opts.output_dir

    create_dir(outdir, fail_on_exist=not opts.force)

    log_fh = None

    if not (opts.primer or opts.map_fname):
        raise ApplicationError("Either mapping file or primer required")
    # Read primer from metadata file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        if len(all_primers) != 1:
            raise ValueError("Currently only data sets with one primer are allowed.\n" +
                             "Make separate mapping files with only one primer, re-run split_libraries and\n"
                             + "denoise with each split_library output separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError("We currently do not support a primer with " +
                             "degenerate bases at its 3' end.")

    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(opts.sff_fps, opts.fasta_fp,
                                               outdir, opts.num_cpus, primer,
                                               titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    for i, cluster in cluster_mapping.iteritems():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
    oh.close()
Example #3
    def test_write_Fasta_from_name_seqs_pairs(self):
        """write_Fasta_from_name_seqs_pairs write proper FASTA string."""
        
        seqs = [('1', "AAA"), ('2', "CCCCC"), ('3', "GGGG")]

        # None fh raises Error
        self.assertRaises(ValueError, write_Fasta_from_name_seq_pairs, seqs, None)

        tmp_filename = get_tmp_filename(prefix="test_write_Fasta", suffix=".fna")
        fh = open(tmp_filename, "w")
        write_Fasta_from_name_seq_pairs(seqs, fh)
        fh.close()
        actual_seqs = list(MinimalFastaParser(open(tmp_filename, "U")))
        remove(tmp_filename)
        
        self.assertEqual(actual_seqs, seqs)
Example #4
    def test_write_Fasta_from_name_seqs_pairs(self):
        """write_Fasta_from_name_seqs_pairs write proper FASTA string."""

        seqs = [("1", "AAA"), ("2", "CCCCC"), ("3", "GGGG")]

        # None fh raises Error
        self.assertRaises(ValueError, write_Fasta_from_name_seq_pairs, seqs, None)

        fd, tmp_filename = mkstemp(prefix="test_write_Fasta", suffix=".fna")
        close(fd)
        fh = open(tmp_filename, "w")
        write_Fasta_from_name_seq_pairs(seqs, fh)
        fh.close()
        actual_seqs = list(parse_fasta(open(tmp_filename, "U")))
        remove(tmp_filename)

        self.assertEqual(actual_seqs, seqs)
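
Examples #3 and #4 are the same test before and after a toolchain migration: the PyCogent-era helpers get_tmp_filename and MinimalFastaParser give way to the standard-library mkstemp pattern and a parse_fasta function. The imports the newer variant presumably relies on are sketched below; mkstemp, close and remove are standard library, while the parse_fasta import path is an assumption based on QIIME 1.9-era code.

from os import close, remove
from tempfile import mkstemp

# Assumed import path: in this era of QIIME, parse_fasta
# most likely comes from scikit-bio's parsing module.
from skbio.parse.sequences import parse_fasta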
Example #5
def main():
    """run denoiser on input flowgrams"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps

    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)
    outdir = opts.output_dir

    ret_val = create_dir(outdir, handle_errors_externally=True)
    if ret_val == 1:  # dir exists
        if opts.force:
            # do nothing, just overwrite content
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose" +
                                " a different directory, or force overwrite with -f.")

    else:
        handle_error_codes(outdir, error_code=ret_val)

    log_fh = None

    if not (opts.primer or opts.map_fname):
        raise ApplicationError, "Either mapping file or primer required"
    # Read primer from metadata file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname,"U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        if len(all_primers) != 1:
            raise ValueError,"Currently only data sets with one primer are allowed.\n"+\
                "Make separate mapping files with only one primer, re-run split_libraries and\n"\
                +"denoise with each split_library output separately."
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError, "We currently do not support a primer with " +\
                "degenerate bases at its 3' end."

    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(opts.sff_fps,
                                               opts.fasta_fp,
                                               outdir,
                                               opts.num_cpus,
                                               primer,
                                               titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    for i, cluster in cluster_mapping.iteritems():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
    oh.close()
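
Example #5 predates the cleanup in Examples #1 and #2: it still uses the Python 2-only statement form of raise, which is a SyntaxError under Python 3. The two spellings are otherwise equivalent, as the contrast below shows (ApplicationError stands in for any exception class).

# Python 2 only -- a SyntaxError under Python 3:
raise ApplicationError, "Either mapping file or primer required"

# Call form, valid in both Python 2 and 3 (used in Examples #1 and #2):
raise ApplicationError("Either mapping file or primer required")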
Example #6
def denoise_per_sample(sff_fps, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used by QIIME. Remove after QIIME integration.
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(
            parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping),
        fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(
            parse_fasta(open(singleton_file, "r")),
            fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
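
denoise_per_sample writes its centroids sorted by cluster size via sort_seqs_by_clustersize, which this page does not show. A plausible sketch follows, under the assumption that the denoiser mapping keys are centroid ids and the values list the reads each centroid absorbed; the label handling is likewise an assumption.

def sort_seqs_by_clustersize(seqs, mapping):
    """Return (name, seq) pairs ordered largest cluster first.

    Assumes a FASTA label's first whitespace-separated token is the
    centroid id used as a key in the denoiser mapping.
    """
    def cluster_size(name_seq):
        centroid_id = name_seq[0].split()[0]
        return len(mapping.get(centroid_id, []))
    return sorted(seqs, key=cluster_size, reverse=True)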
Example #7
def denoise_per_sample(sff_fps,
                       fasta_fp,
                       tmpoutdir,
                       cluster=False,
                       num_cpus=1,
                       squeeze=True,
                       percent_id=0.97,
                       bail=1,
                       primer="",
                       low_cutoff=3.75,
                       high_cutoff=4.5,
                       log_fp="denoiser.log",
                       low_memory=False,
                       verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None,
                       titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used by QIIME. Remove after QIIME integration.
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster, num_cpus,
                     squeeze, percent_id, bail, primer, low_cutoff,
                     high_cutoff, log_fp, low_memory, verbose, error_profile,
                     max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(parse_fasta(open(singleton_file, "r")),
                                        fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
Example #8
def main():
    """run denoiser on input flowgrams"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    sff_files = opts.sff_fps

    for f in sff_files:
        if not exists(f):
            option_parser.error(('Flowgram file path does not exist:\n %s \n' +
                                 'Pass a valid one via -i.') % f)
    outdir = opts.output_dir

    ret_val = create_dir(outdir, handle_errors_externally=True)
    if ret_val == 1:  # dir exists
        if opts.force:
            # do nothing, just overwrite content
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose" +
                                " a different directory, or force overwrite with -f.")

    else:
        handle_error_codes(outdir, error_code=ret_val)

    log_fh = None

    if not (opts.primer or opts.map_fname):
        raise ApplicationError, "Either mapping file or primer required"
    # Read primer from metadata file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        if len(all_primers) != 1:
            raise ValueError, "Currently only data sets with one primer are allowed.\n" +\
                "Make separate mapping files with only one primer, re-run split_libraries and\n"\
                + "denoise with each split_library output separately."
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if last_char not in "ACGT":
            raise ValueError, "We currently do not support a primer with " +\
                "degenerate bases at its 3' end."

    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(opts.sff_fps, opts.fasta_fp,
                                               outdir, opts.num_cpus, primer,
                                               titanium=opts.titanium)

    # store mapping file and centroids
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    for i, cluster in cluster_mapping.iteritems():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()
    
    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)
    oh.close()
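
Each main() above ends by serializing the cluster mapping as one tab-separated line per centroid. Reading that file back is straightforward; the sketch below mirrors the write loop exactly, though the function name is ours, not QIIME's.

def read_denoised_clusters(path):
    """Parse denoised_clusters.txt into {centroid_id: [member_ids]}.

    Layout mirrors the write loop in main():
    'centroid_id<TAB>member1<TAB>member2...' per line.
    """
    mapping = {}
    with open(path) as fh:
        for line in fh:
            fields = line.rstrip('\n').split('\t')
            mapping[fields[0]] = fields[1:]
    return mapping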