Example No. 1
    def test_filter_sff_file(self):
        """filter_sff_file filters out bad reads."""

        try:
            fh = open(self.tiny_test)
        except IOError:
            self.fail(
                "Could not open test file %s. Skipping test" %
                self.tiny_test)

        # With no filters, all flowgrams should be in the output file
        flowgrams, header = lazy_parse_sff_handle(fh)
        filter_list = []
        fd, out_file_name = mkstemp(
            prefix="test_filter_sff_file",
            suffix=".sff.txt")
        close(fd)
        out_fh = open(out_file_name, "w")
        l = filter_sff_file(flowgrams, header, filter_list, out_fh)
        remove(out_file_name)
        fh.close()
        self.assertEqual(l, 114)

        # With good filters some should survive
        fh = open(self.tiny_test)
        flowgrams, header = lazy_parse_sff_handle(fh)
        filter_list = [lambda f: within_length(f, 100, 300)]
        fd, out_file_name = mkstemp(
            prefix="test_filter_sff_file",
            suffix=".sff.txt")
        close(fd)
        out_fh = open(out_file_name, "w")
        l = filter_sff_file(flowgrams, header, filter_list, out_fh)
        remove(out_file_name)
        fh.close()
        self.assertEqual(l, 112)

        # With strict filters, nothing should pass
        fh = open(self.tiny_test)
        flowgrams, header = lazy_parse_sff_handle(fh)
        filter_list = [lambda f: within_length(f, 0, 0)]
        fd, out_file_name = mkstemp(
            prefix="test_filter_sff_file",
            suffix=".sff.txt")
        close(fd)
        out_fh = open(out_file_name, "w")
        l = filter_sff_file(flowgrams, header, filter_list, out_fh)
        remove(out_file_name)
        self.assertEqual(l, 0)
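
For context, the pattern these tests exercise is: open an sff.txt flowgram file, parse it lazily, and stream flowgrams from the resulting generator. A minimal sketch of that pattern, assuming a PyCogent-style import location and a placeholder file name (both assumptions, not taken from the example above):

# Minimal usage sketch; the import path and "reads.sff.txt" are assumptions.
from cogent.parse.flowgram_parser import lazy_parse_sff_handle  # assumed module path

fh = open("reads.sff.txt")                      # flowgram file in sff.txt format
flowgrams, header = lazy_parse_sff_handle(fh)   # flowgram generator + header dict
for f in flowgrams:                             # the generator can only be consumed once
    print(f.Name)                               # each flowgram carries its read name
fh.close()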
Example No. 2
 def __iter__(self):
     # make it read-only and reset to the start of the file
     self.write_mode = False
     self.fh.close()
     (self.flowgrams, self.header) = lazy_parse_sff_handle(
         open(self.filename))
     return self.flowgrams
Example No. 3
 def __iter__(self):
     # make it read-only and reset to the start of the file
     self.write_mode = False
     self.fh.close()
     (self.flowgrams,
      self.header) = lazy_parse_sff_handle(open(self.filename))
     return self.flowgrams
Example No. 4
    def test_filter_sff_file(self):
        """filter_sff_file filters out bad reads."""

        try:
            fh = open(self.tiny_test)
        except IOError:
            self.fail("Could not open test file %s. Skipping test" %
                      self.tiny_test)

        # With no filters, all flowgrams should be in the output file
        flowgrams, header = lazy_parse_sff_handle(fh)
        filter_list = []
        fd, out_file_name = mkstemp(prefix="test_filter_sff_file",
                                    suffix=".sff.txt")
        close(fd)
        out_fh = open(out_file_name, "w")
        l = filter_sff_file(flowgrams, header, filter_list, out_fh)
        remove(out_file_name)
        fh.close()
        self.assertEqual(l, 114)

        # With good filters some should survive
        fh = open(self.tiny_test)
        flowgrams, header = lazy_parse_sff_handle(fh)
        filter_list = [lambda f: within_length(f, 100, 300)]
        fd, out_file_name = mkstemp(prefix="test_filter_sff_file",
                                    suffix=".sff.txt")
        close(fd)
        out_fh = open(out_file_name, "w")
        l = filter_sff_file(flowgrams, header, filter_list, out_fh)
        remove(out_file_name)
        fh.close()
        self.assertEqual(l, 112)

        # With strict filters, nothing should pass
        fh = open(self.tiny_test)
        flowgrams, header = lazy_parse_sff_handle(fh)
        filter_list = [lambda f: within_length(f, 0, 0)]
        fd, out_file_name = mkstemp(prefix="test_filter_sff_file",
                                    suffix=".sff.txt")
        close(fd)
        out_fh = open(out_file_name, "w")
        l = filter_sff_file(flowgrams, header, filter_list, out_fh)
        remove(out_file_name)
        self.assertEqual(l, 0)
Example No. 5
def build_averaged_flowgrams(mapping, sff_fp,
                             min_coverage=50, out_fp=None):
    """Build averaged flowgrams for each cluster in mapping.

    mapping: a cluster mapping as dictionary of lists

    sff_fp: pointer to sff.txt file, must be consistent with mapping

    min_coverage: number of flowgrams to average over for each cluster

    out_fp: output file name

    NOTE: This function has no test code, since it is mostly IO around tested functions
    """

    l = len(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    # update some values in the sff header
    header["# of Reads"] = l
    header["Index Length"] = "NA"

    if (out_fp):
        out_filename = out_fp
    else:
        fd, out_filename = mkstemp(dir="/tmp/",
                                   prefix="prefix_dereplicated",
                                   suffix=".sff.txt")
        close(fd)
    outhandle = open(out_filename, "w")

    # write out the reduced flowgram set
    write_sff_header(header, outhandle)

    seqs = {}
    # get a random sample for each cluster
    sample_keys = sample_mapped_keys(mapping, min_coverage)
    for ave_f, id in _average_flowgrams(mapping, flowgrams, sample_keys):
        outhandle.write(ave_f.createFlowHeader() + "\n")
        ave_f.Bases = ave_f.toSeq()
        seqs[id] = ave_f.Bases

    outhandle.close()
    return(out_filename, seqs)
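
A hedged usage sketch for build_averaged_flowgrams, based only on the docstring above and on how preprocess() calls it in Example No. 12; the cluster ids, read ids, and file paths are made-up placeholders:

# Illustrative only; ids and paths are placeholders, not real data.
mapping = {"read1": ["read2", "read3"],   # centroid id -> ids of cluster members
           "read4": []}                   # a singleton cluster
avg_sff_fp, seqs = build_averaged_flowgrams(
    mapping, "clustered.sff.txt",
    min_coverage=1,                    # 1 = keep the centroid flowgram as-is
    out_fp="/tmp/averaged.sff.txt")
# avg_sff_fp is the written sff.txt file; seqs maps each cluster id to the
# basecalled sequence of its averaged flowgram.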
Example No. 6
def secondary_clustering(sff_file,
                         mapping,
                         bestscores,
                         log_fh,
                         threshold=4.5,
                         verbose=False):
    """Clusters sequences based on their best distance to any of the centroids.

    Does not actually compute distances but uses the results of the first
    phase stored in bestscores.


    sff_file: name of unclustered flowgram file

    mapping: preliminary mapping file, dictionary of ids to list of ids

    bestscores: dictionary that stores for each unclustered flowgram the best
             score it has to one of the centroids previously seen
             and the id of the centroid. Used in the second denoising phase.

    threshold: Secondary clustering threshold.

    """
    if (len(bestscores) == 0):
        # Either all sequences are already clustered or
        # we had no seqs exceeding the bail-out limit
        return

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_file))

    counter = 0
    for f in flowgrams:
        (id, score) = bestscores[f.Name]
        if (score < threshold):
            counter += 1
            # update the mapping information
            mapping[id].extend(mapping[f.Name])
            mapping[id].append(f.Name)
            del mapping[f.Name]
    if verbose:
        log_fh.write("Secondary clustering removed %d flowgrams\n" % counter)
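
The data shapes secondary_clustering expects follow from the docstring and the loop above. A small illustrative call; all ids, scores, and file names are placeholders:

import sys

# mapping: centroid id -> list of member ids (unclustered reads keep their own entry)
mapping = {"centroid1": ["read2"], "read7": []}
# bestscores: unclustered read id -> (id of the closest centroid, best score so far)
bestscores = {"read7": ("centroid1", 2.1)}

secondary_clustering("unclustered.sff.txt", mapping, bestscores,
                     log_fh=sys.stdout, threshold=4.5, verbose=True)
# Reads scoring below the threshold are merged in place into their closest
# centroid's cluster and removed from mapping.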
Example No. 7
def store_clusters(mapping, sff_fp, outdir="/tmp/", store_members=False):
    """Stores fasta and flogram file for each cluster."""

    # get mapping read to cluster
    invert_map = invert_mapping(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))

    leftover_fasta_fh = open(outdir + "/singletons.fasta", "w")
    centroids = []
    for f in flowgrams:
        try:
            key = invert_map[f.Name]
        except KeyError:
            # this flowgram has not been clustered
            continue
        if (len(mapping[key]) == 0):
            # do not store singletons in a separate cluster
            leftover_fasta_fh.write(f.toFasta() + "\n")
            continue
        elif(f.Name in mapping):
            # save as a centroid
            centroids.append((len(mapping[f.Name]) + 1, f.Name, f.toSeq()))

        if (store_members):
            flows_fh = open(outdir + key + ".flows", "a")
            fasta_fh = open(outdir + key + ".fasta", "a")
            flows_fh.write("%s\n" % f)
            fasta_fh.write(f.toFasta() + "\n")
            fasta_fh.close()
            flows_fh.close()

    leftover_fasta_fh.close()

    # sort and store ordered by cluster_size
    centroids.sort(reverse=True)
    centroid_fh = open(outdir + "/centroids.fasta", "w")
    for size, name, seq in centroids:
        centroid_fh.write(">%s | cluster size: %d \n%s\n" %
                          (name, size, seq))
    centroid_fh.close()
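
A hedged example of how store_clusters might be invoked; the mapping and the sff.txt path are placeholders, while the output file names follow from the code above:

# Illustrative only; the ids and the flowgram path are made up.
mapping = {"read1": ["read2", "read3"],   # centroid id -> member ids
           "read4": []}                   # singleton, goes to singletons.fasta
store_clusters(mapping, "denoised.sff.txt", outdir="/tmp/", store_members=True)
# Writes /tmp/centroids.fasta and /tmp/singletons.fasta; with store_members=True
# it also appends one .flows and one .fasta file per cluster under outdir.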
Example No. 8
def store_clusters(mapping, sff_fp, outdir="/tmp/", store_members=False):
    """Stores fasta and flogram file for each cluster."""

    # get mapping read to cluster
    invert_map = invert_mapping(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))

    leftover_fasta_fh = open(outdir + "/singletons.fasta", "w")
    centroids = []
    for f in flowgrams:
        try:
            key = invert_map[f.Name]
        except KeyError:
            # this flowgram has not been clustered
            continue
        if (len(mapping[key]) == 0):
            # do not store singletons in a separate cluster
            leftover_fasta_fh.write(f.toFasta() + "\n")
            continue
        elif (f.Name in mapping):
            # save as a centroid
            centroids.append((len(mapping[f.Name]) + 1, f.Name, f.toSeq()))

        if (store_members):
            flows_fh = open(outdir + key + ".flows", "a")
            fasta_fh = open(outdir + key + ".fasta", "a")
            flows_fh.write("%s\n" % f)
            fasta_fh.write(f.toFasta() + "\n")
            fasta_fh.close()
            flows_fh.close()

    leftover_fasta_fh.close()

    # sort and store ordered by cluster_size
    centroids.sort(reverse=True)
    centroid_fh = open(outdir + "/centroids.fasta", "w")
    for size, name, seq in centroids:
        centroid_fh.write(">%s | cluster size: %d \n%s\n" % (name, size, seq))
    centroid_fh.close()
Example No. 9
def secondary_clustering(sff_file, mapping, bestscores, log_fh,
                         threshold=4.5, verbose=False):
    """Clusters sequences based on their best distance to any of the centroids.

    Does not actually compute distances but uses the results of the first
    phase stored in bestscores.


    sff_file: name of unclustered flowgram file

    mapping: preliminary mapping file, dictionary of ids to list of ids

    bestscores: dictionary that stores for each unclustered flowgram the best
             score it has to one of the centroids previously seen
             and the id of the centroid. Used in the second denoising phase.

    threshold: Secondary clustering threshold.

    """
    if(len(bestscores) == 0):
        # Either all sequences are already clustered or
        # we had no seqs exceeding the bail-out limit
        return

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_file))

    counter = 0
    for f in flowgrams:
        (id, score) = bestscores[f.Name]
        if (score < threshold):
            counter += 1
            # update the mapping information
            mapping[id].extend(mapping[f.Name])
            mapping[id].append(f.Name)
            del mapping[f.Name]
    if verbose:
        log_fh.write("Secondary clustering removed %d flowgrams\n" % counter)
Example No. 10
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering at the first cluster with fewer than bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies a seq
                     identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores, sorted_keys) = \
            read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from  the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if(key not in cluster_mapping):
            # this read has already been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write("Max number of rounds reached. " +
                             "Aborting clustering phase II and continuing with phase III.\n")
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if(prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break the
        # code
        if('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write a checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(key, ideal_flow, flowgrams, header, ids,
                                                     l, bestscores, log_fh, outdir,
                                                     on_cluster=on_cluster,
                                                     num_cpus=num_cpus,
                                                     fast_method=fast_method,
                                                     mapping=cluster_mapping,
                                                     verbose=verbose,
                                                     threshold=threshold,
                                                     pair_id_thresh=pair_id_thresh,
                                                     client_sockets=client_sockets,
                                                     error_profile=error_profile, spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if(newl == 0):
            # all flowgrams clustered
            break
        # JR: I think this is too much info for the regular user, I leave it in, so
        # we can simply turn it on for debugging
#        if log_fh:
#            log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir, prefix="ff",
                                         suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (f.Name in ids):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return(non_clustered_filename, bestscores, cluster_mapping)
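
For orientation, a sketch of a phase II call using only the parameters documented above; the toy sequences, the preliminary mapping, and the file names are assumptions rather than real pipeline inputs:

import sys

seqs = {"read1": "ACGTACGT", "read2": "ACGTACGA"}   # read id -> basecalled sequence
cluster_mapping = {"read1": [], "read2": []}        # phase I mapping: centroid -> members

non_clustered_fp, bestscores, mapping = greedy_clustering(
    "prefix_dereplicated.sff.txt", seqs, cluster_mapping,
    outdir="/tmp/", num_flows=2, log_fh=sys.stdout,
    bail_out=1, threshold=3.75, verbose=True)
# Returns the sff.txt of flowgrams that remain unclustered, the per-read best
# scores (later fed into secondary_clustering), and the updated cluster mapping.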
Example No. 11
def greedy_clustering(sff_fp,
                      seqs,
                      cluster_mapping,
                      outdir,
                      num_flows,
                      log_fh,
                      num_cpus=1,
                      on_cluster=False,
                      bail_out=1,
                      pair_id_thresh=0.97,
                      verbose=False,
                      threshold=3.75,
                      fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None,
                      checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering at the first cluster with fewer than bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies a seq
                     identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores, sorted_keys) = \
            read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from  the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if (key not in cluster_mapping):
            # this read has already been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write(
                    "Max number of rounds reached. " +
                    "Aborting clustering phase II and continuing with phase III.\n"
                )
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if (prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break the
        # code
        if ('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write a checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams,
         newl) = filter_with_flowgram(key,
                                      ideal_flow,
                                      flowgrams,
                                      header,
                                      ids,
                                      l,
                                      bestscores,
                                      log_fh,
                                      outdir,
                                      on_cluster=on_cluster,
                                      num_cpus=num_cpus,
                                      fast_method=fast_method,
                                      mapping=cluster_mapping,
                                      verbose=verbose,
                                      threshold=threshold,
                                      pair_id_thresh=pair_id_thresh,
                                      client_sockets=client_sockets,
                                      error_profile=error_profile,
                                      spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if (newl == 0):
            # all flowgrams clustered
            break
        # JR: I think this is too much info for the regular user, I leave it in, so
        # we can simply turn it on for debugging


#        if log_fh:
#            log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir,
                                         prefix="ff",
                                         suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (f.Name in ids):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return (non_clustered_filename, bestscores, cluster_mapping)
Example No. 12
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: log messages are written to log_fh if it is set to something other than None

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only reads in
              fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls whether sequences are squeezed before phase I.
             Squeezing means runs of identical nucleotides are collapsed to one.

    primer: The primer sequence of the amplification process. This sequence will be
            removed from all reads during preprocessing
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if(fasta_fp):
        # remove barcodes and sequences tossed by split_libraries, i.e. not in
        # fasta_fp
        labels = imap(lambda a_b: a_b[0], parse_fasta(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(labels)
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(flowgrams, header,
                                                      outdir=out_fp,
                                                      barcode_mapping=barcode_mapping,
                                                      primer=primer)
        if verbose:
            log_fh.write(
                "Sequences in barcode mapping: %d\n" %
                len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % l)
    else:
        # just do a simple clean and truncate
        (clean_sff_fp, l) = cleanup_sff(flowgrams, header, outdir=out_fp)
        if verbose:
            log_fh.write("Cleaned flowgrams written: %d\n" % l)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(flowgrams, header,
                                                      outdir=out_fp, primer=primer)
        if verbose:
            log_fh.write("Truncated flowgrams written: %d\n" % l)
        remove(clean_sff_fp)

    if (l == 0):
        raise ValueError("No flowgrams left after preprocesing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose:
        log_fh.write("Filter flowgrams by prefix matching\n")

    (flowgrams, header) = lazy_parse_sff_handle(open(trunc_sff_fp))
    l, orig_l, mapping = prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    # Averaging produces flowgrams that are too good, such that the greedy
    # clustering clusters too much. Use the cluster centroid instead by
    # setting min_coverage to 1.
    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp, min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if verbose:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (orig_l - l, orig_l))
        log_fh.write("Remaining number of sequences: %d\n" % l)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, l, mapping, seqs)
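
Finally, a hedged sketch of a top-level preprocess() call; the flowgram and fasta paths are placeholders, and the remaining arguments mirror the defaults documented above:

import sys

averaged_sff_fp, num_seqs, mapping, seqs = preprocess(
    ["run1.sff.txt"],                 # list of flowgram files (placeholder path)
    log_fh=sys.stdout,
    fasta_fp="seqs.fna",              # split_libraries.py output used to filter reads
    out_fp="/tmp/", verbose=True, squeeze=False)
# averaged_sff_fp points at the prefix-dereplicated flowgram file produced by
# phase I; num_seqs is the number of sequences remaining after prefix clustering.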