Example no. 1
0
    def test_checkpoints(self):
        """A checkpoint written to disk is read back intact."""

        self.tmpdir = get_tmp_filename(
            tmp_dir="./",
            suffix="_test_checkpoints/")

        bestscores = {1: 0.9, 2: 1.1, 3: 2.3, 4: 99.93232344}

        out_fp = write_checkpoint(
            "Key", 99, self.mapping, [1, 2, 3, 4], bestscores,
            [2, 1, 3, 4],
            self.tmpdir)

        observed = read_checkpoint(out_fp)

        # The checkpoint tuple must round-trip field for field.
        expected = ("Key", 99, self.mapping, [1, 2, 3, 4],
                    bestscores, [2, 1, 3, 4])
        for idx, exp in enumerate(expected):
            self.assertEqual(observed[idx], exp)
Example no. 2
0
    def test_checkpoints(self):
        """A checkpoint written to disk is read back intact."""

        self.tmpdir = mkdtemp(dir="./", suffix="_test_checkpoints/")

        bestscores = {1: 0.9, 2: 1.1, 3: 2.3, 4: 99.93232344}

        out_fp = write_checkpoint("Key", 99, self.mapping, [1, 2, 3, 4],
                                  bestscores, [2, 1, 3, 4], self.tmpdir)

        observed = read_checkpoint(out_fp)

        # The checkpoint tuple must round-trip field for field.
        expected = ["Key", 99, self.mapping, [1, 2, 3, 4],
                    bestscores, [2, 1, 3, 4]]
        for idx, exp in enumerate(expected):
            self.assertEqual(observed[idx], exp)
Example no. 3
0
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies a seq
                     identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    checkpoint_fp: if set, resume clustering from this checkpoint file
    """

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        # Resume: restore the full clustering state from the checkpoint.
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores, sorted_keys) = \
            read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from  the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if(key not in cluster_mapping):
            # this guy already has been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write("Max number of rounds reached. " +
                             "Aborting clustering phase II and continuing with phase III.\n")
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if(prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break the
        # code
        if('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(key, ideal_flow, flowgrams, header, ids,
                                                     l, bestscores, log_fh, outdir,
                                                     on_cluster=on_cluster,
                                                     num_cpus=num_cpus,
                                                     fast_method=fast_method,
                                                     mapping=cluster_mapping,
                                                     verbose=verbose,
                                                     threshold=threshold,
                                                     pair_id_thresh=pair_id_thresh,
                                                     client_sockets=client_sockets,
                                                     error_profile=error_profile, spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if(newl == 0):
            # all flowgrams clustered
            break
         # JR: I think this is too much info for the regular user, I leave it in, so
        # we can simply turn it on for debugging
#        if log_fh:
#            log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir, prefix="ff",
                                        suffix=".sff.txt")
    close(fd)
    # Use a context manager so the file is flushed and closed before the
    # returned filename is handed to the next phase (the handle used to leak).
    with open(non_clustered_filename, "w") as non_clustered_fh:
        write_sff_header(header, non_clustered_fh)
        for f in flowgrams:
            if (f.Name in ids):
                non_clustered_fh.write(f.createFlowHeader() + "\n")

    return(non_clustered_filename, bestscores, cluster_mapping)
Example no. 4
0
def greedy_clustering(sff_fp,
                      seqs,
                      cluster_mapping,
                      outdir,
                      num_flows,
                      log_fh,
                      num_cpus=1,
                      on_cluster=False,
                      bail_out=1,
                      pair_id_thresh=0.97,
                      verbose=False,
                      threshold=3.75,
                      fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None,
                      checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies a seq
                     identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    checkpoint_fp: if set, resume clustering from this checkpoint file
    """

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        # Resume: restore the full clustering state from the checkpoint.
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores, sorted_keys) = \
            read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from  the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if (key not in cluster_mapping):
            # this guy already has been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write(
                    "Max number of rounds reached. " +
                    "Aborting clustering phase II and continuing with phase III.\n"
                )
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if (prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break the
        # code
        if ('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams,
         newl) = filter_with_flowgram(key,
                                      ideal_flow,
                                      flowgrams,
                                      header,
                                      ids,
                                      l,
                                      bestscores,
                                      log_fh,
                                      outdir,
                                      on_cluster=on_cluster,
                                      num_cpus=num_cpus,
                                      fast_method=fast_method,
                                      mapping=cluster_mapping,
                                      verbose=verbose,
                                      threshold=threshold,
                                      pair_id_thresh=pair_id_thresh,
                                      client_sockets=client_sockets,
                                      error_profile=error_profile,
                                      spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if (newl == 0):
            # all flowgrams clustered
            break
        # JR: I think this is too much info for the regular user, I leave it in, so
        # we can simply turn it on for debugging


#        if log_fh:
#            log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir,
                                         prefix="ff",
                                         suffix=".sff.txt")
    close(fd)
    # Use a context manager so the file is flushed and closed before the
    # returned filename is handed to the next phase (the handle used to leak).
    with open(non_clustered_filename, "w") as non_clustered_fh:
        write_sff_header(header, non_clustered_fh)
        for f in flowgrams:
            if (f.Name in ids):
                non_clustered_fh.write(f.createFlowHeader() + "\n")

    return (non_clustered_filename, bestscores, cluster_mapping)