Example #1
0
    def test_stop_workers(self):
        """stop_workers terminates all clients"""

        workers, client_sockets = self._setup_server_and_clients()

        stop_workers(client_sockets)
        for client_socket in client_sockets:
            self.assertRaises(error, client_socket.send, "hello")
Example #2
0
    def test_stop_workers(self):
        """stop_workers terminates all clients"""

        workers, client_sockets = self._setup_server_and_clients()

        stop_workers(client_sockets)
        for client_socket in client_sockets:
            self.assertRaises(error, client_socket.send, "hello")
Example #3
0
    def test_stop_workers_on_closed_socket(self):
        """stop_workers terminates all clients"""

        # Repeat test but this time close one of the sockets early.
        # simulates crashed client
        workers, client_sockets = self._setup_server_and_clients()
        client_sockets[-1].close()
        fake_fh = StringIO()
        stop_workers(client_sockets, fake_fh)
        self.assertEqual(fake_fh.getvalue(), "Worker 3 seems to be dead already. Check for runaways!\n")
        for client_socket in client_sockets:
            self.assertRaises(error, client_socket.send, "hello")
Example #4
0
    def test_stop_workers_on_closed_socket(self):
        """stop_workers terminates all clients"""

        # Repeat test but this time close one of the sockets early.
        # simulates crashed client
        workers, client_sockets = self._setup_server_and_clients()
        client_sockets[-1].close()
        fake_fh = StringIO()
        stop_workers(client_sockets, fake_fh)
        self.assertEqual(fake_fh.getvalue(),
                         "Worker 3 seems to be dead already. Check for runaways!\n")
        for client_socket in client_sockets:
            self.assertRaises(error, client_socket.send, "hello")
    def test_get_flowgram_distances_on_cluster(self):
        """get_flowgram_distances_on_cluster computes the correct alignment score."""

        self.tmp_dir = mkdtemp(dir="./", suffix="/")
        # setup server and workers
        self.socket = setup_server()
        workers, client_sockets = setup_workers(1,
                                                self.tmp_dir,
                                                self.socket,
                                                verbose=False)
        client_sockets = [a for a, b in client_sockets]
        scores, names, fc = get_flowgram_distances_on_cluster(
            "1", self.flowgram, self.flowgrams, FlowgramContainerArray(),
            {"FZTHQMS01CIW5N": ""}, 1, 1, [1], client_sockets)
        stop_workers(client_sockets)

        self.assertEqual(names, ["FZTHQMS01CIW5N"])
        assert_almost_equal(scores, [[4.95274923, 0.7815385]], decimal=4)
 def test_get_flowgram_distances_on_cluster(self):
     """get_flowgram_distances_on_cluster computes the correct alignment score."""
     
     self.tmp_dir=get_tmp_filename(tmp_dir = "./", suffix="/")
     mkdir(self.tmp_dir)
     #setup server and workers
     self.socket = setup_server()
     workers, client_sockets = setup_workers(1, self.tmp_dir, self.socket,
                                             verbose=False)
     client_sockets = [a for a,b in client_sockets]
     scores, names, fc = get_flowgram_distances_on_cluster("1", self.flowgram,
                                                           self.flowgrams,
                                                           FlowgramContainerArray(),
                                                           {"FZTHQMS01CIW5N":""},
                                                           1, 1, [1],
                                                           client_sockets)
     stop_workers(client_sockets)
 
     self.assertEqual(names, ["FZTHQMS01CIW5N"])
     self.assertFloatEqual(scores, [4.95274923, 0.7815385])
Example #7
0
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to now before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus:number of cpus to use of on_cluster ==True
    on_cluster: run in paralell if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies a seq
                     identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores, sorted_keys) = \
            read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from  the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if(key not in cluster_mapping):
            # this guy already has been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write("Max number of rounds reached. " +
                             "Aborting clustering phase II and continuing with phase III.\n")
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if(prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break the
        # code
        if('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(key, ideal_flow, flowgrams, header, ids,
                                                     l, bestscores, log_fh, outdir,
                                                     on_cluster=on_cluster,
                                                     num_cpus=num_cpus,
                                                     fast_method=fast_method,
                                                     mapping=cluster_mapping,
                                                     verbose=verbose,
                                                     threshold=threshold,
                                                     pair_id_thresh=pair_id_thresh,
                                                     client_sockets=client_sockets,
                                                     error_profile=error_profile, spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if(newl == 0):
            # all flowgrams clustered
            break
         # JR: I think this is too much info for the regular user, I leave it in, so
        # we can simply turn it on for debugging
#        if log_fh:
#            log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir, prefix="ff",
                                        suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (f.Name in ids):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return(non_clustered_filename, bestscores, cluster_mapping)
Example #8
0
def greedy_clustering(sff_fp,
                      seqs,
                      cluster_mapping,
                      outdir,
                      num_flows,
                      log_fh,
                      num_cpus=1,
                      on_cluster=False,
                      bail_out=1,
                      pair_id_thresh=0.97,
                      verbose=False,
                      threshold=3.75,
                      fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None,
                      checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to now before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus:number of cpus to use of on_cluster ==True
    on_cluster: run in paralell if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies a seq
                     identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores, sorted_keys) = \
            read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from  the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if (key not in cluster_mapping):
            # this guy already has been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write(
                    "Max number of rounds reached. " +
                    "Aborting clustering phase II and continuing with phase III.\n"
                )
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if (prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break the
        # code
        if ('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams,
         newl) = filter_with_flowgram(key,
                                      ideal_flow,
                                      flowgrams,
                                      header,
                                      ids,
                                      l,
                                      bestscores,
                                      log_fh,
                                      outdir,
                                      on_cluster=on_cluster,
                                      num_cpus=num_cpus,
                                      fast_method=fast_method,
                                      mapping=cluster_mapping,
                                      verbose=verbose,
                                      threshold=threshold,
                                      pair_id_thresh=pair_id_thresh,
                                      client_sockets=client_sockets,
                                      error_profile=error_profile,
                                      spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if (newl == 0):
            # all flowgrams clustered
            break
        # JR: I think this is too much info for the regular user, I leave it in, so
        # we can simply turn it on for debugging


#        if log_fh:
#            log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir,
                                         prefix="ff",
                                         suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (f.Name in ids):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return (non_clustered_filename, bestscores, cluster_mapping)