Example #1
def run(options):
    #
    # Create HDF5 table
    #
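    # NOTE: this example uses the old PyTables 2.x camelCase API (openFile,
    # createGroup, createTable, getWhereList, removeRows); PyTables 3.x
    # renamed these to open_file, create_group, create_table, etc.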
    h5file = tables.openFile(filename, mode="a", title="Eta stability data")
    try:
        timings_table = h5file.root.EtaStability.timings
    except tables.NoSuchNodeError:
        # Create a new group under "/" (root)
        group = h5file.createGroup(
            "/", 'EtaStability', 'Data about stability of epsilon')
        # Create one table on it
        timings_table = h5file.createTable(
            group, 'timings', Timings, "Timings table")

    #
    # Examine data sets
    #
    data_sets = test_data.data_sets_for_options(options)
    fasta_filenames = [test_data.fasta_filenames[ds] for ds in data_sets]
    for ds, fasta in zip(data_sets, fasta_filenames):
        num_seqs, num_bases = test_data.get_data_set_size(ds)
        logging.info(
            'Analysing data set: %16s; # seqs=%5d; # bases=%7d; %s', ds, num_seqs, num_bases, fasta)

    #
    # set up parallel stuff
    #
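    # NOTE: IPython.kernel is the old (pre-0.11) IPython parallel interface;
    # later releases moved this functionality to IPython.parallel and then to
    # the separate ipyparallel package.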
    from IPython.kernel import client
    tc = client.TaskClient()

    #
    # pass tasks to engines
    #
    logging.info('Passing tasks to engines')
    task_ids = []
    task_args = dict()
    task_data_set = dict()
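    # bookkeeping: remember each task's arguments and data set so the results
    # can be matched back to their inputs when they are collected below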
    for data_set, fasta in zip(data_sets, fasta_filenames):
        for seed, num_sites, score in test_data.starts[data_set]:
            options.max_num_sites = num_sites
            for epsilon in epsilons:
                # only pass task if we don't have data in the table already
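                # epsilon is a float, so match it within a small +/- 1e-4
                # window rather than testing for exact equality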
                where_list = timings_table.getWhereList(
                    '(dataset=="%s") & (seed=="%s") & (nsites==%d) & (epsilon>%f) & (epsilon<%f)' % (
                        data_set, seed, math.trunc(num_sites),
                        epsilon - 1e-4, epsilon + 1e-4
                    )
                )
                if options.force or len(where_list) == 0:
                    # remove data if we already have it and are forcing new
                    # calculation
                    if len(where_list):
                        if len(where_list) != 1:
                            raise ValueError(
                                'Expecting to find exactly one existing row for this start.')
                        timings_table.removeRows(where_list[0])
                    options.output_dir = os.path.abspath(
                        os.path.join('output', 'epsilon-stability', '%s-%03d-%.1f' % (seed, num_sites, epsilon)))
                    if not os.path.exists(options.output_dir):
                        os.makedirs(options.output_dir)
                    args = (fasta, seed, num_sites, epsilon, options)
                    # print fasta, seed, num_sites, score, epsilon, options
                    task = client.MapTask(test_stability_and_speed, (
                        fasta, seed, num_sites, score, epsilon, options))
                    task_id = tc.run(task, block=False)
                    task_ids.append(task_id)
                    task_data_set[task_id] = data_set
                    task_args[task_id] = args

    #
    # Get results from engines
    #
    logging.info('Blocking on %d results...', len(task_ids))
    timings = timings_table.row  # Fill the table with data
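    # timings is a PyTables Row accessor: set each column, call append(), and
    # the buffered rows are written when the table is flushed or the file closed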
    for task_id in task_ids:
        duration, post_EM_consensus, num_iters = tc.get_task_result(
            task_id, block=True)
        fasta, seed, num_sites, epsilon, options = task_args[task_id]
        data_set = task_data_set[task_id]
        timings['dataset'] = data_set
        timings['seed'] = seed
        timings['nsites'] = num_sites
        timings['epsilon'] = epsilon
        timings['niters'] = num_iters
        timings['duration'] = duration
        timings['consensus'] = post_EM_consensus
        logging.info(
            '%20s; nsites=%3d; epsilon=%.1f; iters=%5d; elapsed=%7.1fs; per iteration=%6.2fs; %20s; %s',
            seed, num_sites, epsilon, num_iters, duration,
            duration / num_iters, post_EM_consensus, data_set
        )
        timings.append()
    h5file.close()  # Close (and flush) the HDF5 file
Example #2
def run(options):
    #
    # Create HDF5 table
    #
    h5file = tables.openFile(filename, mode="a", title="STEM/MEME data")
    try:
        meme_em_table = h5file.root.MEME.starts
    except tables.NoSuchNodeError:
        # Create a new group under "/" (root)
        meme_group = h5file.createGroup(
            "/", 'MEME', 'Data about MEME runs')
        # Create one table on it
        meme_em_table = h5file.createTable(
            meme_group, 'starts', MemeEM, "Info on MEME starts.")

    #
    # Examine data sets
    #
    data_sets = test_data.data_sets_for_options(options)
    fasta_filenames = [test_data.fasta_filenames[ds] for ds in data_sets]
    for ds, fasta in zip(data_sets, fasta_filenames):
        num_seqs, num_bases = test_data.get_data_set_size(ds)
        logging.info(
            'Analysing data set: %16s; # seqs=%5d; # bases=%7d; %s', ds, num_seqs, num_bases, fasta)

    #
    # set up parallel stuff
    #
    from IPython.kernel import client
    tc = client.TaskClient()

    #
    # pass tasks to engines
    #
    logging.info('Passing tasks to engines')
    task_ids = []
    task_args = dict()
    task_data_set = dict()
    for data_set, fasta in zip(data_sets, fasta_filenames):
        for seed, num_sites, score in test_data.starts[data_set]:
            # only pass task if we don't have data in the table already
            where_list = meme_em_table.getWhereList(
                '(dataset=="%s") & (cons0=="%s") & (nsites0==%d)' % (
                    data_set, seed, math.trunc(num_sites)
                )
            )
            if len(where_list) == 0:
                options.output_dir = os.path.abspath(
                    os.path.join('output', 'meme-em', '%s-%03d' % (seed, num_sites)))
                if not os.path.exists(options.output_dir):
                    os.makedirs(options.output_dir)
                args = (fasta, seed, num_sites, options)
                task = client.MapTask(
                    run_meme_on_start, (fasta, seed, num_sites, score, options))
                task_id = tc.run(task, block=False)
                task_ids.append(task_id)
                task_data_set[task_id] = data_set
                task_args[task_id] = args

    #
    # Get results from engines
    #
    logging.info('Blocking on %d results...', len(task_ids))
    meme_em = meme_em_table.row  # Fill the table with data
    for task_id in task_ids:
        start = tc.get_task_result(task_id, block=True)
        fasta, seed, num_sites, options = task_args[task_id]
        assert seed == start.cons0
        data_set = task_data_set[task_id]
        meme_em['dataset'] = data_set
        meme_em['cons0'] = start.cons0
        meme_em['nsites0'] = start.nsites0
        meme_em['niters'] = start.niters
        meme_em['em_time'] = start.em_time
        meme_em['cons'] = start.cons_after_em
        meme_em['nsites'] = start.nsites
        meme_em['sig'] = start.sig
        logging.info(
            '%s: cons0=%20s; nsites0=%3d; niters=%4d; elapsed=%7.1fs; per iteration=%6.2fs; cons=%20s; nsites=%3d; sig=%e',
            data_set, start.cons0, start.nsites0, start.niters, start.em_time,
            start.em_time / start.niters, start.cons, start.nsites, start.sig
        )
        meme_em.append()
    h5file.close()  # Close (and flush) the HDF5 file
Example #3
            epsilon_hamming[epsilon_index].append(hamming)
            epsilon_fraction_mismatch[epsilon_index].append(
                hamming / float(len(meme_row['cons'])))
            epsilon_mismatches[epsilon_index].append(hamming > 0)
            rel_speed = stem_row['duration'] / meme_row['em_time']
            iter_rel_speed = (
                rel_speed / stem_row['niters'] * meme_row['niters'])
            stem_num_iters.append(stem_row['niters'])
            meme_num_iters.append(meme_row['niters'])
            epsilon_rel_speed[epsilon_index].append(rel_speed)
            epsilon_iter_rel_speed[epsilon_index].append(iter_rel_speed)
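            # the per-width and per-size breakdowns below are only collected
            # for the default epsilon setting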
            if epsilon_index == default_epsilon_index:
                width_iter_rel_speed[W].append(iter_rel_speed)
                num_sites_iter_rel_speed[meme_row['nsites']].append(
                    iter_rel_speed)
                num_seqs, num_bases = test_data.get_data_set_size(data_set)
                stem_runtime_by_size[num_bases].append(
                    np.log10(stem_row['duration']))
                meme_runtime_by_size[num_bases].append(
                    np.log10(meme_row['em_time']))
                stem_itertime_by_size[num_bases].append(
                    np.log10(stem_row['duration']) - np.log10(stem_row['niters']))
                meme_itertime_by_size[num_bases].append(
                    np.log10(meme_row['em_time']) - np.log10(meme_row['niters']))


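# Collect the sorted epsilon indices and map them back to epsilon values,
# formatted as strings (presumably for use as plot tick labels)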
epsilon_indices = sorted(epsilon_mismatches.keys())
epsilon_range = np.arange(len(epsilon_indices))
epsilons = [index_to_epsilon(i) for i in epsilon_indices]
str_epsilons = ['%.1f' % e for e in epsilons]