Example #1
def init_level(name, level):
    """Initialize a given level of the search by saving all of the structures which need
    to be evaluated."""
    if not storage.exists(experiment_dir(name)):
        raise RuntimeError('Experiment %s not yet initialized.' % name)

    params = storage.load(params_file(name))
    splits = storage.load(splits_file(name))

    if level == 1:
        init_structures = ['g']
    else:
        init_structures = storage.load(winning_structure_file(name, level - 1))

    structure_pairs = list_structure_pairs(init_structures, params.rules, params.expand_noise)
    data_matrix = storage.load(data_file(name))
    X_train = data_matrix
    lab = None
    node_mat = np.zeros([params.num_splits * params.num_samples, 200, 200, 2])
    pruned_pairs = []
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # alternatively: config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    c = Classifier()
    real = tf.placeholder(shape=[None, 200, 200, 2], dtype=tf.float32)
    c_out = tf.reduce_mean(tf.nn.softmax(c(real), axis=-1), axis=0, keepdims=True)
    c_params = c.vars
    saver = tf.train.Saver(c_params)
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, "saved_model/d")
    top_k_op = tf.nn.top_k(c_out, 3)  # build the op once, outside the loop, so the graph doesn't grow per iteration
    for (init_structure, structure) in structure_pairs:
        for split_id in range(params.num_splits):
            for sample_id in range(params.num_samples):
                train_rows, train_cols, test_rows, test_cols = splits[split_id]
                X_train = data_matrix[train_rows[:, nax], train_cols[nax, :]]
                if level == 1:
                    init = X_train.sample_latent_values(np.zeros((X_train.m, X_train.n)), 1.)
                    prev_model = recursive.GaussianNode(init, 'scalar', 1.)
                else:
                    try:
                        prev_model = storage.load(samples_file(name, level - 1, init_structure, split_id, sample_id))
                    except Exception:
                        print("structure", grammar.pretty_print(init_structure), "does not exist; skipping")
                        continue
                    if isinstance(prev_model, recursive.Decomp):
                        prev_model = prev_model.root
                node, old_dist, rule = recursive.find_changed_node(prev_model, init_structure, structure)
                lab = labelize(rule)
                node_mat[split_id * params.num_samples + sample_id] = pad(random_shrink(node.value()))

        if_continue = sess.run(top_k_op, feed_dict={real: node_mat})
        if lab in if_continue.indices:
            print("transformation structure", grammar.pretty_print(init_structure), "->", grammar.pretty_print(structure), "i.e. lab", lab,
                  "included with top_k", if_continue)
            pruned_pairs.append((init_structure, structure))
        else:
            print("transformation structure", grammar.pretty_print(init_structure), "->", grammar.pretty_print(structure), "i.e. lab", lab,
                  "omitted, with top_k", if_continue)
    structure_pairs = pruned_pairs
    storage.dump(structure_pairs, structures_file(name, level))
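The snippet above relies on three helpers that are not shown: labelize, which is assumed to map a production rule to the classifier's integer class index, plus pad and random_shrink, which prepare each node's value matrix as a (200, 200, 2) classifier input. A minimal sketch of the latter two under those assumptions (the originals may differ):

import numpy as np

def random_shrink(mat, min_size=50, max_size=200):
    # Hypothetical: randomly crop a 2-D matrix to a square patch.
    # Assumes mat is at least min_size on each side.
    size = np.random.randint(min_size, min(max_size, min(mat.shape)) + 1)
    i = np.random.randint(mat.shape[0] - size + 1)
    j = np.random.randint(mat.shape[1] - size + 1)
    return mat[i:i + size, j:j + size]

def pad(mat, target=200):
    # Hypothetical: zero-pad into the (target, target, 2) input tensor;
    # channel 0 carries the values, channel 1 a validity mask.
    out = np.zeros([target, target, 2])
    m, n = mat.shape
    out[:m, :n, 0] = mat
    out[:m, :n, 1] = 1.0
    return out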
Example #2
def initialize(data_matrix, root, old_structure, new_structure, num_iter=200):
    root = root.copy()
    if old_structure == new_structure:
        return root
    node, old_dist, rule = recursive.find_changed_node(root, old_structure, new_structure)

    old = root.value()

    # if we're replacing the root, pass on the observation model; otherwise, treat
    # the node we're factorizing as exact real-valued observations
    if node is root:
        inner_data_matrix = data_matrix
    else:
        row_ids = recursive.row_ids_for(data_matrix, node)
        col_ids = recursive.col_ids_for(data_matrix, node)
        m_orig, n_orig = recursive.orig_shape_for(data_matrix, node)
        frv = observations.DataMatrix.from_real_values
        inner_data_matrix = frv(node.value(), row_ids=row_ids, col_ids=col_ids,
                                m_orig=m_orig, n_orig=n_orig)

    print('Initializing %s from %s...' % (grammar.pretty_print(new_structure), grammar.pretty_print(old_structure)))

    if rule == grammar.parse("gg+g"):
        new_node = init_low_rank(inner_data_matrix, num_iter=num_iter)
    elif rule == grammar.parse("mg+g"):
        isotropic = (node is root)
        new_node = init_row_clustering(inner_data_matrix, isotropic, num_iter=num_iter)
    elif rule == grammar.parse("gM+g"):
        isotropic = (node is root)
        new_node = init_col_clustering(inner_data_matrix, isotropic, num_iter=num_iter)
    elif rule == grammar.parse("bg+g"):
        new_node = init_row_binary(inner_data_matrix, num_iter=num_iter)
    elif rule == grammar.parse("gB+g"):
        new_node = init_col_binary(inner_data_matrix, num_iter=num_iter)
    elif rule == grammar.parse("cg+g"):
        new_node = init_row_chain(inner_data_matrix, num_iter=num_iter)
    elif rule == grammar.parse("gC+g"):
        new_node = init_col_chain(inner_data_matrix, num_iter=num_iter)
    elif rule == grammar.parse("s(g)"):
        new_node = init_sparsity(inner_data_matrix, node.variance_type, num_iter=num_iter)
    else:
        raise RuntimeError('Unknown production rule: %s ==> %s' % (grammar.pretty_print(old_dist),
                                                                   grammar.pretty_print(rule)))

    root = recursive.splice(root, node, new_node)

    if isinstance(data_matrix.observations, observations.RealObservations):
        assert np.allclose(root.value()[data_matrix.observations.mask], old[data_matrix.observations.mask])

    return root
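A minimal usage sketch, assuming prev_root is a model already fitted for the old structure and the structures are given in the grammar's string form (the names here are illustrative, not a confirmed API):

old = grammar.parse('g')
new = grammar.parse('gg+g')
root = initialize(data_matrix, prev_root, old, new, num_iter=200)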
Example #4
def print_components(model, structure, row_or_col, items, outfile=sys.stdout):
    cluster_members = collections.defaultdict(list)
    if model == 'clustering':
        for item in items:
            z = item.z if np.isscalar(item.z) else item.z.argmax()
            cluster_members[z].append(item.label)

        component_type, component_type_pl = 'Cluster', 'clusters'
    elif model == 'binary':
        for item in items:
            for i, zi in enumerate(item.z):
                if zi:
                    cluster_members[i].append(item.label)
        component_type, component_type_pl = 'Component', 'components'

    cluster_ids = sorted(cluster_members.keys(),
                         key=lambda k: len(cluster_members[k]),
                         reverse=True)

    row_col_str = {'row': 'row', 'col': 'column'}[row_or_col]
    print('For structure %s, the following %s %s were found:' % \
          (grammar.pretty_print(structure), row_col_str, component_type_pl), file=outfile)
    print(file=outfile)

    for i, cid in enumerate(cluster_ids):
        print('    %s %d:' % (component_type, i + 1), file=outfile)
        print(file=outfile)
        for label in cluster_members[cid]:
            print('        %s' % label, file=outfile)
        print(file=outfile)
    print(file=outfile)
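print_components only reads .z and .label from each item, so a namedtuple is enough for illustration. A hedged usage sketch (the structure argument is arbitrary here):

import collections
import numpy as np

Item = collections.namedtuple('Item', ['z', 'label'])
items = [Item(z=np.array([0.1, 0.9]), label='item A'),
         Item(z=1, label='item B')]
print_components('clustering', grammar.parse('mg+g'), 'row', items)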
def collect_scores(name, level, structure):
    """Collect the held-out predictive log-likelihood scores for all CV splits and
    order them according to the indices of the original data matrix."""
    params = storage.load(params_file(name))
    splits = storage.load(splits_file(name))

    row_loglik_all = []
    col_loglik_all = []
    failed = False

    for split_id, (train_rows, train_cols, test_rows, test_cols) in enumerate(splits):
        row_loglik_curr, col_loglik_curr = [], []
        num_samples = params.num_samples
        for sample_id in range(num_samples):
            try:
                row_loglik_single, col_loglik_single = storage.load(scores_file(name, level, structure, split_id, sample_id))
            except Exception:
                row_loglik_single = np.nan * np.ones(len(test_rows))
                col_loglik_single = np.nan * np.ones(len(test_cols))
                failed = True
            row_loglik_curr.append(row_loglik_single)
            col_loglik_curr.append(col_loglik_single)

        row_loglik_all.append(np.array(row_loglik_curr))
        col_loglik_all.append(np.array(col_loglik_curr))

    if failed:
        print(termcolor.colored('    failed: %s' % grammar.pretty_print(structure), 'red'))

    storage.dump((row_loglik_all, col_loglik_all), collected_scores_file(name, level, structure))
Example #7
def collect_scores(name, level, structure):
    """Collect the held-out predictive log-likelihood scores for all CV splits and
    order them according to the indices of the original data matrix."""
    params = storage.load(params_file(name))
    splits = storage.load(splits_file(name))

    row_loglik_all = []
    col_loglik_all = []
    failed = False

    for split_id, (train_rows, train_cols, test_rows, test_cols) in enumerate(splits):
        row_loglik_curr, col_loglik_curr = [], []
        num_samples = params.num_samples
        for sample_id in range(num_samples):
            try:
                row_loglik_single, col_loglik_single = storage.load(scores_file(name, level, structure, split_id, sample_id))
            except FileNotFoundError:
                return
            except Exception:
                row_loglik_single = np.nan * np.ones(len(test_rows))
                col_loglik_single = np.nan * np.ones(len(test_cols))
                failed = True
            row_loglik_curr.append(row_loglik_single)
            col_loglik_curr.append(col_loglik_single)

        row_loglik_all.append(np.array(row_loglik_curr))
        col_loglik_all.append(np.array(col_loglik_curr))

    if failed:
        print(termcolor.colored('    failed: %s' % grammar.pretty_print(structure), 'red'))

    storage.dump((row_loglik_all, col_loglik_all), collected_scores_file(name, level, structure))
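Downstream code would load the dumped pair and aggregate over samples; a hedged sketch of one plausible aggregation, using NaN-aware means so failed samples are skipped rather than poisoning the average:

import numpy as np

def average_loglik(loglik_all):
    # loglik_all: list of (num_samples, num_test_items) arrays, one per split.
    per_item = np.concatenate([np.nanmean(arr, axis=0) for arr in loglik_all])
    return np.nanmean(per_item)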
def print_scores(level, model_scores, outfile=sys.stdout):
    print('The following are the top-scoring structures for level %d:' % level, file=outfile)
    print(file=outfile)
    print('%30s%10s%10s%13s%13s%13s%10s%10s' %
          ('structure', 'row', 'col', 'total', 'row impvt.', 'col impvt.', 'z (row)', 'z (col)'),
          file=outfile)
    print(file=outfile)
    for ms in model_scores:
        print('%30s%10.2f%10.2f%13.2f%13.2f%13.2f%10.2f%10.2f' %
              (grammar.pretty_print(ms.structure), ms.row_score, ms.col_score, ms.total,
               ms.row_improvement, ms.col_improvement, ms.z_score_row, ms.z_score_col),
              file=outfile)
    print(file=outfile)
    print(file=outfile)
def print_model_sequence(model_scores, outfile=sys.stdout):
    print("Here are the best-performing structures in each level of the search:", file=outfile)
    print(file=outfile)
    print('%10s%25s%13s%13s%10s%10s' %
          ('level', 'structure', 'row impvt.', 'col impvt.', 'z (row)', 'z (col)'),
          file=outfile)
    print(file=outfile)
    for i, ms in enumerate(model_scores):
        print('%10d%25s%13.2f%13.2f%10.2f%10.2f' %
              (i + 1, grammar.pretty_print(ms.structure), ms.row_improvement, ms.col_improvement,
               ms.z_score_row, ms.z_score_col),
              file=outfile)
    print(file=outfile)
    print(file=outfile)
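The ms objects only need the eight fields read above; a namedtuple makes a sufficient stand-in for a quick check (illustrative, not the real type):

import collections

ModelScore = collections.namedtuple(
    'ModelScore', ['structure', 'row_score', 'col_score', 'total',
                   'row_improvement', 'col_improvement',
                   'z_score_row', 'z_score_col'])
print_scores(1, [ModelScore(grammar.parse('gg+g'), -1.2, -0.8, -2.0,
                            0.5, 0.3, 2.1, 1.7)])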
def print_running_times(running_times, outfile=sys.stdout):
    total = sum(rt.total_time for rt in running_times)
    print('Total CPU time was %s. Here is the breakdown:' % format_time(total), file=outfile)
    print(file=outfile)
    print('%30s%8s        %s' % ('structure', 'level', 'time'), file=outfile)
    print(file=outfile)
    running_times = sorted(running_times, key=lambda rt: rt.total_time, reverse=True)
    for rt in running_times:
        time_str = '%d  x  %s' % (rt.num_samples, format_time(rt.total_time / rt.num_samples))
        print('%30s%8d        %s' % (grammar.pretty_print(rt.structure), rt.level, time_str), file=outfile)
    print(file=outfile)
    print(file=outfile)
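format_time is used above but never defined in these snippets; a hypothetical minimal version (the original may format differently):

def format_time(seconds):
    # Render a duration in seconds as hours/minutes/seconds.
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    return '%dh %dm %ds' % (h, m, s) if h else '%dm %ds' % (m, s)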
Example #14
def print_learned_structures(results, outfile=sys.stdout):
    def sortkey(result):
        return result.expt_name.split('_')[-1]

    results = sorted(results, key=sortkey)

    print('The learned structures:', file=outfile)
    print(file=outfile)
    print('%25s%25s' % ('experiment', 'structure'), file=outfile)
    print(file=outfile)
    for r in results:
        print('%25s%25s' % (r.expt_name, grammar.pretty_print(r.structure)),
              file=outfile)
    print(file=outfile)
    print(file=outfile)
def print_failed_structures(failures, outfile=sys.stdout):
    if failures:
        print('The inference algorithms failed for the following structures:', file=outfile)
        print(file=outfile)
        print('%30s%8s        %s' % ('structure', 'level', 'notes'), file=outfile)
        print(file=outfile)
        for f in failures:
            line = '%30s%8d        ' % (grammar.pretty_print(f.structure), f.level)
            if f.name:
                line += '(for %s)  ' % f.name
            if not f.all_failed:
                line += '(only some jobs failed)  '
            print(line, file=outfile)
        print(file=outfile)
        print(file=outfile)
Example #17
def print_failed_structures(failures, outfile=sys.stdout):
    if failures:
        print('The algorithm failed for the following structures:',
              file=outfile)
        print(file=outfile)
        print('%30s%8s        %s' % \
              ('structure', 'level', 'notes'), file=outfile)
        print(file=outfile)
        for f in failures:
            line = '%30s%8d        ' % (grammar.pretty_print(
                f.structure), f.level)
            if f.name:
                line += '(for %s)  ' % f.name
            if not f.all_failed:
                line += '(only some splits failed)  '
            print(line, file=outfile)
        print(file=outfile)
        print(file=outfile)
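A hedged usage sketch: each failure record only needs .structure, .level, .name, and .all_failed, so a namedtuple stands in for the real type:

import collections

Failure = collections.namedtuple('Failure',
                                 ['structure', 'level', 'name', 'all_failed'])
print_failed_structures([Failure(grammar.parse('gg+g'), 2, 'expt1', False)])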
Example #19
def sweep(data_matrix, root, num_iter=100, maximize=False):
    samplers = get_samplers(data_matrix, root, maximize)

    if num_iter > 1:
        print('Dumb Gibbs sampling on %s...' % grammar.pretty_print(root.structure()))
        pbar = misc.pbar(num_iter)
    else:
        pbar = None
        
    for it in range(num_iter):
        for sampler in samplers:
            if sampler.preserves_root_value():
                old = root.value()
            sampler.step()
            if sampler.preserves_root_value():
                assert np.allclose(old, root.value())

        if pbar is not None:
            pbar.update(it)
    if pbar is not None:
        pbar.finish()
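A minimal usage sketch: sweep mutates root in place through the samplers returned by get_samplers, which are assumed to expose step() and preserves_root_value() as used above; maximize=True presumably switches from sampling to greedy updates (an assumption, not confirmed here).

sweep(data_matrix, root, num_iter=100, maximize=False)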
def format_structure(structure, latex=False):
    if latex:
        return '$' + grammar.pretty_print(structure).upper().replace("'", "^T") + '$'
    else:
        return grammar.pretty_print(structure)
def pretty_print(structure):
    return grammar.pretty_print(structure, False, False)
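A quick usage sketch for the two formatters; the exact rendered string depends on grammar.pretty_print's conventions:

s = grammar.parse('gg+g')
print(format_structure(s))              # plain text form
print(format_structure(s, latex=True))  # wrapped in $...$, with ' replaced by ^T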