Example #1
def optimize_parameter_lbfgs(model, param_name, f, g, bounds=(1e-4, None), disp=0, max_evals=100):
    from scipy.optimize import fmin_l_bfgs_b

    p = ModelParameterAcessor(model, param_name)

    # Scipy expects function parameters to be 1d, so we have to ravel/unravel the parameter values for each
    # evaluation
    def eval_f(param_as_list):
        old_value = p.get()  # Save old
        p.set_flattened(param_as_list)  # Set new
        f_val = f()
        p.set(old_value)  # Restore old value
        return -f_val

    def eval_g(param_as_list):
        old_value = p.get()  # Save old
        p.set_flattened(param_as_list)  # Set new
        g_val = ravel(g())
        p.set(old_value)  # Restore old value
        return -g_val

    x0 = ravel(p.get())
    # Replicate the (lower, upper) bound for every element of the flattened
    # parameter; the default lower bound of 1e-4 keeps the parameter positive.
    bounds = [bounds] * len(x0)

    old_f_val = f()
    x, new_f_val, d = fmin_l_bfgs_b(eval_f, x0, fprime=eval_g, bounds=bounds, maxfun=max_evals, disp=disp)
    p.set_flattened(x)
    new_f_val = f()
    log.info('Optimized %s; improvement: %g' % (param_name, new_f_val - old_f_val))
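The optimizer examples on this page all depend on a ModelParameterAcessor helper that is not shown here. Below is a minimal sketch of what it presumably provides, assuming each parameter is a numpy array stored as an attribute of the model; the class and method names come from the call sites, but the implementation is a guess:

import numpy as np

class ModelParameterAcessor(object):
    """Hypothetical get/set wrapper around one named numpy parameter of a model."""

    def __init__(self, model, param_name):
        self.model = model
        self.param_name = param_name

    def get(self):
        # Return a copy so callers can safely restore the old value later
        return np.array(getattr(self.model, self.param_name), copy=True)

    def set(self, value):
        setattr(self.model, self.param_name, value)

    def set_flattened(self, flat_values):
        # Unravel the 1d vector that scipy.optimize passes around back into
        # the parameter's original shape before storing it on the model
        shape = self.get().shape
        self.set(np.asarray(flat_values, dtype=float).reshape(shape))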
Example #2
    def __init__(self, argv=None, kw=None):
        self.output_files = []

        if kw is not None and argv is not None:
            raise ValueError('Provide at most one of the argv or kw kwargs')

        # If the class doesn't specify 'binary', use the python file in which the class was defined.
        if self.binary is None:
            self.binary = os.path.abspath(inspect.getfile(self.__class__))
            log.info('Guessing binary %s' % self.binary)

        if not os.path.isfile(self.binary):
            raise Exception('Unable to locate binary %s for condorizable job' % self.binary)

        if kw is not None:
            argv = [self.binary] + kwargs_to_argv(kw)
        elif argv is not None:
            argv = list(argv)

        # Install the sigterm handler
        signal.signal(signal.SIGTERM, self.sigterm_handler)

        self.argv = argv
        if self.argv is not None:
            self.parse_argv_and_run(self.argv)
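kwargs_to_argv is not defined on this page either. Judging from the call site above, it plausibly turns a keyword dict into command-line flags, along these lines (a hedged sketch, not the original implementation):

def kwargs_to_argv(kw):
    # Hypothetical: {'corpus': 'a.dat', 'T': 10} -> ['--T', '10', '--corpus', 'a.dat']
    argv = []
    for key, value in sorted(kw.items()):
        argv.extend(['--%s' % key, str(value)])
    return argv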
Example #3
    def parse_argv_and_run(self, argv=None):
        if argv is not None:
            self.argv = argv

        condorize = self.CONDOR_FLAG in self.argv
        if condorize:
            self.argv.remove(self.CONDOR_FLAG)
        log_output = self.CONDOR_LOG_FLAG in self.argv
        if log_output:
            self.argv.remove(self.CONDOR_LOG_FLAG)
            if not condorize:
                raise Exception('Flag %s only applies to condor jobs' % self.CONDOR_LOG_FLAG)

        # Check the arguments, even if this is a condor job being started.  This allows condor jobs to fail fast,
        # rather than dying on a remote node.
        options = self.check_args(self.argv)
        if options is None:
            raise Exception('check_args function must return options structure; got None instead')

        for filename in self.output_files:
            self.check_output_file_is_unlocked(filename)

        if condorize:
            log.info('Condorizing %s' % ' '.join(self.argv))
            self.run_on_condor(self.argv, log_output=log_output)
            return

        try:
            self.lock_output_files_or_die()
            self.run(options)
        finally:
            self.on_exit()
Example #5
def run(argv):
    parser = ArgumentParser()
    parser.add_argument('vem_model', type=str, help='SAM VEM model to use features from')
    parser.add_argument('-c', type=float, default=1.0, help='SVM C parameter')
    options = parser.parse_args(argv[1:])

    log.info('Loading SAM model %s' % options.vem_model)

    sam_model = VEMModel.load(options.vem_model)
    log.info('Making dataset')
    dataset = make_dataset(sam_model)

    metric = ClassificationError()
    scores = []
    for i in range(20):
        train_data, test_data = dataset.split(p=0.90, seed=i)

        topic_svm = TopicSVM(sam_model, C=options.c, normalize=True)
        topic_svm.train(train_data)

        predictions = topic_svm.predict(test_data)
        score = metric(test_data.targets, predictions)
        log.info(score)
        scores.append(score)
    log.info('Mean classification error: %g' % np.mean(scores))
Example #7
    def run(self, options):
        labeler = labelers.registry[options.labeler]

        # Compute every label up front so the ArffWriter can be constructed
        # with the complete class list
        filenames = [line.strip() for line in open(options.file_list)]
        labels = [labeler(each) for each in filenames]
        class_list = sorted(set(labels))

        writer = ArffWriter(options.dest, class_list=class_list)
        log.info('Writing GIST data to %s' % options.dest)

        for i, (filename, label) in enumerate(izip(filenames, labels)):
            log.info('Processing image %d/%d' % (i+1, len(filenames)))

            descriptor = color_gist(filename) if options.color else grayscale_gist(filename)

            if options.normalize:
                descriptor = l2_normalize(descriptor)
            writer.write_example(descriptor, label)
        writer.close()
Example #8
    def run(self, options):
        labeler = None if options.labeler is None else labelers.registry[options.labeler]

        # Wait to instantiate the corpus writer until we know the dimensionality of the descriptors we'll be writing
        writer = None
        log.info('Writing SAM corpus to %s' % options.dest_corpus)

        filenames = open(options.file_list).readlines()
        for i, filename in enumerate(filenames):
            filename = filename.strip()
            log.info('Processing image %d/%d' % (i+1, len(filenames)))

            descriptor = color_gist(filename) if options.color else grayscale_gist(filename)
            if writer is None:
                dim = descriptor.size
                writer = CorpusWriter(options.dest_corpus, data_series='sam', dim=dim)

            normalized_descriptor = l2_normalize(descriptor)
            doc_label = labeler(filename) if labeler else None
            writer.write_doc(ascolvector(normalized_descriptor), name=filename, label=doc_label)

        if writer is not None:
            writer.close()
Example #9
def optimize_parameter(model, param_name, f, g, bounds=(1e-4, None), disp=0, max_evals=100):
    from scipy.optimize import fmin_tnc

    p = ModelParameterAcessor(model, param_name)

    # Scipy expects function parameters to be 1d, so we have to ravel/unravel the parameter values for each
    # evaluation
    def negative_f_and_f_prime(param_as_list):
        old_value = p.get()  # Save old
        p.set_flattened(param_as_list)  # Set new
        f_val = -f()
        f_prime_val = ravel(-g())
        p.set(old_value)  # Restore old value
        return f_val, f_prime_val

    x0 = ravel(p.get())
    bounds = [bounds] * len(x0)

    old_f_val = f()
    x, nfeval, rc = fmin_tnc(negative_f_and_f_prime, x0=x0, bounds=bounds, disp=disp, maxfun=max_evals)
    p.set_flattened(x)
    new_f_val = f()
    log.info('Optimized %s; improvement: %g' % (param_name, new_f_val - old_f_val))
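No call site for these optimizers appears on this page; usage presumably looks something like the following, borrowing the bound objective/gradient names from Example #12 (a hypothetical snippet, not from the source):

# Hypothetical usage: maximize l_xi over the model's 'xi' parameter in place,
# using the analytic gradient grad_l_xi.
optimize_parameter(model, 'xi', f=model.l_xi, g=model.grad_l_xi)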
Example #10
def optimize_parameter_lbfgs_coor(model, f, g, bounds=(1e-4, None), disp=0, max_evals=10):
    from scipy.optimize import fmin_l_bfgs_b

    doc_x = ModelParameterAcessor(model, 'x')
    topic_delta = ModelParameterAcessor(model, 'delta')

    # Scipy expects function parameters to be 1d, so we have to ravel/unravel the parameter values for each
    # evaluation
    def eval_f(param_as_list):
        old_value_doc_x = doc_x.get()  # Save old
        old_value_topic_delta = topic_delta.get()
        doc_x.set_flattened(param_as_list[:model.num_docs * model.dim])  # Set new
        topic_delta.set_flattened(param_as_list[model.num_docs * model.dim:])
        f_val = -f()
        doc_x.set(old_value_doc_x)  # Restore old value
        topic_delta.set(old_value_topic_delta)
        return f_val

    def eval_g(param_as_list):
        old_value_doc_x = doc_x.get()  # Save old
        old_value_topic_delta = topic_delta.get()
        doc_x.set_flattened(param_as_list[:model.num_docs * model.dim])  # Set new
        topic_delta.set_flattened(param_as_list[model.num_docs * model.dim:])
        f_prime_val = -g()
        doc_x.set(old_value_doc_x)  # Restore old value
        topic_delta.set(old_value_topic_delta)
        return f_prime_val

    x0 = np.concatenate([ravel(doc_x.get()), ravel(topic_delta.get())])
    # Note: unlike the single-parameter version above, no bounds are passed to
    # fmin_l_bfgs_b below, so the joint (x, delta) optimization is unconstrained.

    old_f_val = f()
    x, new_f_val, d = fmin_l_bfgs_b(eval_f, x0, fprime=eval_g, maxfun=max_evals, disp=disp)
    doc_x.set_flattened(x[:model.num_docs * model.dim])
    topic_delta.set_flattened(x[model.num_docs * model.dim:])
    new_f_val = f()
    log.info('Optimized %s; improvement: %g' % ('x,delta', new_f_val - old_f_val))
Example #11
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = ArgumentParser()
    parser.add_argument('input_file', type=str, help='Input file in evidence format')
    parser.add_argument('output_file', type=str, help='Path to destination corpus file')
    parser.add_argument('--labeler', type=str, help='Labeler to apply')
    options = parser.parse_args(argv[1:])

    labeler = None
    if options.labeler is None:
        log.warning('no labeler provided')
    elif options.labeler not in labelers.registry:
        labeler_names = ', '.join(sorted(labelers.registry.keys()))
        parser.error('Invalid labeler "%s"; available options are %s' % (options.labeler, labeler_names))
    else:
        labeler = labelers.registry[options.labeler]

    instance_dict = load_evidence_file(options.input_file)
    num_docs = len(instance_dict)
    feature_ids = sorted(set(chain(*[each.iterkeys() for each in instance_dict.values()])))
    vocab_size = len(feature_ids)
    log.info('Read %d docs (vocabulary size %d) from %s' % (num_docs, vocab_size, options.input_file))

    log.info('Writing L2-normalized corpus to %s' % options.output_file)
    writer = CorpusWriter(options.output_file, data_series='sam', dim=vocab_size)

    # Create a map of feature_id => dense feature index
    feature_index = {k:i for i, k in enumerate(feature_ids)}

    # For each document, convert sparse features to dense L2-normalized feature vector and write it to the corpus
    for name, sparse_features in instance_dict.iteritems():
        doc_data = np.zeros((vocab_size, 1))
        for id, count in sparse_features.iteritems():
            doc_data[feature_index[id]] = count
        doc_data = l2_normalize(doc_data)
        doc_label = labeler(name) if labeler else None

        writer.write_doc(doc_data, name=name, label=doc_label)
    writer.close()

    wordlist_path = options.output_file + '.wordlist'
    log.info('Writing wordlist to %s' % wordlist_path)
    with open(wordlist_path, 'w') as f:
        f.writelines([s + '\n' for s in feature_ids])
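Several examples call l2_normalize without defining it. A minimal sketch of the presumed helper (the zero-norm guard is an assumption):

import numpy as np

def l2_normalize(v):
    # Hypothetical: scale a vector (or column vector) to unit Euclidean norm.
    norm = np.linalg.norm(v)
    return v / norm if norm > 0 else v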
Example #12
def check_grads(model):
    assert np.isfinite(model.l_alpha())
    assert np.isfinite(model.l_valpha())

    x = model.grad_l_vmu()
    assert np.isfinite(x).all()

    import pdb
    try:
        # Main update rules
        log.info('xi update: %s' % check_grad(model, 'xi', model.l_xi, model.grad_l_xi))
        log.info('valpha update: %s' % check_grad(model, 'valpha', model.l_valpha, model.grad_l_valpha))
        log.info('alpha update: %s' % check_grad(model, 'alpha', model.l_alpha, model.grad_l_alpha))

        log.info('vmu update: %s' % check_grad(model, 'vmu', model.l_vmu, model.tangent_grad_l_vmu))

        f = lambda: avk(model.V, model.xi)
        g = lambda: deriv_avk(model.V, model.xi)
        log.info('avk_xi: %s' % check_grad(model, 'xi', f, g))

        f = lambda: np.sum(model.e_squared_norm_batch())
        g = lambda: np.sum(model.grad_e_squared_norm_xi())
        log.info('grad_esn_xi: %s' % check_grad(model, 'xi', f, g))

        f = lambda: np.sum(model.rho_batch())
        g = lambda: np.sum(model.deriv_rho_xi())
        log.info('deriv_rho_xi: %s' % check_grad(model, 'xi', f, g))

    except Exception, e:
        log.error(e)
        pdb.post_mortem()
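The check_grad used above takes a model, a parameter name, and bound objective/gradient callables, but its body is not shown. A standard central finite-difference check would fit these call sites; here is a sketch, assuming the ModelParameterAcessor interface sketched after Example #1:

import numpy as np
from numpy import ravel

def check_grad(model, param_name, f, g, eps=1e-6):
    # Hypothetical checker: compare the analytic gradient g() against a
    # central finite-difference estimate of f() and return the largest
    # absolute discrepancy.
    p = ModelParameterAcessor(model, param_name)
    x0 = ravel(p.get()).copy()
    analytic = ravel(g()).copy()
    numeric = np.zeros_like(x0)
    for i in range(len(x0)):
        x = x0.copy()
        x[i] += eps
        p.set_flattened(x)
        f_plus = f()
        x[i] -= 2 * eps
        p.set_flattened(x)
        f_minus = f()
        numeric[i] = (f_plus - f_minus) / (2 * eps)
    p.set_flattened(x0)  # Restore the original parameter value
    return np.max(np.abs(analytic - numeric))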
Example #13
    def run(self, options):
        if os.path.exists(options.model):
            log.info('Loading model snapshot from %s' % options.model)
            model = VEMModel.load(options.model)
        else:
            # Initialize a model from scratch
            log.info('Initializing new model on %s [T=%d]' % (options.corpus, options.T))
            reader = CorpusReader(options.corpus, data_series='sam')
            model = VEMModel(reader=reader, T=options.T)

        while model.iteration < options.iterations:
            log.info('** Iteration %d / %d **' % (model.iteration + 1, options.iterations))
            model.run_one_iteration()

            if model.iteration % SAVE_MODEL_INTERVAL == 0:
                log.info('Saving model snapshot...')
                model.save(options.model)

            if model.iteration % SAVE_TOPICS_INTERVAL == 0:
                if options.write_topics:
                    log.info('Saving topics to %s' % options.write_topics)
                    with open(options.write_topics, 'w') as f:
                        model.write_topics(f)

                if options.write_topic_weights:
                    log.info('Saving topic weights to %s' % options.write_topic_weights)
                    with open(options.write_topic_weights, 'w') as f:
                        model.write_topic_weights_arff(f)

        if options.write_topics:
            log.info('Saving topics to %s' % options.write_topics)
            with open(options.write_topics, 'w') as f:
                model.write_topics(f)

        if options.write_topic_weights:
            log.info('Saving topic weights to %s' % options.write_topic_weights)
            with open(options.write_topic_weights, 'w') as f:
                model.write_topic_weights_arff(f)
        model.save(options.model)
Example #15
    def on_exit(self):
        for output_file in self.output_files:
            lock_file = self.get_lock_file_for(output_file)
            if os.path.isfile(lock_file):
                log.info('Removing lock file %s' % lock_file)
                os.remove(lock_file)
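get_lock_file_for (also used by the locking helpers referenced in Example #3) is not defined on this page. A plausible convention, offered only as a guess:

    def get_lock_file_for(self, output_file):
        # Hypothetical convention: each output file is guarded by a sibling
        # '<output>.lock' file created while a job holds it.
        return output_file + '.lock'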