import gzip
import cPickle
from os.path import join

import pandas

# models_in_folder, get_vocab_container, and run_model are assumed to be
# provided elsewhere in this project (e.g. a shared utils module).


def make_series(model_root_folder, plot_interval=100, limit=None, no_new=False,
                dont_average_embeddings=False, **run_model_args):
    average_embeddings = not dont_average_embeddings
    if average_embeddings:
        suffix = 'eval-averaged.pkl'
    else:
        suffix = 'eval-None-False.pkl'  # holdover from when we had include_synsets and normalize_components
    store_fname = join(model_root_folder, suffix)
    print store_fname
    try:
        stats = pandas.read_pickle(store_fname)
        print "read pickle from %s" % store_fname
    except Exception:
        stats = pandas.DataFrame()
        print 'created new frame'
    if no_new:
        return stats

    models = models_in_folder(model_root_folder)
    model_nums = sorted(models.keys())
    # the most recent model may still be in the process of being written,
    # so exclude it from plotting
    latest_num = model_nums[-1] if model_nums else -1

    print 'plotting every %i' % plot_interval
    to_plot = [n for n in model_nums
               if n % plot_interval == 0 and n != latest_num]
    if 1 in model_nums:
        to_plot = [1] + to_plot
    if limit is not None:
        to_plot = [n for n in to_plot if n <= limit]

    vocab_container = None
    print model_root_folder
    for n in to_plot:
        if n in stats.index:
            print 'already has %i' % n
            continue
        try:
            print 'loading %i' % n
            with gzip.open(models[n]) as f:
                model = cPickle.load(f)
        except Exception as e:
            print e
            continue
        # load the vocabulary if not already cached
        if not vocab_container:
            vocab_container = get_vocab_container(model)
        embeddings = model.embeddings if not average_embeddings else model.averaged_embeddings()
        this_stats = run_model(embeddings, vocab_container, **run_model_args)
        stats = pandas.concat([stats, pandas.DataFrame([this_stats], index=[n])]).sort_index()
        stats.to_pickle(store_fname)
    return stats
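# Minimal usage sketch for make_series. The folder path and keyword arguments
# below are hypothetical; extra keyword arguments are passed through to
# run_model, whose signature is defined elsewhere in this project.
def _example_make_series_usage():
    series = make_series('models/run-1', plot_interval=200, limit=1000)
    print series.tail()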
def dump_params_from_directory(model_directory, dump_filename='params.json'):
    try:
        models = models_in_folder(model_directory)
        # min() raises ValueError on an empty dict, so handle that alongside
        # StopIteration
        first_model = min(models)
        model_path = models[first_model]
        with gzip.open(model_path, 'rb') as f:
            model = cPickle.load(f)
    except (StopIteration, ValueError):
        print 'no models found in %s' % model_directory
        return
    except IOError as e:
        print 'IO error for %s' % model_directory
        print e
        return
    dump_params(model.other_params, join(model_directory, dump_filename))
def models_to_delete(directory, retain_every=10):
    models = models_in_folder(directory)
    if not models:
        print 'no models found'
        return {}, set()
    to_keep = set([n for n in models if n % retain_every == 0])
    # don't delete the first one, so we can use it as a reference point
    to_keep.add(min(models))
    # the last one may be currently being written
    to_keep.add(max(models))
    return dict((model, path) for model, path in models.items()
                if model not in to_keep), to_keep
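# Illustrative sketch of how models_to_delete might be consumed. The directory
# path is hypothetical, and actually removing the files with os.remove is an
# assumption about the intended use of the returned dict.
def _example_prune_checkpoints(directory='models/run-1', retain_every=10):
    import os
    to_delete, to_keep = models_to_delete(directory, retain_every=retain_every)
    print 'keeping %i checkpoints' % len(to_keep)
    for model_num, path in to_delete.items():
        print 'removing model %i at %s' % (model_num, path)
        os.remove(path)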
parser.add_argument('--semantic_blocks_to_run', type=int, default=1)

args = vars(parser.parse_args())

# if we're only running semantic or syntactic, rho and y init must be 0 to
# isolate the loss function to the syntactic or semantic loss
if args['dont_run_semantic'] or args['dont_run_syntactic']:
    print 'not running joint model, setting y and rho to 0'
    args['rho'] = 0
    args['y_init'] = 0

base_dir = args['base_dir']

# see if this model's already been run. If it has, load it and get the params
models = models_in_folder(base_dir)
if models:
    model_num = max(models.keys())
    print 'loading existing model %s' % models[model_num]
    with gzip.open(models[model_num]) as f:
        model = cPickle.load(f)
    model_loaded = True
    args = model.other_params

    # backward compatibility
    if 'vsgd' not in args:
        args['vsgd'] = False
    if 'simple_joint' not in args:
        args['simple_joint'] = False

    # rewrite in case we've copied the model file into this folder
    args['base_dir'] = base_dir
else:
parser.add_argument('--existing_semantic_model',
                    help='use this existing trained model as the semantic model')
parser.add_argument('--semantic_learning_rate', type=float, default=0.01)
parser.add_argument('--semantic_tensor_n_hidden', type=int, default=50)
# parser.add_argument('--semantic_block_size', type=int, default=100000)
# parser.add_argument('--sem_validation_num_nearest', type=int, default=50,
#                     help='when running semantic validation after each round, look at the '
#                          'intersection of top N words in wordnet and top N by embedding '
#                          'for a given test word')
# parser.add_argument('--sem_validation_num_to_test', type=int, default=500,
#                     help='in semantic validation after each round, the number of test '
#                          'words to sample')
parser.add_argument('--semantic_blocks_to_run', type=int, default=1)

args = vars(parser.parse_args())
print args

base_dir = args['base_dir']

# see if this model's already been run. If it has, load it and get the params
models = models_in_folder(base_dir)
if models:
    model_num = max(models.keys())
    print 'loading existing model %s' % models[model_num]
    with gzip.open(models[model_num]) as f:
        model = cPickle.load(f)
    model_loaded = True
    args = model.other_params
    # rewrite in case we've copied the model file into this folder
    args['base_dir'] = base_dir
else:
    model_loaded = False

# dump the params
with open(os.path.join(args['base_dir'], 'params.json'), 'w') as f:
    json.dump(args, f)
def load_models(base_path, indices=range(0, 1050, 50)):
    model_paths = utils.models_in_folder(base_path)
    return {
        index: load_model(path)
        for index, path in model_paths.iteritems()
        if index in indices
    }
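# Minimal usage sketch for load_models. The base path is hypothetical, and
# load_model / utils.models_in_folder are assumed to come from this project.
def _example_load_every_100th(base_path='models/run-1'):
    models_by_index = load_models(base_path, indices=range(0, 1050, 100))
    for index in sorted(models_by_index):
        print 'loaded checkpoint %i' % index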