def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    is_train = FLAGS.is_train
    is_tuning_hyper_para = FLAGS.is_tuning_hyper_para

    if is_train:
        ks = FLAGS.ks
        k_list = [int(k.strip()) for k in ks.split(',')]
        compute_prior_posterior_prob(k_list=k_list,
                                     opt_hyper_para=is_tuning_hyper_para)
    else:
        model_dir = FLAGS.model_dir
        k = FLAGS.pred_k
        output_file = FLAGS.output_file
        top_k = FLAGS.top_k

        model_type = FLAGS.model_type
        batch_size = FLAGS.batch_size
        num_readers = FLAGS.num_readers
        feature_names = FLAGS.feature_names
        feature_sizes = FLAGS.feature_sizes

        test_data_pattern = FLAGS.test_data_pattern
        reader = get_reader(model_type, feature_names, feature_sizes)
        test_data_pipeline = DataPipeline(reader=reader,
                                          data_pattern=test_data_pattern,
                                          batch_size=batch_size,
                                          num_readers=num_readers)

        train_data_pattern = FLAGS.train_data_pattern
        inner_reader = get_reader(model_type, feature_names, feature_sizes)
        train_data_pipeline = DataPipeline(reader=inner_reader,
                                           data_pattern=train_data_pattern,
                                           batch_size=batch_size,
                                           num_readers=num_readers)

        pred_obj = Predict(train_data_pipeline, model_dir, k=k)
        pred_obj.make_predictions(test_data_pipeline, output_file, top_k=top_k)
def _read_text_from_file(self, file):
    '''Read a file using the matching reader from readers.py.'''
    filetype = os.path.splitext(file)[1]
    reader = get_reader(filetype)
    try:
        return reader(file).read()
    except NotImplementedError:
        self._errors.append(
            '"{}" files are not supported.'.format(filetype))
        return ''
def main(unused_argv):
    logging.set_verbosity(tf.logging.INFO)

    reader = get_reader(FLAGS.feature_names, FLAGS.feature_sizes,
                        FLAGS.frame_features)

    if FLAGS.output_file == "":
        raise ValueError("'output_file' was not specified. "
                         "Unable to continue with inference.")

    if FLAGS.test_data_pattern == "":
        raise ValueError("'test_data_pattern' was not specified. "
                         "Unable to continue with inference.")

    inference(reader, FLAGS.train_dir, FLAGS.test_data_pattern,
              FLAGS.output_file, FLAGS.batch_size, FLAGS.top_k)
def load_data(fn, options, max_chars=None):
    read = get_reader(options.input_format)
    texts, labels = [], []
    with open(fn) as f:
        for ln, (text, text_labels) in enumerate(read(f, fn), start=1):
            if options.multiclass and not text_labels:
                raise ValueError(f'missing label on line {ln} in {fn}: {text}')
            elif options.multiclass and len(text_labels) > 1:
                raise ValueError(f'multiple labels on line {ln} in {fn}: {text}')
            texts.append(text[:max_chars])
            labels.append(text_labels)
    print(f'loaded {len(texts)} examples from {fn}', file=sys.stderr)
    return texts, labels
def main(_):
    logging.set_verbosity(logging.INFO)

    # Where training checkpoints are stored.
    train_model_dirs = FLAGS.train_model_dirs
    out_file_location = FLAGS.output_file
    top_k = FLAGS.top_k

    test_data_pattern = FLAGS.test_data_pattern
    model_type, feature_names, feature_sizes = (FLAGS.model_type,
                                                FLAGS.feature_names,
                                                FLAGS.feature_sizes)
    reader = get_reader(model_type, feature_names, feature_sizes)
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    train_model_dirs_list = [e.strip() for e in train_model_dirs.split(',')]

    # Get test data.
    test_data_pipeline = DataPipeline(reader=reader,
                                      data_pattern=test_data_pattern,
                                      batch_size=batch_size,
                                      num_readers=num_readers)

    # Make inference.
    inference = BootstrapInference(train_model_dirs_list)
    inference.transform(test_data_pipeline, out_file_location, top_k=top_k)
def main(unused_argv):
    # Load the cluster/task configuration from the environment.
    env = json.loads(os.environ.get("TF_CONFIG", "{}"))
    cluster_data = env.get("cluster", None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None

    # Default to a single-machine master task if no task is specified.
    task_data = env.get("task", None) or {"type": "master", "index": 0}
    task = type("TaskSpec", (object,), task_data)

    logging.set_verbosity(tf.logging.INFO)
    logging.info(f"{str_task(task)}: Tensorflow version: {tf.__version__}.")

    # Dispatch to a master/worker trainer or a parameter server.
    if not cluster or task.type == "master" or task.type == "worker":
        model = find_class_by_name(FLAGS.model,
                                   [frame_level_models, video_level_models])()
        reader = get_reader(FLAGS.feature_names, FLAGS.feature_sizes,
                            FLAGS.frame_features)
        model_exporter = ModelExporter(frame_features=FLAGS.frame_features,
                                       model=model,
                                       reader=reader)
        Trainer(cluster, task, FLAGS.train_dir, model, reader, model_exporter,
                FLAGS.log_device_placement, FLAGS.export_model_steps,
                FLAGS.max_steps).run(start_new_model=FLAGS.start_new_model)
    elif task.type == "ps":
        ParameterServer(cluster, task).run()
    else:
        raise ValueError(f"{str_task(task)}: Invalid task_type: {task.type}.")
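# Hedged illustration (not part of the script above): the TF_CONFIG environment variable
# that main() parses is the standard TensorFlow cluster description. The job names and
# addresses below are hypothetical; the "task" entry selects which role this process plays
# and falls back to {"type": "master", "index": 0} when TF_CONFIG is absent.
import json
import os

example_tf_config = {
    "cluster": {
        "master": ["localhost:2222"],
        "worker": ["localhost:2223", "localhost:2224"],
        "ps": ["localhost:2225"],
    },
    # This process would run as the first worker.
    "task": {"type": "worker", "index": 0},
}
os.environ["TF_CONFIG"] = json.dumps(example_tf_config)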
def main(unused_argv): """ Training. init_learning_rate: Initial learning rate. decay_steps: How many training steps to decay learning rate once. decay_rate: How much to decay learning rate. l2_reg_rate: l2 regularization rate. epochs: The maximal epochs to pass all training data. """ logging.set_verbosity(logging.INFO) output_dir = FLAGS.output_dir start_new_model = FLAGS.start_new_model init_learning_rate = FLAGS.init_learning_rate decay_steps = FLAGS.decay_steps decay_rate = FLAGS.decay_rate l2_reg_rate = FLAGS.l2_reg_rate train_epochs = FLAGS.train_epochs model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes reader = get_reader(model_type, feature_names, feature_sizes) train_data_pattern = FLAGS.train_data_pattern validate_data_pattern = FLAGS.validate_data_pattern batch_size = FLAGS.batch_size num_readers = FLAGS.num_readers init_with_linear_clf = FLAGS.init_with_linear_clf is_bootstrap = FLAGS.is_bootstrap # Increase num_readers. validate_data_pipeline = DataPipeline(reader=reader, data_pattern=validate_data_pattern, batch_size=batch_size, num_readers=num_readers) if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')): with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f: validate_data = pickle.load(f) with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f: validate_labels = pickle.load(f) else: # Sample validate set for line search in linear classifier or logistic regression early stopping. _, validate_data, validate_labels, _ = random_sample( 0.05, mask=(False, True, True, False), data_pipeline=validate_data_pipeline) with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f: pickle.dump(validate_data, f) with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f: pickle.dump(validate_labels, f) start_new_model = start_new_model or (not tf.gfile.Exists(output_dir)) # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers. try: # Load sum_labels in training set, numpy float format to compute pos_weights. train_sum_labels = load_sum_labels() # num_neg / num_pos, assuming neg_weights === 1.0. pos_weights = np.sqrt( (float(NUM_TRAIN_EXAMPLES) - train_sum_labels) / train_sum_labels) logging.info( 'Computing pos_weights based on sum_labels in train set successfully.' ) except IOError: logging.error('Cannot load train sum_labels. Use default value.') pos_weights = None finally: logging.error('Disable pos_weights.') # Set it as None to disable pos_weights. pos_weights = None train_data_pipeline = DataPipeline(reader=reader, data_pattern=train_data_pattern, batch_size=batch_size, num_readers=num_readers) if start_new_model: # Load train data mean and std. train_features_mean, train_features_var = load_features_mean_var( reader) tr_data_fn = standard_scale tr_data_paras = { 'mean': train_features_mean, 'variance': train_features_var, 'reshape': False, 'size': None } if init_with_linear_clf: # ...Start linear classifier... # Compute weights and biases of linear classifier using normal equation. # Linear search helps little. 
linear_clf = LinearClassifier( logdir=path_join(output_dir, 'linear_classifier')) linear_clf.fit(data_pipeline=train_data_pipeline, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras, l2_regs=[ 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0 ], validate_set=(validate_data, validate_labels), line_search=True) linear_clf_weights, linear_clf_biases = linear_clf.weights, linear_clf.biases logging.info( 'linear classifier weights and biases with shape {}, {}'. format(linear_clf_weights.shape, linear_clf_biases.shape)) logging.debug( 'linear classifier weights and {} biases: {}.'.format( linear_clf_weights, linear_clf_biases)) # ...Exit linear classifier... else: linear_clf_weights, linear_clf_biases = None, None else: linear_clf_weights, linear_clf_biases = None, None tr_data_fn = None tr_data_paras = None # Run logistic regression. log_reg = LogisticRegression(logdir=path_join(output_dir, 'log_reg')) log_reg.fit(train_data_pipeline, start_new_model=start_new_model, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras, validate_set=(validate_data, validate_labels), validate_fn=gap_fn, bootstrap=is_bootstrap, init_learning_rate=init_learning_rate, decay_steps=decay_steps, decay_rate=decay_rate, epochs=train_epochs, l2_reg_rate=l2_reg_rate, pos_weights=pos_weights, initial_weights=linear_clf_weights, initial_biases=linear_clf_biases)
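# Minimal sketch with assumed values (not taken from the training set): the pos_weights
# computed in main() above are sqrt(num_neg / num_pos) per class, derived from the
# per-class label sums. Rare classes get up-weighted in the one-vs-all cross-entropy loss.
# Note that the finally clause above then overrides pos_weights to None, disabling them.
import numpy as np

num_train_examples = 1000.0                        # assumed total number of examples
train_sum_labels = np.array([10.0, 250.0, 900.0])  # assumed per-class positive counts
pos_weights = np.sqrt((num_train_examples - train_sum_labels) / train_sum_labels)
print(pos_weights)  # approximately [9.95, 1.73, 0.33]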
def compute_prior_posterior_prob(k_list=[8], smooth_para=1.0, opt_hyper_para=False):
    if (not opt_hyper_para) and (len(k_list) != 1):
        raise ValueError('Only one k is needed. Check your argument.')

    model_dir = FLAGS.model_dir

    model_type, feature_names, feature_sizes = (FLAGS.model_type,
                                                FLAGS.feature_names,
                                                FLAGS.feature_sizes)
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    # Step 1. Compute prior probabilities and store the results.
    start_time = time.time()
    sum_labels, accum_num_videos, labels_prior_prob = compute_prior_prob(
        train_data_pipeline, smooth_para=smooth_para)
    logging.info('Computing prior probability took {} s.'.format(
        time.time() - start_time))
    save_prior_prob(sum_labels, accum_num_videos, labels_prior_prob, model_dir)

    # Step 2. Compute posterior probabilities, actually likelihood function or sampling distribution.
    # Total number of classes.
    num_classes = reader.num_classes
    range_num_classes = range(num_classes)
    max_k = max(k_list)

    # For each possible class, define a count and counter_count to count.
    # Compute the posterior probability, namely, given a label l, count the number of
    # training examples that have exactly j (0 <= j <= k) nearest neighbors that have
    # label l and normalize it. Here, j is considered as a random variable.
    count_list, counter_count_list = [], []
    for k in k_list:
        count_list.append(np.zeros([k + 1, num_classes], dtype=np.float32))
        counter_count_list.append(np.zeros([k + 1, num_classes], dtype=np.float32))

    with tf.Graph().as_default() as g:
        global_step = tf.Variable(0, trainable=False, dtype=tf.int64,
                                  name='global_step')
        global_step_inc_op = global_step.assign_add(1)

        video_id_batch, video_batch, video_labels_batch, num_frames_batch = (
            get_input_data_tensors(train_data_pipeline, num_epochs=1,
                                   name_scope='outer_loop'))

        tf.summary.scalar('global_step', global_step)
        summary_op = tf.summary.merge_all()
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        sess = tf.Session(graph=g)
        sess.run(init_op)

        writer = tf.summary.FileWriter(model_dir, graph=sess.graph)

        inner_reader = get_reader(model_type, feature_names, feature_sizes)

        # Be cautious to not be blocked by queue.
        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        tol_num_examples_processed = 0
        try:
            while not coord.should_stop():
                # Run training steps or whatever.
                start_time = time.time()
                video_id_batch_val, video_batch_val, video_labels_batch_val, global_step_val, summary = sess.run(
                    [video_id_batch, video_batch, video_labels_batch,
                     global_step_inc_op, summary_op])
                writer.add_summary(summary, global_step=global_step_val)

                logging.info('video_id_batch shape: {}, video_batch shape: {}'.format(
                    video_id_batch_val.shape, video_batch_val.shape))

                # Smaller batch size and fewer readers.
                _train_data_pipeline = DataPipeline(reader=inner_reader,
                                                    data_pattern=train_data_pattern,
                                                    batch_size=batch_size,
                                                    num_readers=num_readers)

                # Pass values instead of tensors.
                top_max_k_video_ids, top_max_k_labels = find_k_nearest_neighbors(
                    video_id_batch_val, video_batch_val, _train_data_pipeline,
                    is_train=True, k=max_k)
                logging.info('Finding k nearest neighbors needs {} s.'.format(
                    time.time() - start_time))
                # logging.debug('topk_video_ids: {}\ntopk_labels: {}'.format(topk_video_ids, topk_labels))

                # Update count_list and counter_count_list.
                for idx, k in enumerate(k_list):
                    topk_labels = top_max_k_labels[:, :k]
                    # batch_size * delta.
                    deltas = topk_labels.astype(np.int32).sum(axis=1)

                    # Update count and counter_count for each example.
                    for delta, video_labels_val in zip(deltas, video_labels_batch_val):
                        inc = video_labels_val.astype(np.float32)
                        count_list[idx][delta, range_num_classes] += inc
                        counter_count_list[idx][delta, range_num_classes] += 1 - inc
                    # logging.debug('count: {}\ncounter_count: {}'.format(count_list[idx], counter_count_list[idx]))

                now = time.time()
                tol_num_examples_processed += video_id_batch_val.shape[0]
                logging.info('Batch processing step {}, elapsed {} s, processed {} examples in total'.format(
                    global_step_val, now - start_time, tol_num_examples_processed))

                # Save results regularly.
                if global_step_val % 4 == 0:
                    # Save model parameters.
                    for k, count, counter_count in zip(k_list, count_list, counter_count_list):
                        # Compute posterior probabilities.
                        pos_prob_positive = (smooth_para + count) / (
                            smooth_para * (k + 1) + count.sum(axis=0))
                        pos_prob_negative = (smooth_para + counter_count) / (
                            smooth_para * (k + 1) + counter_count.sum(axis=0))

                        # Write to files for future use.
                        save_posterior_prob(count, counter_count, pos_prob_positive,
                                            pos_prob_negative, k, model_dir)
        except tf.errors.OutOfRangeError:
            logging.info('Done training -- one epoch limit reached.')
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)
        sess.close()

    # Save model parameters after passing all examples.
    for k, count, counter_count in zip(k_list, count_list, counter_count_list):
        # Compute posterior probabilities.
        pos_prob_positive = (smooth_para + count) / (
            smooth_para * (k + 1) + count.sum(axis=0))
        pos_prob_negative = (smooth_para + counter_count) / (
            smooth_para * (k + 1) + counter_count.sum(axis=0))

        # Write to files for future use.
        save_posterior_prob(count, counter_count, pos_prob_positive,
                            pos_prob_negative, k, model_dir)

    # Output the best k on the validate set.
    if opt_hyper_para:
        validate_data_pattern = FLAGS.validate_data_pattern
        validate_data_pipeline = DataPipeline(reader=reader,
                                              data_pattern=validate_data_pattern,
                                              batch_size=batch_size,
                                              num_readers=num_readers)

        _, validate_data, validate_labels, _ = random_sample(
            0.05, mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline)

        best_k = None
        best_validate_gap = np.NINF
        for k in k_list:
            pred_obj = Predict(train_data_pipeline, model_dir, k=k)

            num_validate_videos = validate_data.shape[0]
            split_indices = np.linspace(
                0, num_validate_videos + 1,
                num=max(num_validate_videos // batch_size + 1, 2),
                dtype=np.int32)
            validate_gaps = []
            for i in range(len(split_indices) - 1):
                start_ind = split_indices[i]
                end_ind = split_indices[i + 1]
                ith_predictions = pred_obj.make_batch_predictions(
                    None, validate_data[start_ind:end_ind])
                ith_validate_gap = gap_fn(ith_predictions,
                                          validate_labels[start_ind:end_ind])
                validate_gaps.append(ith_validate_gap * (end_ind - start_ind))

            validate_gap = sum(validate_gaps) / num_validate_videos
            logging.info('k: {}, validate gap: {}'.format(k, validate_gap))

            if validate_gap > best_validate_gap:
                best_k = k
                best_validate_gap = validate_gap

        print('Best k: {}, with validate gap {}'.format(best_k, best_validate_gap))
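# Toy illustration with synthetic counts (not real statistics): the posterior saved above is
# a Laplace-smoothed estimate of P(j of the k nearest neighbors carry label l | l is present),
# with j ranging over 0..k. For a single class with k = 2 and smooth_para = 1.0:
import numpy as np

k = 2
smooth_para = 1.0
# count[j] = number of positive examples whose k nearest neighbors contained the label j times.
count = np.array([5.0, 30.0, 65.0])  # assumed counts for j = 0, 1, 2
pos_prob_positive = (smooth_para + count) / (smooth_para * (k + 1) + count.sum(axis=0))
print(pos_prob_positive)        # approximately [0.058, 0.301, 0.641]
print(pos_prob_positive.sum())  # sums to 1.0 by construction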
inputlabels = list(map(readers.rmtxt, map(os.path.basename, args.inputfile)))
# if not len(inputlabels) == len(args.inputfile):
#     raise ValueError("Number of labels must match number of inputfiles.")

embeddings_list = []
labels = []
if args.onsimplex:
    embeddings_list_simplex = []

bool_vocabulary = args.vocabulary
words_set = None
if bool_vocabulary:
    reader = readers.get_reader(args.inputfile[0])
    (dictionary_size, dictionary, reversed_dictionary) = \
        reader.read_dictionary(args.inputfile[0])
    words_set = set(dictionary.keys())

for inputname, inputlabel in zip(args.inputfile, inputlabels):
    print("processing %s\n" % inputname)
    reader = readers.get_reader(inputname)
    dictionary_size, dictionary, reversed_dictionary, u_embeddings, v_embeddings = \
        reader.read_embeddings(inputname, vecsize, consideronlyfirstvec,
                               words_set=words_set)

    outputbasename = readers.rmtxt(os.path.basename(inputname))
    outputpcafolder, outputdistsfolder, outputdistribfolder = make_folders(inputlabel)
    outputdistsbasename = outputdistsfolder + '/' + outputbasename
def main(unused_argv): """ Train the rbf network. """ logging.set_verbosity(logging.INFO) start_new_model = FLAGS.start_new_model output_dir = FLAGS.output_dir # The ratio of examples to sample as centers (prototypes). num_centers_ratio = FLAGS.num_centers_ratio model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes reader = get_reader(model_type, feature_names, feature_sizes) train_data_pattern = FLAGS.train_data_pattern validate_data_pattern = FLAGS.validate_data_pattern batch_size = FLAGS.batch_size num_readers = FLAGS.num_readers # distance metric, cosine or euclidean. dist_metric = FLAGS.dist_metric init_with_linear_clf = FLAGS.init_with_linear_clf init_learning_rate = FLAGS.init_learning_rate decay_steps = FLAGS.decay_steps decay_rate = FLAGS.decay_rate train_epochs = FLAGS.train_epochs l1_reg_rate = FLAGS.l1_reg_rate l2_reg_rate = FLAGS.l2_reg_rate # ....Start rbf network... logging.info('Entering rbf network...') # Validate set is not stored in graph or meta data. Re-create it any way. # Sample validate set for logistic regression early stopping. validate_data_pipeline = DataPipeline(reader=reader, data_pattern=validate_data_pattern, batch_size=batch_size, num_readers=num_readers) if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')): with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f: validate_data = pickle.load(f) with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f: validate_labels = pickle.load(f) else: # Sample validate set. _, validate_data, validate_labels, _ = random_sample( 0.05, mask=(False, True, True, False), data_pipeline=validate_data_pipeline, name_scope='sample_validate') with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f: pickle.dump(validate_data, f) with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f: pickle.dump(validate_labels, f) # DataPipeline consists of reader, batch size, no. of readers and data pattern. train_data_pipeline = DataPipeline(reader=reader, data_pattern=train_data_pattern, batch_size=batch_size, num_readers=num_readers) # If start a new model or output dir does not exist, truly start a new model. start_new_model = start_new_model or (not tf.gfile.Exists(output_dir)) if start_new_model: # PHASE ONE - selecting prototypes c, computing scaling factors sigma. # num_centers = FLAGS.num_centers # num_centers_ratio = float(num_centers) / NUM_TRAIN_EXAMPLES # metric is euclidean or cosine. If cosine, alpha=1.0, otherwise can be less than 1.0. if 'cosine' == dist_metric: # 200 will lead to decreasing drastically and increasing slowly. alpha = 1.0 else: alpha = 1.0 centers, sigmas = initialize(num_centers_ratio, data_pipeline=train_data_pipeline, method='kmeans', metric=dist_metric, scaling_method=4, alpha=alpha) # PHASE TWO - computing linear regression weights and biases. num_centers = centers.shape[0] # Compute mean and variance after data transform. tr_data_fn = rbf_transform tr_data_paras = { 'centers': centers, 'sigmas': sigmas, 'metric': dist_metric, 'reshape': True, 'size': num_centers } """ # Include standard scale to rbf transform. tr_data_mean, tr_data_var = compute_data_mean_var(train_data_pipeline, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras) logging.debug('tr_data_mean: {}\ntr_data_var: {}'.format(tr_data_mean, tr_data_var)) tr_data_paras.update({'mean': tr_data_mean, 'variance': tr_data_var}) """ if init_with_linear_clf: # Call linear classification to get a good initial values of weights and biases. 
linear_clf = LinearClassifier( logdir=path_join(output_dir, 'linear_classifier')) linear_clf.fit(data_pipeline=train_data_pipeline, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras, l2_regs=[ 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0 ], validate_set=(validate_data, validate_labels), line_search=True) linear_clf_weights, linear_clf_biases = linear_clf.weights, linear_clf.biases else: linear_clf_weights, linear_clf_biases = None, None # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers. try: # Load sum_labels in training set, numpy float format to compute pos_weights. train_sum_labels = load_sum_labels() # num_neg / num_pos, assuming neg_weights === 1.0. pos_weights = np.sqrt( float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0) logging.info( 'Computing pos_weights based on sum_labels in train set successfully.' ) except IOError: logging.error('Cannot load train sum_labels. Use default value.') pos_weights = None finally: pos_weights = None else: linear_clf_weights, linear_clf_biases = None, None tr_data_fn, tr_data_paras = None, None pos_weights = None # PHASE THREE - fine tuning prototypes c, scaling factors sigma and weights and biases. log_reg_clf = LogisticRegression(logdir=path_join(output_dir, 'log_reg')) log_reg_clf.fit(train_data_pipeline=train_data_pipeline, start_new_model=start_new_model, tr_data_fn=tr_data_fn, tr_data_paras=tr_data_paras, validate_set=(validate_data, validate_labels), validate_fn=gap_fn, init_learning_rate=init_learning_rate, decay_steps=decay_steps, decay_rate=decay_rate, epochs=train_epochs, l1_reg_rate=l1_reg_rate, l2_reg_rate=l2_reg_rate, pos_weights=pos_weights, initial_weights=linear_clf_weights, initial_biases=linear_clf_biases) # ....Exit rbf network... logging.info('Exit rbf network.')
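# Minimal sketch of the kind of feature map that rbf_transform stands for. The real
# implementation, its signature, and its scaling method are defined elsewhere in this repo;
# the Gaussian form below is an assumption for illustration only. Each example is
# re-expressed as its similarity to the selected centers (prototypes) before the linear
# and logistic layers are fit.
import numpy as np

def gaussian_rbf_features(x, centers, sigmas):
    """Map a batch x of shape [n, d] to [n, num_centers] Gaussian RBF activations."""
    # Squared euclidean distance from every example to every center.
    sq_dists = ((x[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)
    return np.exp(-sq_dists / (2.0 * sigmas[None, :] ** 2))

x = np.random.rand(4, 8)        # 4 examples, 8 features (toy sizes)
centers = np.random.rand(3, 8)  # 3 prototypes
sigmas = np.ones(3)             # per-center scaling factors
print(gaussian_rbf_features(x, centers, sigmas).shape)  # (4, 3)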
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    start_new_model = FLAGS.start_new_model
    output_dir = FLAGS.output_dir

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    l1_reg_rate = FLAGS.l1_reg_rate
    l2_reg_rate = FLAGS.l2_reg_rate
    is_bootstrap = FLAGS.is_bootstrap
    train_epochs = FLAGS.train_epochs

    model_type, feature_names, feature_sizes = (FLAGS.model_type,
                                                FLAGS.feature_names,
                                                FLAGS.feature_sizes)
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)
        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Increase num_readers.
        validate_data_pipeline = DataPipeline(reader=reader,
                                              data_pattern=validate_data_pattern,
                                              batch_size=batch_size,
                                              num_readers=num_readers)

        # Sample validate set.
        _, validate_data, validate_labels, _ = random_sample(
            0.05, mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline,
            name_scope='sample_validate')
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)
        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    model_save_path = path_join(output_dir, 'mlp_fuse')
    if start_new_model and tf.gfile.Exists(model_save_path):
        logging.info('Starting a new model...')
        # Start new model, delete existing checkpoints.
        try:
            tf.gfile.DeleteRecursively(model_save_path)
        except tf.errors.OpError:
            logging.error('Failed to delete dir {}.'.format(model_save_path))
        else:
            logging.info('Succeeded to delete train dir {}.'.format(model_save_path))

    # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers.
    try:
        # Load sum_labels in training set, numpy float format, to compute pos_weights.
        train_sum_labels = load_sum_labels()
        # num_neg / num_pos, assuming neg_weights === 1.0.
        pos_weights = np.sqrt(float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0)
        logging.info('Computed pos_weights based on sum_labels in train set successfully.')
    except IOError:
        logging.error('Cannot load train sum_labels. Use default value.')
        pos_weights = None
    finally:
        logging.warn('Not to use positive weights.')
        pos_weights = None

    train(train_data_pipeline,
          epochs=train_epochs,
          pos_weights=pos_weights,
          l1_reg_rate=l1_reg_rate,
          l2_reg_rate=l2_reg_rate,
          init_learning_rate=init_learning_rate,
          bootstrap=is_bootstrap,
          validate_set=(validate_data, validate_labels),
          validate_fn=gap_fn,
          logdir=model_save_path)
outputfolder = args.outputfolder
if outputfolder.endswith('/'):
    outputfolder = outputfolder[:-1]
outputdatafolder = outputfolder + '/' + inputlabel
os.makedirs(outputdatafolder, exist_ok=True)

tolerance = args.tolerance
tolstr = ''
if tolerance:
    tolstr = '-tol'

analogiesoutputname = outputdatafolder + '/analogies' + tolstr + '.txt'
analogieslogger = init_logger(analogiesoutputname)

reader = readers.get_reader("glove")
words_set = None
if args.vocabulary:
    (dictionary_size, dictionary, reversed_dictionary) = \
        reader.read_dictionary(args.vocabulary)
    words_set = set(dictionary.keys())

# consideronlyfirstvec = None
# if args.subcommand == 'linear':
consideronlyfirstvec = True
# elif args.subcommand == 'sphere':
#     consideronlyfirstvec = False

# u_biases and v_biases are not returned at the moment, since it is unclear how they
# would be used when evaluating word analogies.
dictionary_size, dictionary, reversed_dictionary, u_embeddings, v_embeddings = \
    reader.read_embeddings(args.inputfile, args.vecsize, consideronlyfirstvec,
                           words_set=words_set)