def train(run_data: RunData):
    """Run the full train/eval loop for a single run and log progress."""
    run_estimator = create_estimator(run_data)
    start_msg = 'Starting train - eval loop excluding data classes: {}'.format(
        config[consts.EXCLUDED_KEYS])
    utils.log(start_msg)
    in_memory_train_eval(run_estimator, run_data.model)
    done_msg = 'Finished training with model: {}'.format(
        run_data.model.summary)
    utils.lognl(done_msg)
def _create_dataset(dataset_spec: DatasetSpec) -> Path:
    """Build a new dataset described by `dataset_spec` and persist it.

    Returns the parent directory of the saved tfrecord data.
    """
    utils.log("Creating new dataset: {}".format(dataset_spec))
    dir_name = filenames.create_dataset_directory_name(dataset_spec)
    if dataset_spec.paired:
        build = creating_paired_data.create_paired_data
    else:
        build = creating_unpaired_data.create_unpaired_data
    features, labels = build(dataset_spec)
    saved_path = save_to_tfrecord(features, labels, dir_name, dataset_spec)
    utils.log("Dataset saved into {}".format(saved_path))
    return saved_path.parent
def eval_with_excludes_input_fn(self) -> Dataset:
    """Supply a TEST dataset that also keeps the normally excluded elements."""
    utils.log('Creating eval_input_fn with excluded elements')
    spec = DatasetSpec(raw_data_provider=self.raw_data_provider,
                       type=DatasetType.TEST,
                       with_excludes=True,
                       encoding=self.is_encoded())
    return self.supply_dataset(dataset_spec=spec,
                               batch_size=self.calculate_batch_size())
def eval_input_fn(self) -> Dataset:
    """Supply the standard TEST dataset (excluded elements filtered out)."""
    utils.log('Creating eval_input_fn')
    spec = DatasetSpec(raw_data_provider=self.raw_data_provider,
                       type=DatasetType.TEST,
                       with_excludes=False,
                       encoding=self.is_encoded())
    return self.supply_dataset(dataset_spec=spec,
                               batch_size=self.calculate_batch_size())
def prepare_infer_env(run_data: RunData):
    """Set params, logging handlers and sanity checks before inference."""
    config.update_model_params(run_data.model.params)
    config.update_launcher_params(run_data.launcher_params)
    infer_dir = filenames.get_infer_dir(run_data)
    log_name = filenames.create_infer_log_name(run_data.model)
    _set_logging_handlers([infer_dir / log_name])
    utils.log("Inference data will be saved into: {}".format(infer_dir))
    _check_model_checkpoint_existence(run_data)
    _log_inference_model(run_data)
def copy_text_log(run_data):
    """Copy the newest text log of this run into its inference directory."""
    infer_dir = filenames.get_infer_dir(run_data)
    text_logs_dir = filenames.get_run_text_logs_dir(run_data)
    if not text_logs_dir.exists():
        utils.log(
            "{} not exists - not copying text log".format(text_logs_dir))
        return
    newest_log = find_latest_ing_dir(text_logs_dir)
    import shutil
    destination = infer_dir / newest_log.name
    shutil.copy(str(newest_log.absolute()), str(destination.absolute()))
def create_estimator(run_data: RunData):
    """Build a tf.estimator.Estimator configured for the run's model."""
    model = run_data.model
    utils.log('Creating estimator from model: {}'.format(model.summary))
    model_dir = str(filenames.get_run_logs_data_dir(run_data))
    params = model.params
    params[consts.MODEL_DIR] = model_dir
    # Keep only the latest checkpoint; save one every 30 minutes.
    run_config = tf.estimator.RunConfig(keep_checkpoint_max=1,
                                        save_checkpoints_secs=60 * 30)
    return tf.estimator.Estimator(model_fn=model.get_model_fn(),
                                  model_dir=model_dir,
                                  config=run_config,
                                  params=params)
def train_input_fn(self) -> Dataset:
    """Supply the shuffled, batched, endlessly repeated TRAIN dataset."""
    utils.log('Creating train_input_fn')
    spec = DatasetSpec(raw_data_provider=self.raw_data_provider,
                       type=DatasetType.TRAIN,
                       with_excludes=False,
                       encoding=self.is_encoded(),
                       paired=self.is_train_paired())
    return self.supply_dataset(
        dataset_spec=spec,
        shuffle_buffer_size=config[consts.SHUFFLE_BUFFER_SIZE],
        batch_size=config[consts.BATCH_SIZE],
        repeat=True)
def infer(self, take_num: int) -> Dataset:
    """Supply one non-repeating batch of `take_num` TEST elements,
    keeping the normally excluded keys."""
    utils.log('Creating infer_fn')
    spec = DatasetSpec(raw_data_provider=self.raw_data_provider,
                       type=DatasetType.TEST,
                       with_excludes=True,
                       encoding=self.is_encoded())
    return self.supply_dataset(
        dataset_spec=spec,
        batch_size=take_num,
        repeat=False,
        shuffle_buffer_size=config[consts.SHUFFLE_BUFFER_SIZE],
        prefetch=False,
        take_num=take_num)
def find_or_create_dataset_dir(dataset_spec: DatasetSpec) -> Path:
    """Return an existing non-empty dataset dir matching the spec, or
    build a brand-new dataset when none is found."""
    processed_datasets_dir: Path = filenames.get_processed_input_data_dir(
        dataset_spec)
    utils.log("Searching for dataset: {} in {}".format(
        dataset_spec, processed_datasets_dir))
    if processed_datasets_dir.exists():
        matches = get_dataset_dir_matcher_fn(dataset_spec)
        for folder in processed_datasets_dir.glob('*'):
            if not matches(folder.name) or not not_empty(folder):
                continue
            utils.log("Dataset found: {} full path: {}".format(
                folder.name, folder.resolve()))
            return folder
    return _create_dataset(dataset_spec)
def build_dataset(self, dataset_spec: DatasetSpec):
    """Load raw data, index it by label, and build the dataset for the spec.

    TRAIN datasets come from the generator path; everything else is read
    back from tfrecord files.
    """
    utils.log('Creating generator for: {}'.format(dataset_spec))
    self.excludes = [] if dataset_spec.with_excludes else config[
        consts.EXCLUDED_KEYS]
    self.raw_images, self.raw_labels = raw_data.get_raw_data(dataset_spec)
    self.unique_labels = np.unique(self.raw_labels)
    self.label_to_idxs_mapping = {
        label: np.flatnonzero(self.raw_labels == label)
        for label in self.unique_labels
    }
    if dataset_spec.type is DatasetType.TRAIN:
        return self.build_from_generator(dataset_spec)
    return build_from_tfrecord(dataset_spec)
def save_to_tfrecord(data_dict: Dict[str, np.ndarray],
                     data_labels: Dict[str, np.ndarray], path: Path,
                     dataset_spec: DatasetSpec):
    """Serialize features and labels into a .tfrecord file at `path`.

    Args:
        data_dict: feature arrays keyed by feature name.
        data_labels: label arrays keyed by label name.
        path: target .tfrecord file; parent dirs are created as needed.
        dataset_spec: determines which saver implementation is used.

    Raises:
        NotImplementedError: when the spec's storage method has no saver.
    """
    utils.log('Saving .tfrecord file: {} using spec: {}'.format(
        path, dataset_spec))
    path.parent.mkdir(parents=True, exist_ok=True)
    storage_method = dataset_spec.raw_data_provider.description.storage_method
    if storage_method == DatasetStorageMethod.IN_MEMORY:
        saver = get_from_memory_saver(dataset_spec)
    elif storage_method == DatasetStorageMethod.ON_DISC:
        saver = get_from_disc_saver(dataset_spec)
    else:
        # Bug fix: the message was passed as two positional args to
        # NotImplementedError instead of being formatted into one string.
        raise NotImplementedError(
            "Storage method {} is not implemented".format(storage_method))
    # NOTE(review): `saver` was already built from dataset_spec above, yet it
    # is invoked with dataset_spec again — confirm the saver factory really
    # expects the spec twice.
    saver(dataset_spec).save(data_dict, data_labels, path)
def assemble_dataset(input_data_dir: Path,
                     dataset_spec: DatasetSpec) -> TFRecordDataset:
    """Read the .tfrecord file(s) in `input_data_dir` into a decoded dataset.

    Args:
        input_data_dir: directory containing tfrecord shards.
        dataset_spec: used to pick the matching record decoder.
    """

    def all_names_in_dir(directory):
        # Bug fix: only the first entry ([0]) was returned even though the
        # helper's name, the log message and TFRecordDataset all handle a
        # list of files; also renamed the param that shadowed builtin `dir`.
        return [str(f) for f in directory.iterdir()]

    files_to_assemble = all_names_in_dir(input_data_dir)
    utils.log(
        'Assembling dataset from .tfrecord file(s): {}, dataset spec: {}'.
        format(files_to_assemble, dataset_spec))
    dataset = tf.data.TFRecordDataset(filenames=files_to_assemble)
    reader = resolve_reader(dataset_spec)
    dataset = dataset.map(reader.get_decode_op(), num_parallel_calls=64)
    return dataset
def determine_optimizer(optimizer_param: str, learning_rate: float):
    """Map a configured optimizer name onto a tf.train optimizer instance.

    Raises:
        ValueError: when the name matches no known optimizer.
    """
    utils.log("Creating optimizer: {}, with learning rate: {}".format(
        optimizer_param, learning_rate))
    if optimizer_param == consts.GRADIENT_DESCEND_OPTIMIZER:
        return tf.train.GradientDescentOptimizer(learning_rate)
    if optimizer_param == consts.MOMENTUM_OPTIMIZER:
        return tf.train.MomentumOptimizer(learning_rate, 0.99,
                                          use_nesterov=False)
    if optimizer_param == consts.NESTEROV_OPTIMIZER:
        return tf.train.MomentumOptimizer(learning_rate, 0.99,
                                          use_nesterov=True)
    if optimizer_param == consts.ADAM_OPTIMIZER:
        return tf.train.AdamOptimizer(learning_rate)
    raise ValueError("Unknown optimizer: {}".format(optimizer_param))
def buy(self):
    """Snipe cheap 'development' auction items (defId 5002006) in a loop.

    Repeatedly searches the transfer market, bids on every result, and
    stops once 50 items are won or free slots drop to 50.
    """
    client = self.client
    ok = True
    start = 0  # search pagination offset
    bid = 850  # base bid price in coins
    won_items = []
    available_slots = len(self.client.unassigned())
    while ok:
        items = client.searchAuctions('development',
                                      max_buy=bid,
                                      defId=5002006,
                                      fast=False,
                                      start=start)
        items = items[::-1]  # process results in reverse order
        if not len(items):
            # NOTE(review): `continue` skips the sleep(3) at the bottom, so
            # an empty search result triggers an immediate retry with no
            # delay — confirm that hammering the API is intended.
            print('No items dawg')
            start = 0
            continue
        start = start + len(items)
        miss = 0
        for item in items:
            state = client.bid(item['tradeId'], bid)
            if state:
                won_items.append(item['tradeId'])
                data = '%d, %d \n' % (bid, item['tradeId'])
                log(data, filename='./data/sniper.csv')
                print('Snipped %d for %d' % (item['tradeId'], bid))
                available_slots -= 1
            else:
                print('Lost it')
                bid = 900  # raise the bid after losing an auction
                miss += 1
                if miss > self.MAX_MISSES:
                    # Too many lost bids: skip past this page, reset bid.
                    # NOTE(review): start was already advanced above, so this
                    # double-advances past unseen items — confirm intended.
                    start += len(items)
                    bid = 850
                    break
        if len(won_items) >= 50 or available_slots <= 50:
            ok = False  # redundant with break, kept byte-identical
            break
        print('Still running')
        sleep(3)
def _prepare_dirs(deleted_old_exp_path: Union[None, Path], run_data: RunData):
    """Prepare run/log directories and print the tensorboard command.

    Args:
        deleted_old_exp_path: path of a previous experiment dir that was
            already removed before this call, or None when none existed.
        run_data: current run descriptor.
    """
    if run_data.is_experiment:
        if deleted_old_exp_path:
            utils.log(
                'Found not empty experiment dir from previous runs: {}'.format(
                    deleted_old_exp_path))
            utils.log(
                'Deleting old experiment dir: {}'.format(deleted_old_exp_path))
        else:
            launcher_dir = filenames.get_launcher_dir(run_data)
            # Bug fix: the message contained no '{}' placeholder, so the
            # trailing .format(launcher_dir) was dead code — removed it.
            utils.log('Experiment dir from previous runs not found.')
            utils.log('Creating experiment dir: {}'.format(launcher_dir))
    _prepare_runs_dir(run_data)
    _prepare_log_dir(run_data)
    launcher_dir = filenames.get_run_dir(run_data).parent
    utils.log(consts.TENSORBOARD_COMMAND.format(launcher_dir))
def sold(self):
    """Report how many tradepile items sold and log the after-tax revenue.

    Returns the number of sold items (0 when nothing sold).
    """
    tradepile = self.client.tradepile()
    closed = [t for t in tradepile if t['tradeState'] == TRADE_STATUS_CLOSED]
    sold = len(closed)
    bids = sum(t['currentBid'] for t in closed)
    if bids == 0:
        msg = 'Nothing sold at this point'
        print(msg)
        sendMessage(msg)
        return 0
    revenue = int(bids * EA_TAX_PERCENTAGE)
    msg = 'Sold %s items for %s coins' % (sold, revenue)
    csv = str(sold) + ',' + str(revenue) + '\n'
    sendMessage(msg)
    log(csv)
    return sold
def _create_unpaired_data(
        examples: np.ndarray, labels: np.ndarray, dataset_spec: DatasetSpec
) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
    """Shuffle examples/labels in unison, optionally dropping excluded keys.

    Returns ordered dicts ({FEATURES: ...}, {LABELS: ...}).
    """
    keys_to_drop = ([] if dataset_spec.with_excludes else
                    config[consts.EXCLUDED_KEYS])
    utils.log("Creating unpaired data excluding keys: " + str(keys_to_drop))
    examples = np.array(examples)
    labels = np.array(labels)
    if keys_to_drop:
        # Keep only rows whose label matches none of the dropped keys.
        keep_mask = np.logical_and.reduce(
            [labels != key for key in keys_to_drop])
        examples = examples[keep_mask]
        labels = labels[keep_mask]
    examples, labels = unison_shuffle(examples, labels)
    features_dict = collections.OrderedDict(
        {consts.FEATURES: np.array(examples)})
    labels_dict = collections.OrderedDict({consts.LABELS: np.array(labels)})
    return features_dict, labels_dict
def create_pair_summaries(run_data: RunData):
    """Write an image summary of a small batch of paired TEST samples.

    Builds a fresh graph, draws `batch_size` pairs from the TEST dataset,
    and saves a tf.summary image board under <run_logs_dir>/features.
    """
    # NOTE(review): named `_cls` but passed where an instance seems expected
    # (DatasetSpec arg, `.description` access) — confirm naming.
    dataset_provider_cls = run_data.model.raw_data_provider
    tf.reset_default_graph()
    batch_size = 10
    utils.log('Creating {} sample features summaries'.format(batch_size))
    dataset: tf.data.Dataset = run_data.model.dataset_provider.supply_dataset(
        dataset_spec=DatasetSpec(
            dataset_provider_cls,
            DatasetType.TEST,
            with_excludes=False,
            encoding=run_data.model.dataset_provider.is_encoded()),
        shuffle_buffer_size=10000,
        batch_size=batch_size,
        prefetch=False)
    iterator = dataset.make_one_shot_iterator()
    # Rebound to the (features, labels) tensors of a single batch.
    iterator = iterator.get_next()
    with tf.Session() as sess:
        left = iterator[0][consts.LEFT_FEATURE_IMAGE]
        right = iterator[0][consts.RIGHT_FEATURE_IMAGE]
        pair_labels = iterator[1][consts.PAIR_LABEL]
        left_labels = iterator[1][consts.LEFT_FEATURE_LABEL]
        right_labels = iterator[1][consts.RIGHT_FEATURE_LABEL]
        pairs_imgs_summary = create_pair_summary(
            left, right, pair_labels, left_labels, right_labels,
            dataset_provider_cls.description)
        # Return value unused on purpose: merge_all() below collects the
        # summary op from the graph.
        image_summary = tf.summary.image('paired_images',
                                         pairs_imgs_summary,
                                         max_outputs=batch_size)
        all_summaries = tf.summary.merge_all()
        dir = filenames.get_run_logs_data_dir(run_data) / 'features'
        dir.mkdir(exist_ok=True, parents=True)
        writer = tf.summary.FileWriter(str(dir), sess.graph)
        sess.run(tf.global_variables_initializer())
        summary = sess.run(all_summaries)
        writer.add_summary(summary)
        writer.flush()
def distributed_train_eval(mnist_estimator):
    """Alternate train and evaluate phases four times.

    Each round evaluates on the filtered test set; when keys are excluded,
    it additionally evaluates on the full (unfiltered) test set.
    """
    for _ in range(4):
        mnist_estimator.train(
            input_fn=lambda: supplying_datasets.train_input_fn(),
            steps=dataset_size // config[consts.BATCH_SIZE] *
            epochs_between_eval)
        if config[consts.EXCLUDED_KEYS]:
            eval_name = filenames.create_excluded_name_fragment()
        else:
            eval_name = None
        eval_results = mnist_estimator.evaluate(
            input_fn=lambda: supplying_datasets.eval_input_fn(),
            name=eval_name)
        utils.log('Evaluation results: {}'.format(eval_results))
        if config[consts.EXCLUDED_KEYS]:
            # Bug fix: was `eval_with_excludes_fn`, which does not match the
            # provider method `eval_with_excludes_input_fn` defined in this
            # codebase.
            eval_results = mnist_estimator.evaluate(
                input_fn=lambda: supplying_datasets.
                eval_with_excludes_input_fn(),
                name='full')
            utils.log('Evaluation results for whole dataset: {}'.format(
                eval_results))
def _check_model_checkpoint_existence(run_data: RunData):
    """Assert that a usable (non step-0) checkpoint exists for inference.

    Skipped entirely when IS_INFER_CHECKPOINT_OBLIGATORY is disabled.

    Raises:
        AssertionError: when the model dir or a real checkpoint is missing.
    """
    strict: bool = config[consts.IS_INFER_CHECKPOINT_OBLIGATORY]
    if not strict:
        utils.log("Not checking checkpoint existence")
        return
    model_dir = filenames.get_run_logs_data_dir(run_data)
    # Message typo fixes below: "does not exists", "No checkpoints exists",
    # "checkoint".
    assert model_dir.exists(), "{} does not exist - no model to load!".format(
        model_dir)
    checkpoints = model_dir.glob('*.ckpt-*')
    # Extract the "ckpt-<step>" fragments from checkpoint filenames.
    checkpoints_with_number = {
        x
        for y in checkpoints for x in str(y).split('.') if x.startswith("ckpt")
    }
    step_numbers = {int(x.split('-')[-1]) for x in checkpoints_with_number}
    assert bool(step_numbers), "No checkpoints exist!"
    assert len(
        step_numbers
    ) > 1 or 0 not in step_numbers, "Only one checkpoint - for 0th step exists!"
    utils.log("Checkpoint directory: ok, max checkpoint number: {}".format(
        max(step_numbers)))
def fake_cnn_model_fn(self, features, labels, mode, params):
    """Minimal paired-CNN model_fn for tests: two conv stacks -> dense(2).

    Handles PREDICT / EVAL / TRAIN modes and counts invocations via
    self.model_fn_calls.
    """
    # Typo fixed in log message ("wih" -> "with").
    utils.log('Creating graph with mode: {}'.format(mode))
    self.model_fn_calls += 1
    with tf.name_scope('left_cnn_stack'):
        flatten_left_stack = self.create_simple_cnn_layers(
            features[consts.LEFT_FEATURE_IMAGE])
    with tf.name_scope('right_cnn_stack'):
        flatten_right_stack = self.create_simple_cnn_layers(
            features[consts.RIGHT_FEATURE_IMAGE])
    flatted_concat = tf.concat(
        axis=1, values=[flatten_left_stack, flatten_right_stack])
    logits = tf.layers.dense(inputs=flatted_concat, units=2)
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    pair_labels = labels[consts.PAIR_LABEL]
    loss = tf.losses.sparse_softmax_cross_entropy(labels=pair_labels,
                                                  logits=logits)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Bug fix: determine_optimizer takes (optimizer_param, learning_rate)
        # — it was called with one argument and its result invoked with the
        # learning rate, matching neither its signature nor the other
        # model_fn in this codebase.
        optimizer = determine_optimizer(config[consts.OPTIMIZER],
                                        config[consts.LEARNING_RATE])
        train_op = optimizer.minimize(
            loss=loss, global_step=tf.train.get_or_create_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
def plot_predicted_data(run_data: RunData, dicts_dataset: DictsDataset,
                        predictions: Dict[str, np.ndarray], show):
    """Render inference plots into the run's inference directory.

    Always draws the pairs board; for models producing 2D embeddings it
    additionally draws distance and cluster plots.
    """
    model = run_data.model
    inference_dir = filenames.get_infer_dir(run_data)
    utils.log("Plotting pairs...")
    image_summaries.create_pairs_board(
        dataset=dicts_dataset,
        predicted_labels=model.get_predicted_labels(predictions),
        predicted_scores=model.get_predicted_scores(predictions),
        path=inference_dir /
        filenames.summary_to_name(model,
                                  suffix=consts.PNG,
                                  with_date_fragment=False,
                                  name=consts.INFER_PLOT_BOARD_NAME),
        show=show)
    if model.produces_2d_embedding:
        utils.log("Plotting distances...")
        x, y = map_pair_of_points_to_plot_data(
            predictions[consts.INFERENCE_LEFT_EMBEDDINGS],
            predictions[consts.INFERENCE_RIGHT_EMBEDDINGS])
        labels = dicts_dataset.labels
        image_summaries.create_distances_plot(
            left_coors=x,
            right_coors=y,
            labels_dict=labels,
            infer_result=predictions,
            path=inference_dir /
            filenames.summary_to_name(model,
                                      suffix=consts.PNG,
                                      with_date_fragment=False,
                                      name=consts.INFER_PLOT_DISTANCES_NAME),
            show=show)
        utils.log("Plotting clusters...")
        image_summaries.create_clusters_plot(
            feat=np.concatenate(
                (predictions[consts.INFERENCE_LEFT_EMBEDDINGS],
                 predictions[consts.INFERENCE_RIGHT_EMBEDDINGS])),
            labels=np.concatenate((labels.left, labels.right)),
            path=inference_dir /
            filenames.summary_to_name(
                model,
                suffix=consts.PNG,
                with_date_fragment=False,
                # Consistency fix: sibling plot names are referenced via
                # consts.*; the bare INFER_PLOT_CLUSTERS_NAME was most
                # likely a NameError waiting to happen.
                name=consts.INFER_PLOT_CLUSTERS_NAME),
            show=show)
def _prepare_log_dir(run_data: RunData):
    """Ensure the run's logs dir exists, optionally wiping an old model dir.

    An existing non-empty dir is deleted only when REMOVE_OLD_MODEL_DIR is
    enabled; otherwise it is kept and reused.
    """
    log_dir = filenames.get_run_logs_data_dir(run_data)
    if utils.check_filepath(filename=log_dir,
                            exists=True,
                            is_directory=True,
                            is_empty=False):
        utils.log(
            'Found not empty logs directory from previous runs: {}'.format(
                log_dir))
        if config[consts.REMOVE_OLD_MODEL_DIR]:
            utils.log('Deleting old model_dir: {}'.format(log_dir))
            shutil.rmtree(str(log_dir))
    else:
        utils.log(
            'Logs directory from previous runs not found. Creating new: {}'.
            format(log_dir))
    # Bug fix: exist_ok was False, so keeping an existing non-empty dir
    # (REMOVE_OLD_MODEL_DIR disabled) crashed here with FileExistsError.
    log_dir.mkdir(exist_ok=True, parents=True)
def after_run(run_data: RunData):
    """Log the tensorboard command pointing at this run's launcher dir."""
    parent_dir = filenames.get_run_dir(run_data).parent
    utils.log(consts.TENSORBOARD_COMMAND.format(parent_dir))
def triplet_batch_all_model_fn(self, features, labels, mode, params):
    """Estimator model_fn implementing batch-all triplet loss.

    Embeds the batch, splits it into left/right halves to derive pair
    distances and similarity predictions, and wires PREDICT / EVAL / TRAIN
    estimator specs.
    """
    # Typo fixed in log message ("wih" -> "with").
    utils.log('Creating graph with mode: {}'.format(mode))
    features = unpack_features(features, self.is_dataset_paired(mode))
    embeddings = self.conv_net(features)
    embedding_mean_norm = tf.reduce_mean(tf.norm(embeddings, axis=1))
    tf.summary.scalar("embedding_mean_norm", embedding_mean_norm)
    # First half of the stacked batch are left images, second half right.
    middle_idx = tf.cast(tf.shape(embeddings)[0] / 2, tf.int64)
    left_embeddings = embeddings[:middle_idx]
    right_embeddings = embeddings[middle_idx:]
    distances = calculate_distance(left_embeddings, right_embeddings)
    output = is_pair_similar(distances,
                             config[consts.PREDICT_SIMILARITY_MARGIN])
    predictions = {
        consts.INFERENCE_CLASSES: output,
        consts.INFERENCE_DISTANCES: distances,
        consts.INFERENCE_LEFT_EMBEDDINGS: left_embeddings,
        consts.INFERENCE_RIGHT_EMBEDDINGS: right_embeddings,
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    labels, pair_labels = unpack_labels(labels, self.is_dataset_paired(mode))
    loss, fraction_positive_triplets, num_positive_triplets, num_valid_triplets = batch_all_triplet_loss(
        labels, embeddings, margin=config[consts.HARD_TRIPLET_MARGIN])
    if mode == tf.estimator.ModeKeys.EVAL:
        accuracy_metric = tf.metrics.accuracy(
            labels=pair_labels,
            predictions=predictions[consts.INFERENCE_CLASSES],
            name='accuracy_metric')
        recall_metric = tf.metrics.recall(
            labels=pair_labels,
            predictions=predictions[consts.INFERENCE_CLASSES],
            name='recall_metric')
        precision_metric = tf.metrics.precision(
            labels=pair_labels,
            predictions=predictions[consts.INFERENCE_CLASSES],
            name='precision_metric')
        f1_metric = tf.contrib.metrics.f1_score(
            labels=pair_labels,
            predictions=predictions[consts.INFERENCE_CLASSES],
            name='f1_metric')
        # NOTE(review): the mean-distance metric is named with
        # consts.INFERENCE_CLASSES — looks like a copy-paste; kept as-is,
        # confirm intended.
        mean_metric = tf.metrics.mean(values=distances,
                                      name=consts.INFERENCE_CLASSES)
        eval_metric_ops = {
            consts.METRIC_ACCURACY: accuracy_metric,
            consts.METRIC_RECALL: recall_metric,
            consts.METRIC_PRECISION: precision_metric,
            consts.METRIC_F1: f1_metric,
            consts.METRIC_MEAN_DISTANCE: mean_metric,
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metric_ops)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = estimator_conv_model.determine_optimizer(
            config[consts.OPTIMIZER], config[consts.LEARNING_RATE])
        train_op = optimizer.minimize(
            loss=loss, global_step=tf.train.get_or_create_global_step())
        training_logging_hook_dict = {}
        if self.is_dataset_paired(mode):
            non_streaming_accuracy = estimator_conv_model.non_streaming_accuracy(
                tf.cast(tf.squeeze(predictions[consts.INFERENCE_CLASSES]),
                        tf.int32), tf.cast(pair_labels, tf.int32))
            tf.summary.scalar('accuracy', non_streaming_accuracy)
            training_logging_hook_dict.update(
                {"accuracy_logging": non_streaming_accuracy})
        non_streaming_distances = tf.reduce_mean(distances)
        tf.summary.scalar('mean_distance', non_streaming_distances)
        # NOTE(review): summary tag 'postitive_triplets' is misspelled; kept
        # byte-identical so existing dashboards keep working.
        tf.summary.scalar('postitive_triplets', num_positive_triplets)
        training_logging_hook_dict.update(
            {"distances_logging": non_streaming_distances})
        training_logging_hook_dict.update({
            "fraction_positive_triplets": fraction_positive_triplets,
            "num_positive_triplets": num_positive_triplets,
            "num_valid_triplets": num_valid_triplets,
        })
        logging_hook = tf.train.LoggingTensorHook(
            training_logging_hook_dict,
            every_n_iter=config[consts.TRAIN_LOG_STEPS_INTERVAL])
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=[logging_hook])
def _log_inference_model(run_data: RunData):
    """Log which model is used for inference, then dump the configuration."""
    message = "Initiate model for inference, name: {}, summary: {}".format(
        run_data.launcher_name, run_data.model.summary)
    utils.log(message)
    _log_config(run_data)
def _log_configuration(args: List[str], run_data: RunData):
    """Dump configuration plus leftover commandline args, split on '--'."""
    _log_config(run_data)
    flags, plain_args = [], []
    for arg in args:
        (flags if arg.startswith('--') else plain_args).append(arg)
    utils.log('Remainder commandline arguments: {}'.format(plain_args))
    utils.log('Undefined commandline flags: {}'.format(flags))
def _log_config(run_data: RunData):
    """Log every configuration layer plus the full combined summary."""
    utils.log('Code-defined params: {}'.format(config.file_defined_params))
    utils.log('Model params: {}'.format(config.model_params))
    utils.log('Launcher params: {}'.format(config.launcher_params))
    utils.log('Commandline flags: {}'.format(config.tf_flags))
    utils.log(config.pretty_full_dict_summary(run_data))
def _prepare_runs_dir(run_data: RunData):
    """Create the run directory when it does not yet exist."""
    run_dir = filenames.get_run_dir(run_data)
    if run_dir.exists():
        return
    utils.log('Creating directory for run: {}'.format(run_dir))
    run_dir.mkdir(parents=True, exist_ok=True)