def __init__(self, args: Namespace):
    self.args = args

    # Generate Run ID
    if len(self.args.run_id) > 0:
        self.run_id: str = self.args.run_id
    else:
        self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    self.start_time = time.time()

    self.logs_dir: str = os.path.join(self.args.logs_dir, self.run_id)

    # Setup logging
    file_io.make_directory(self.logs_dir, clear_dir=True, log=None)
    self.logger: Logger = self.setup_logging()
    self.logger.info("Logging initialized. Saving logs to: {}".format(self.logs_dir))
    self.logger.info("Run ID: {}".format(self.run_id))
    self.logger.info("CLI args: \n{}".format(json.dumps(vars(self.args)).replace(",", "\n")))

    # Setup directories
    self.models_dir: str = os.path.join(self.args.models_dir, self.run_id)
    self.data_dir: str = self.args.data_dir
    file_io.make_directory(self.models_dir, clear_dir=False, log=self.logger)

    # Read/Parse model config YAML
    self.model_config_file = self.args.model_config

    # Setup other arguments
    self.loss_key: str = self.args.loss_key
    self.optimizer_key: str = self.args.optimizer_key
    if self.args.metrics_keys[0] == "[":
        self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
    else:
        self.metrics_keys = [self.args.metrics_keys]
    self.data_format: str = self.args.data_format
    self.tfrecord_type: str = self.args.tfrecord_type

    # Validate args
    self.validate_args()

    # Set random seeds
    self.set_seeds()

    # Load and parse feature config
    self.feature_config: FeatureConfig = parse_config(
        tfrecord_type=self.tfrecord_type,
        feature_config=self.args.feature_config,
        logger=self.logger,
    )
    self.logger.info("Feature config parsed and loaded")

    # Finished initialization
    self.logger.info("Relevance Pipeline successfully initialized!")
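# Usage sketch for the constructor above (hedged): the owning pipeline class is
# not shown in this file, so `RelevancePipeline` is an assumption, and the
# Namespace field values below are placeholders chosen only to mirror the
# attributes the constructor reads.
from argparse import Namespace

example_args = Namespace(
    run_id="",                          # empty string -> auto-generated from hostname + timestamp
    logs_dir="logs",
    models_dir="models",
    data_dir="data/tfrecord",
    model_config="model_config.yaml",
    loss_key="rank_one_listnet",        # illustrative loss key
    optimizer_key="adam",               # illustrative optimizer key
    metrics_keys='["MRR"]',             # stringified list, parsed via ast.literal_eval above
    data_format="tfrecord",
    tfrecord_type="sequence_example",
    feature_config="feature_config.yaml",
)
# pipeline = RelevancePipeline(args=example_args)  # hypothetical class owning the __init__ above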
def get_ranking_dataset(self, data_dir: str, data_format: str, feature_config_path: str):
    feature_config: FeatureConfig = parse_config(
        tfrecord_type=self.args.tfrecord_type,
        feature_config=feature_config_path,
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        logger=self.logger,
    )

    return relevance_dataset
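# Hedged usage sketch: a hypothetical helper method (not part of the original
# class) showing how get_ranking_dataset might be called from a sibling test
# method; it reuses self.root_data_dir and self.feature_config_fname from the
# other helpers in this section.
def example_load_test_split(self):
    relevance_dataset = self.get_ranking_dataset(
        data_dir=os.path.join(self.root_data_dir, "tfrecord"),
        data_format="tfrecord",
        feature_config_path=os.path.join(self.root_data_dir, "tfrecord", self.feature_config_fname),
    )
    # The test split is what the run_default_pipeline helpers below feed to model.evaluate()
    return relevance_dataset.test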
def get_feature_config(self):
    feature_config_path = os.path.join(self.root_data_dir, "tfrecord", self.feature_config_fname)
    feature_config: FeatureConfig = parse_config(
        tfrecord_type=self.args.tfrecord_type,
        feature_config=feature_config_path,
        logger=self.logger,
    )
    return feature_config
def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
    """Train a model with the default set of args"""
    metrics_keys = ["MRR"]

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(123)
    tf.random.set_seed(123)
    random.seed(123)

    feature_config: FeatureConfig = parse_config(
        tfrecord_type=self.args.tfrecord_type,
        feature_config=feature_config_path,
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key, feature_config=feature_config, metrics_keys=metrics_keys
    )

    ranking_model.fit(dataset=relevance_dataset, num_epochs=1, models_dir=self.output_dir)

    loss = dict(
        zip(
            ranking_model.model.metrics_names,
            ranking_model.model.evaluate(relevance_dataset.test),
        )
    )["loss"]
    new_MRR = ranking_model.evaluate(
        test_dataset=relevance_dataset.test,
        logs_dir=self.args.logs_dir,
    )[0]["new_MRR"]

    return loss, new_MRR
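# The loss lookup above zips model.metrics_names with the list returned by
# model.evaluate(); a small hedged helper isolating that pattern (hypothetical,
# not part of the original file):
def evaluate_to_dict(keras_model, dataset):
    """Map Keras evaluate() outputs back to their metric names."""
    metric_values = keras_model.evaluate(dataset)
    return dict(zip(keras_model.metrics_names, metric_values))

# Illustrative usage: evaluate_to_dict(ranking_model.model, relevance_dataset.test)["loss"]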
def run_dataset_creation(
    data_dir: str = DATA_DIR,
    out_dir: str = OUT_DIR,
    feature_config: str = FEATURE_CONFIG,
    feature_highval: dict = FEATURE_HIGHVAL,
    feature_num_results: str = FEATURE_NUM_RESULTS,
    max_num_records: int = MAX_NUM_RECORDS,
    num_samples: int = NUM_SAMPLES,
    random_state: int = RANDOM_STATE,
):
    """
    1. Loads example data
    2. Builds specified synthetic data size by sampling from example data
    3. Adds catastrophic failures specifically
    4. For now, write out to CSV. In future could return df directly
    """
    # Setup logging
    logger: Logger = setup_logging()

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info('Set seeds with initial random state {}'.format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = parse_config(
            tfrecord_type='', feature_config=feature_config, logger=logger)
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir, log=logger)
        out_file = os.path.join(
            out_dir, 'synthetic_data_{}.csv'.format(dt.datetime.now().strftime('%Y%m%d-%H%M%S')))

        # Build data
        seed_data = load_seed_data(data_dir, logger)
        df_synthetic = fill_data(
            seed_data,
            max_num_records,
            feature_config,
            feature_highval,
            feature_num_results,
            num_samples,
            logger,
        )
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info('Synthetic data created! Location: {}'.format(out_file))

        return df_synthetic
    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(str(e)))
        traceback.print_exc()
        return
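# Hedged usage sketch for run_dataset_creation: DATA_DIR, OUT_DIR and the other
# defaults are module-level constants not shown here, so this illustration
# passes explicit placeholder values instead; the feature_highval contents are
# purely illustrative.
df_synthetic = run_dataset_creation(
    data_dir="data/csv",                            # seed CSVs to sample from
    out_dir="data/synthetic",                       # destination for the generated CSV
    feature_config="feature_config.yaml",
    feature_highval={"text_match_score": [0, 1]},   # hypothetical catastrophic-failure feature
    feature_num_results="num_results",
    max_num_records=50,
    num_samples=1000,
    random_state=123,
)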
def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
    """Train a model with the default set of args"""
    feature_config: FeatureConfig = parse_config(
        tfrecord_type=self.args.tfrecord_type,
        feature_config=feature_config_path,
        logger=self.logger,
    )

    data_dir = os.path.join(self.root_data_dir, "tfrecord")
    data_format = "tfrecord"
    metrics_keys = ["categorical_accuracy", "MRR", "ACR"]

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key, feature_config=feature_config, metrics_keys=metrics_keys
    )

    overall_metrics, _ = ranking_model.evaluate(
        test_dataset=relevance_dataset.test,
        logs_dir=self.args.logs_dir,
    )

    return overall_metrics.to_dict()
def run_default_pipeline(self, loss_key: str):
    """Train a model with the default set of args"""
    feature_config_path = os.path.join(self.root_data_dir, "tfrecord", self.feature_config_fname)
    feature_config: FeatureConfig = parse_config(
        tfrecord_type=self.args.tfrecord_type,
        feature_config=feature_config_path,
        logger=self.logger,
    )

    data_dir = os.path.join(self.root_data_dir, "tfrecord")
    data_format = "tfrecord"
    metrics_keys = ["MRR"]

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=loss_key, feature_config=feature_config, metrics_keys=metrics_keys
    )

    metrics = ranking_model.model.evaluate(relevance_dataset.test)
    return dict(zip(ranking_model.model.metrics_names, metrics))["loss"]
def main(argv):
    """Convert CSV files into tfrecord SequenceExample files"""
    # Define script arguments
    parser = ArgumentParser(description="Process arguments for ml4ir ranking pipeline.")

    parser.add_argument(
        "--csv_dir",
        type=str,
        default=None,
        help="Path to the data directory containing CSV files",
    )
    parser.add_argument(
        "--csv_file",
        type=str,
        default=None,
        help="Path to the CSV file to convert",
    )
    parser.add_argument(
        "--tfrecord_dir",
        type=str,
        default=None,
        help="Path to the output directory to write TFRecord files",
    )
    parser.add_argument(
        "--tfrecord_file",
        type=str,
        default=None,
        help="Path to the output file to write TFRecord data",
    )
    parser.add_argument(
        "--feature_config",
        type=str,
        default=None,
        help="Path to feature config JSON file or feature config JSON string",
    )
    parser.add_argument(
        "--convert_single_files",
        type=bool,
        default=False,
        help="Whether to convert each CSV file individually. "
        "All occurrences of a query key should be within a single file",
    )
    args = parser.parse_args(argv)

    # Get all CSV files to be converted
    if args.csv_dir:
        csv_files: List[str] = glob.glob(os.path.join(args.csv_dir, "*.csv"))
    else:
        csv_files: List[str] = [args.csv_file]

    feature_config: FeatureConfig = parse_config(args.feature_config)

    # Setup logging
    logger: Logger = setup_logging()

    # Convert to TFRecord SequenceExample protobufs and save
    file_count = 0
    if args.convert_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            if args.tfrecord_dir:
                tfrecord_file: str = os.path.join(
                    args.tfrecord_dir, "file_{}.tfrecord".format(file_count))
            else:
                tfrecord_file: str = args.tfrecord_file

            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
            )
            file_count += 1
    else:
        # Convert all CSV files at once - expensive groupby operation
        if args.tfrecord_dir:
            tfrecord_file: str = os.path.join(
                args.tfrecord_dir, "file_{}.tfrecord".format(file_count))
        else:
            tfrecord_file: str = args.tfrecord_file

        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
        )
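# A minimal entry point plus an illustrative invocation for the converter above
# (the script filename and paths are placeholders, not the repository's actual
# layout):
#
#   python csv_to_tfrecord.py \
#       --csv_dir data/csv \
#       --tfrecord_dir data/tfrecord \
#       --feature_config feature_config.yaml \
#       --convert_single_files True
#
if __name__ == "__main__":
    import sys  # assumed not already imported at module level
    main(sys.argv[1:])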