Esempio n. 1
0
    def __init__(self, args: Namespace):
        """Initialize run identity, logging, directories and feature config.

        Args:
            args: Parsed command line arguments. The attributes read here
                are run_id, logs_dir, models_dir, data_dir, model_config,
                loss_key, optimizer_key, metrics_keys, data_format,
                tfrecord_type and feature_config.
        """
        self.args = args

        # Generate Run ID
        # Truthiness (rather than len() > 0) also treats run_id=None as
        # "not provided" instead of raising TypeError.
        if self.args.run_id:
            self.run_id: str = self.args.run_id
        else:
            # Fall back to "<hostname>-<local timestamp>"
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        self.logs_dir: str = os.path.join(self.args.logs_dir, self.run_id)

        # Setup logging
        # The directory has to exist before the logger is created, so no
        # logger can be handed to make_directory yet.
        # NOTE(review): clear_dir=True presumably empties an existing logs
        # directory for a reused run ID -- confirm in file_io.
        file_io.make_directory(self.logs_dir, clear_dir=True, log=None)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir))
        self.logger.info("Run ID: {}".format(self.run_id))
        # One "key: value" pair per line; note that commas inside values
        # are replaced by newlines as well.
        self.logger.info("CLI args: \n{}".format(
            json.dumps(vars(self.args)).replace(",", "\n")))

        # Setup directories
        self.models_dir: str = os.path.join(self.args.models_dir, self.run_id)
        self.data_dir: str = self.args.data_dir
        file_io.make_directory(self.models_dir,
                               clear_dir=False,
                               log=self.logger)

        # Read/Parse model config YAML
        # Only the path is stored here; nothing is parsed in this method.
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.optimizer_key: str = self.args.optimizer_key
        # metrics_keys is either a single key or a Python-literal list
        # such as "['MRR', 'ACR']".
        # NOTE(review): an empty metrics_keys string raises IndexError on
        # the [0] lookup below.
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(
                self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = parse_config(
            tfrecord_type=self.tfrecord_type,
            feature_config=self.args.feature_config,
            logger=self.logger,
        )
        self.logger.info("Feature config parsed and loaded")

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")
Esempio n. 2
0
    def get_ranking_dataset(self, data_dir: str, data_format: str,
                            feature_config_path: str):
        """Build a RelevanceDataset for the given data location.

        Args:
            data_dir: Directory handed to RelevanceDataset as ``data_dir``.
            data_format: Format handed to RelevanceDataset as ``data_format``.
            feature_config_path: Feature config passed to ``parse_config``.

        Returns:
            The constructed RelevanceDataset. Batch size, sequence size,
            split percentages and part-file usage come from ``self.args``.
        """
        cli_args = self.args

        # The parsed feature config is a constructor input of the dataset
        parsed_config: FeatureConfig = parse_config(
            tfrecord_type=cli_args.tfrecord_type,
            feature_config=feature_config_path,
            logger=self.logger,
        )

        dataset_kwargs = dict(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=parsed_config,
            tfrecord_type=cli_args.tfrecord_type,
            max_sequence_size=cli_args.max_sequence_size,
            batch_size=cli_args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=cli_args.train_pcent_split,
            val_pcent_split=cli_args.val_pcent_split,
            test_pcent_split=cli_args.test_pcent_split,
            use_part_files=cli_args.use_part_files,
            parse_tfrecord=True,
            logger=self.logger,
        )
        return RelevanceDataset(**dataset_kwargs)
Esempio n. 3
0
    def get_feature_config(self):
        """Parse the feature config at <root_data_dir>/tfrecord/<feature_config_fname>.

        Returns:
            The value returned by ``parse_config`` for that file.
        """
        config_location = os.path.join(
            self.root_data_dir, "tfrecord", self.feature_config_fname)

        return parse_config(
            tfrecord_type=self.args.tfrecord_type,
            feature_config=config_location,
            logger=self.logger,
        )
Esempio n. 4
0
    def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
        """Fit a ranking model for one epoch and report its test loss and new_MRR.

        Args:
            data_dir: Directory handed to RelevanceDataset as ``data_dir``.
            data_format: Format handed to RelevanceDataset as ``data_format``.
            feature_config_path: Feature config passed to ``parse_config``.

        Returns:
            Tuple ``(loss, new_MRR)``: the "loss" entry of the Keras
            evaluation on the test split, and the "new_MRR" entry of the
            first element returned by ``RankingModel.evaluate``.
        """
        seed = 123
        metric_names_to_track = ["MRR"]

        # Reset Keras state and pin every random source for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(seed)
        tf.random.set_seed(seed)
        random.seed(seed)

        cli_args = self.args

        parsed_config: FeatureConfig = parse_config(
            tfrecord_type=cli_args.tfrecord_type,
            feature_config=feature_config_path,
            logger=self.logger,
        )

        dataset_kwargs = dict(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=parsed_config,
            tfrecord_type=cli_args.tfrecord_type,
            max_sequence_size=cli_args.max_sequence_size,
            batch_size=cli_args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=cli_args.train_pcent_split,
            val_pcent_split=cli_args.val_pcent_split,
            test_pcent_split=cli_args.test_pcent_split,
            use_part_files=cli_args.use_part_files,
            parse_tfrecord=True,
            logger=self.logger,
        )
        dataset = RelevanceDataset(**dataset_kwargs)

        model: RankingModel = self.get_ranking_model(
            loss_key=cli_args.loss_key,
            feature_config=parsed_config,
            metrics_keys=metric_names_to_track,
        )

        model.fit(dataset=dataset, num_epochs=1, models_dir=self.output_dir)

        # Read the metric names before evaluating, then pair them up with
        # the values Keras reports for the test split.
        keras_metric_names = model.model.metrics_names
        keras_metric_values = model.model.evaluate(dataset.test)
        loss = dict(zip(keras_metric_names, keras_metric_values))["loss"]

        evaluation = model.evaluate(
            test_dataset=dataset.test, logs_dir=cli_args.logs_dir,
        )
        new_MRR = evaluation[0]["new_MRR"]

        return loss, new_MRR
Esempio n. 5
0
def run_dataset_creation(data_dir: str = DATA_DIR,
                         out_dir: str = OUT_DIR,
                         feature_config: str = FEATURE_CONFIG,
                         feature_highval: dict = FEATURE_HIGHVAL,
                         feature_num_results: str = FEATURE_NUM_RESULTS,
                         max_num_records: int = MAX_NUM_RECORDS,
                         num_samples: int = NUM_SAMPLES,
                         random_state: int = RANDOM_STATE):
    """
    Create a synthetic dataset from seed data and write it to a CSV file.

    Steps:
    1. Seed the random state and parse the feature config
    2. Load the seed (example) data from data_dir
    3. Build the synthetic data via fill_data (sampling and failure
       injection presumably happen there -- confirm in fill_data)
    4. Write the result to <out_dir>/synthetic_data_<timestamp>.csv

    Returns the synthetic dataframe on success. Any exception is logged,
    its traceback printed, and None is returned instead.
    """
    # Setup logging
    logger: Logger = setup_logging()

    try:
        # Seed everything up front
        set_seeds(random_state)
        logger.info(
            f'Set seeds with initial random state {random_state}')

        # Parse the feature config into its own name rather than
        # rebinding the string parameter
        parsed_config: FeatureConfig = parse_config(
            tfrecord_type='', feature_config=feature_config, logger=logger)
        logger.info("Feature config parsed and loaded")

        # Prepare the timestamped output file location
        file_io.make_directory(out_dir, log=logger)
        timestamp = dt.datetime.now().strftime('%Y%m%d-%H%M%S')
        out_file = os.path.join(
            out_dir, 'synthetic_data_{}.csv'.format(timestamp))

        # Build and persist the synthetic data
        seed_df = load_seed_data(data_dir, logger)
        synthetic_df = fill_data(seed_df, max_num_records, parsed_config,
                                 feature_highval, feature_num_results,
                                 num_samples, logger)
        file_io.write_df(synthetic_df, outfile=out_file, index=False)
        logger.info(f'Synthetic data created! Location: {out_file}')
        return synthetic_df

    except Exception as err:
        logger.error(
            "!!! Error creating synthetic data: !!!\n{}".format(str(err)))
        traceback.print_exc()
        return None
Esempio n. 6
0
    def run_default_pipeline(self, data_dir: str, data_format: str,
                             feature_config_path: str):
        """Evaluate a ranking model on the tfrecord test split and return its metrics.

        No fitting happens here; the model from ``get_ranking_model`` is
        evaluated as-is.

        Args:
            data_dir: Not used -- see the review note in the body.
            data_format: Not used -- see the review note in the body.
            feature_config_path: Feature config passed to ``parse_config``.

        Returns:
            ``to_dict()`` of the first value returned by
            ``RankingModel.evaluate``.
        """
        cli_args = self.args

        parsed_config: FeatureConfig = parse_config(
            tfrecord_type=cli_args.tfrecord_type,
            feature_config=feature_config_path,
            logger=self.logger,
        )

        # NOTE(review): the data_dir and data_format arguments are ignored;
        # the dataset is always read as tfrecord from
        # <root_data_dir>/tfrecord. Confirm that callers expect this.
        tfrecord_dir = os.path.join(self.root_data_dir, "tfrecord")

        dataset_kwargs = dict(
            data_dir=tfrecord_dir,
            data_format="tfrecord",
            feature_config=parsed_config,
            tfrecord_type=cli_args.tfrecord_type,
            max_sequence_size=cli_args.max_sequence_size,
            batch_size=cli_args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=cli_args.train_pcent_split,
            val_pcent_split=cli_args.val_pcent_split,
            test_pcent_split=cli_args.test_pcent_split,
            use_part_files=cli_args.use_part_files,
            parse_tfrecord=True,
            logger=self.logger,
        )
        dataset = RelevanceDataset(**dataset_kwargs)

        model: RankingModel = self.get_ranking_model(
            loss_key=cli_args.loss_key,
            feature_config=parsed_config,
            metrics_keys=["categorical_accuracy", "MRR", "ACR"],
        )

        # Only the first of the two returned values is reported
        overall_metrics, _unused = model.evaluate(
            test_dataset=dataset.test,
            logs_dir=cli_args.logs_dir,
        )
        return overall_metrics.to_dict()
Esempio n. 7
0
    def run_default_pipeline(self, loss_key: str):
        """Evaluate a ranking model built with ``loss_key`` and return its test loss.

        No fitting happens here; the model from ``get_ranking_model`` is
        evaluated as-is on the tfrecord test split.

        Args:
            loss_key: Loss identifier forwarded to ``get_ranking_model``.

        Returns:
            The "loss" entry of the Keras evaluation on the test split.
        """
        cli_args = self.args

        # Both the feature config and the data live under <root_data_dir>/tfrecord
        config_location = os.path.join(
            self.root_data_dir, "tfrecord", self.feature_config_fname)
        parsed_config: FeatureConfig = parse_config(
            tfrecord_type=cli_args.tfrecord_type,
            feature_config=config_location,
            logger=self.logger,
        )
        tfrecord_dir = os.path.join(self.root_data_dir, "tfrecord")

        dataset_kwargs = dict(
            data_dir=tfrecord_dir,
            data_format="tfrecord",
            feature_config=parsed_config,
            tfrecord_type=cli_args.tfrecord_type,
            max_sequence_size=cli_args.max_sequence_size,
            batch_size=cli_args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=cli_args.train_pcent_split,
            val_pcent_split=cli_args.val_pcent_split,
            test_pcent_split=cli_args.test_pcent_split,
            use_part_files=cli_args.use_part_files,
            parse_tfrecord=True,
            logger=self.logger,
        )
        dataset = RelevanceDataset(**dataset_kwargs)

        model: RankingModel = self.get_ranking_model(
            loss_key=loss_key,
            feature_config=parsed_config,
            metrics_keys=["MRR"],
        )

        # Evaluate first, then pair the values with the Keras metric names
        metric_values = model.model.evaluate(dataset.test)
        named_metrics = dict(zip(model.model.metrics_names, metric_values))
        return named_metrics["loss"]
Esempio n. 8
0
def _str_to_bool(value: str) -> bool:
    """Parse a command line boolean such as "true"/"false" or "1"/"0".

    Raises:
        ValueError: If the text is not a recognised boolean spelling;
            argparse reports this as an invalid argument value.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, as it was under the previous type=bool.
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise ValueError("not a boolean: {!r}".format(value))


def _tfrecord_path(args, file_index: int):
    """Return the output file for the given index, preferring --tfrecord_dir."""
    if args.tfrecord_dir:
        return os.path.join(
            args.tfrecord_dir, "file_{}.tfrecord".format(file_index))
    return args.tfrecord_file


def main(argv):
    """Convert CSV files into tfrecord SequenceExample files.

    Args:
        argv: Command line arguments, without the program name.

    Exits through argparse (status 2) when the arguments are invalid:
    neither --csv_dir nor --csv_file is given, neither --tfrecord_dir nor
    --tfrecord_file is given, or --convert_single_files is not a boolean.
    """

    # Define script arguments
    parser = ArgumentParser(
        description="Process arguments for ml4ir ranking pipeline.")

    parser.add_argument("--csv_dir",
                        type=str,
                        default=None,
                        help="Path to the data directory containing CSV files")
    parser.add_argument("--csv_file",
                        type=str,
                        default=None,
                        help="Path to the CSV file to convert")
    parser.add_argument(
        "--tfrecord_dir",
        type=str,
        default=None,
        help="Path to the output directory to write TFRecord files",
    )
    parser.add_argument(
        "--tfrecord_file",
        type=str,
        default=None,
        help="Path to the output file to write TFRecord data",
    )
    parser.add_argument(
        "--feature_config",
        type=str,
        default=None,
        help="Path to feature config JSON file or feature config JSON string",
    )
    parser.add_argument(
        "--convert_single_files",
        # type=bool would turn every non-empty string, including "False",
        # into True, so the value is parsed explicitly.
        type=_str_to_bool,
        default=False,
        help="Whether to convert each CSV file individually. "
        "All occurrences of a query key should be within a single file",
    )
    args = parser.parse_args(argv)

    # Fail early with a usage message instead of handing None downstream
    if not (args.csv_dir or args.csv_file):
        parser.error("one of --csv_dir or --csv_file is required")
    if not (args.tfrecord_dir or args.tfrecord_file):
        parser.error("one of --tfrecord_dir or --tfrecord_file is required")

    # Get all CSV files to be converted; --csv_dir takes precedence
    if args.csv_dir:
        csv_files: List[str] = glob.glob(os.path.join(args.csv_dir, "*.csv"))
    else:
        csv_files = [args.csv_file]

    feature_config: FeatureConfig = parse_config(args.feature_config)

    # Setup logging
    logger: Logger = setup_logging()

    # Convert to TFRecord SequenceExample protobufs and save
    if args.convert_single_files:
        # Convert each CSV file individually - better performance
        # NOTE(review): without --tfrecord_dir every CSV is written to the
        # same --tfrecord_file path; whether that overwrites or appends
        # depends on write_from_files.
        for file_count, csv_file in enumerate(csv_files):
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=_tfrecord_path(args, file_count),
                feature_config=feature_config,
                logger=logger,
            )
    else:
        # Convert all CSV files at once - expensive groupby operation
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=_tfrecord_path(args, 0),
            feature_config=feature_config,
            logger=logger,
        )