Beispiel #1
0
    def get_ranking_dataset(self, data_dir: str, data_format: str,
                            feature_config_path: str):
        """
        Build a RelevanceDataset for the given data location.

        Parameters
        ----------
        data_dir: str
            Directory handed to RelevanceDataset as `data_dir`
        data_format: str
            Format key handed to RelevanceDataset as `data_format`
        feature_config_path: str
            Path of the feature config YAML, read through `self.file_io`

        Returns
        -------
        RelevanceDataset
            Dataset configured from `self.args` (splits, batch size, ...)
        """
        tfrecord_type = self.args.tfrecord_type
        config_dict = self.file_io.read_yaml(feature_config_path)
        parsed_config = FeatureConfig.get_instance(
            tfrecord_type=tfrecord_type,
            feature_config_dict=config_dict,
            logger=self.logger,
        )

        # Collect the dataset settings first, then build the dataset in one go
        dataset_kwargs = {
            "data_dir": data_dir,
            "data_format": data_format,
            "feature_config": parsed_config,
            "tfrecord_type": self.args.tfrecord_type,
            "max_sequence_size": self.args.max_sequence_size,
            "batch_size": self.args.batch_size,
            "preprocessing_keys_to_fns": {},
            "train_pcent_split": self.args.train_pcent_split,
            "val_pcent_split": self.args.val_pcent_split,
            "test_pcent_split": self.args.test_pcent_split,
            "use_part_files": self.args.use_part_files,
            "parse_tfrecord": True,
            "file_io": self.file_io,
            "logger": self.logger,
        }
        return RelevanceDataset(**dataset_kwargs)
Beispiel #2
0
    def test_linear_ranking_model_save(self):
        """
        Verify that saving a LinearRankingModel writes the coefficients CSV.

        After saving, coefficients.csv must exist in the models directory, its
        row count must equal the number of distinct train feature node names,
        and every one of those names must appear in its `feature` column.
        """
        config_file = os.path.join(self.root_data_dir, "configs/linear_model", self.feature_config_fname)
        model_config_file = os.path.join(self.root_data_dir, "configs/linear_model", "model_config.yaml")
        self.load_model_config(model_config_file)

        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(config_file),
            logger=self.logger,
        )
        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=self.args.loss_key,
            feature_config=feature_config,
            metrics_keys=["MRR"]
        )

        # Saving the model is expected to produce the coefficients file
        ranking_model.save(models_dir=self.args.models_dir)
        assert os.path.exists(os.path.join(self.args.models_dir, "coefficients.csv"))

        # Every train feature must have exactly one saved coefficient row
        coefficients_df = pd.read_csv(
            os.path.join(self.args.models_dir, "coefficients.csv"))
        expected_features = set(feature_config.get_train_features("node_name"))

        assert len(expected_features) == coefficients_df.shape[0]
        assert all(
            name in coefficients_df.feature.values for name in expected_features
        )
    def get_feature_config(self):
        """
        Load the FeatureConfig from the YAML file in the "config" directory
        under `self.root_data_dir`.

        Returns
        -------
        FeatureConfig
            Instance built for `self.args.tfrecord_type`
        """
        yaml_path = os.path.join(self.root_data_dir, "config", self.feature_config_fname)
        tfrecord_type = self.args.tfrecord_type
        config_dict = self.file_io.read_yaml(yaml_path)

        return FeatureConfig.get_instance(
            tfrecord_type=tfrecord_type,
            feature_config_dict=config_dict,
            logger=self.logger,
        )
Beispiel #4
0
    def run_default_pipeline(self, data_dir: str, data_format: str,
                             feature_config_path: str):
        """
        Train a model for one epoch with the default args and evaluate it.

        Parameters
        ----------
        data_dir: str
            Directory handed to RelevanceDataset as `data_dir`
        data_format: str
            Format key handed to RelevanceDataset as `data_format`
        feature_config_path: str
            Path of the feature config YAML, read through `self.file_io`

        Returns
        -------
        tuple
            (loss, new_MRR): the "loss" entry of the Keras evaluation on the
            test split and the "new_MRR" entry of the first element returned
            by `ranking_model.evaluate`
        """
        seed = 123
        mrr_only = ["MRR"]

        # Reset the Keras session and pin every random source for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(seed)
        tf.random.set_seed(seed)
        random.seed(seed)

        tfrecord_type = self.args.tfrecord_type
        config_dict = self.file_io.read_yaml(feature_config_path)
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=tfrecord_type,
            feature_config_dict=config_dict,
            logger=self.logger,
        )

        dataset_kwargs = {
            "data_dir": data_dir,
            "data_format": data_format,
            "feature_config": feature_config,
            "tfrecord_type": self.args.tfrecord_type,
            "max_sequence_size": self.args.max_sequence_size,
            "batch_size": self.args.batch_size,
            "preprocessing_keys_to_fns": {},
            "train_pcent_split": self.args.train_pcent_split,
            "val_pcent_split": self.args.val_pcent_split,
            "test_pcent_split": self.args.test_pcent_split,
            "use_part_files": self.args.use_part_files,
            "parse_tfrecord": True,
            "file_io": self.file_io,
            "logger": self.logger,
        }
        relevance_dataset = RelevanceDataset(**dataset_kwargs)

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=self.args.loss_key,
            feature_config=feature_config,
            metrics_keys=mrr_only,
        )
        ranking_model.fit(
            dataset=relevance_dataset, num_epochs=1, models_dir=self.output_dir
        )

        # Pair the Keras metric names with their values on the test split
        metric_names = ranking_model.model.metrics_names
        metric_values = ranking_model.model.evaluate(relevance_dataset.test)
        loss = dict(zip(metric_names, metric_values))["loss"]

        evaluation = ranking_model.evaluate(
            test_dataset=relevance_dataset.test,
            logs_dir=self.args.logs_dir,
        )
        new_MRR = evaluation[0]["new_MRR"]

        return loss, new_MRR
Beispiel #5
0
    def get_ranking_dataset_and_model(self,
                                      seed=123,
                                      initialize_layers_dict=None,
                                      freeze_layers_list=None):
        """
        Helper method to get a RankingModel and Dataset with some default args.

        Parameters
        ----------
        seed: int
            Seed applied to numpy, tensorflow and the `random` module
        initialize_layers_dict: dict, optional
            Forwarded to `self.get_ranking_model`; an empty dict when omitted
        freeze_layers_list: list, optional
            Forwarded to `self.get_ranking_model`; an empty list when omitted

        Returns
        -------
        tuple
            (ranking_model, relevance_dataset)
        """
        # Create fresh containers per call: mutable default arguments are
        # shared across calls and would leak any mutation between tests
        if initialize_layers_dict is None:
            initialize_layers_dict = {}
        if freeze_layers_list is None:
            freeze_layers_list = []

        data_dir = os.path.join(self.root_data_dir, DataFormatKey.TFRECORD)
        feature_config_path = os.path.join(self.root_data_dir, "configs",
                                           self.feature_config_fname)
        data_format = DataFormatKey.TFRECORD
        metrics_keys = [MetricKey.MRR]

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(seed)
        tf.random.set_seed(seed)
        random.seed(seed)

        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )

        relevance_dataset = RelevanceDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.file_io,
            logger=self.logger,
        )

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=self.args.loss_key,
            feature_config=feature_config,
            metrics_keys=metrics_keys,
            initialize_layers_dict=initialize_layers_dict,
            freeze_layers_list=freeze_layers_list,
        )

        return ranking_model, relevance_dataset
    def setUp(self):
        """
        Prepare the fixtures shared by the tests: the TFRecord dataset, its
        first serialized record, the feature config and an Example parser.
        """
        yaml_reader = LocalIO()
        root_logger = logging.getLogger()

        # Keep the first record of the dataset around as the sample proto
        self.dataset = tf.data.TFRecordDataset(DATASET_PATH)
        self.proto = next(iter(self.dataset))

        config_dict = yaml_reader.read_yaml(FEATURE_CONFIG_PATH)
        self.feature_config = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.EXAMPLE,
            feature_config_dict=config_dict,
            logger=root_logger,
        )

        parser_kwargs = {
            "feature_config": self.feature_config,
            "preprocessing_map": PreprocessingMap(),
            "required_fields_only": False,
        }
        self.parser = TFRecordExampleParser(**parser_kwargs)
Beispiel #7
0
def main(args):
    """
    Convert CSV files into tfrecord Example/SequenceExample files.

    Parameters
    ----------
    args
        Parsed arguments. The attributes read here are:
        `csv_dir` (directory searched for "*.csv" files; when falsy,
        `csv_files` is used instead), `tfmode` (key into MODES selecting the
        tfrecord type), `feature_config` (path of the feature config YAML),
        `keep_single_files` (write one tfrecord per CSV instead of a single
        combined file) and `out_dir` (output directory).
    """
    # Setup logging
    logger: Logger = setup_logging()
    file_io = LocalIO(logger)

    # Get all CSV files to be converted, depending on user's arguments
    csv_files: List[str]
    if args.csv_dir:
        csv_files = file_io.get_files_in_directory(
            indir=args.csv_dir, extension="*.csv")
    else:
        csv_files = args.csv_files

    # Load feat config
    tfrecord_type = MODES[args.tfmode]
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=tfrecord_type,
        feature_config_dict=file_io.read_yaml(args.feature_config),
        logger=logger,
    )

    # Convert to TFRecord SequenceExample protobufs and save
    tfrecord_file: str
    if args.keep_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            base_name = os.path.basename(csv_file)
            # Strip only a trailing ".csv"; str.replace would also remove
            # any ".csv" occurring in the middle of the file name
            if base_name.endswith(".csv"):
                base_name = base_name[:-len(".csv")]
            tfrecord_file = os.path.join(
                args.out_dir, "{}.tfrecord".format(base_name))
            # file_io is passed here as well, matching the combined branch
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
                tfrecord_type=tfrecord_type,
                file_io=file_io,
            )

    else:
        # Convert all CSV files at once - expensive groupby operation
        tfrecord_file = os.path.join(args.out_dir, "combined.tfrecord")
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
            tfrecord_type=tfrecord_type,
            file_io=file_io,
        )
Beispiel #8
0
    def run_default_pipeline(self, data_dir: str, data_format: str,
                             feature_config_path: str):
        """
        Build the default dataset and model and evaluate on the test split.

        No training is performed in this method; the model returned by
        `self.get_ranking_model` is evaluated directly.

        Parameters
        ----------
        data_dir: str
            Overwritten below with the "tfrecord" directory under
            `self.root_data_dir`
        data_format: str
            Overwritten below with "tfrecord"
        feature_config_path: str
            Path of the feature config YAML, read through `self.file_io`

        Returns
        -------
        dict
            `to_dict()` of the overall metrics returned by the evaluation
        """
        tfrecord_type = self.args.tfrecord_type
        config_dict = self.file_io.read_yaml(feature_config_path)
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=tfrecord_type,
            feature_config_dict=config_dict,
            logger=self.logger,
        )

        # NOTE(review): the caller-supplied data_dir / data_format are
        # replaced here - confirm whether that is intentional
        data_format = "tfrecord"
        data_dir = os.path.join(self.root_data_dir, "tfrecord")

        dataset_kwargs = {
            "data_dir": data_dir,
            "data_format": data_format,
            "feature_config": feature_config,
            "tfrecord_type": self.args.tfrecord_type,
            "max_sequence_size": self.args.max_sequence_size,
            "batch_size": self.args.batch_size,
            "preprocessing_keys_to_fns": {},
            "train_pcent_split": self.args.train_pcent_split,
            "val_pcent_split": self.args.val_pcent_split,
            "test_pcent_split": self.args.test_pcent_split,
            "use_part_files": self.args.use_part_files,
            "parse_tfrecord": True,
            "file_io": self.file_io,
            "logger": self.logger,
        }
        relevance_dataset = RelevanceDataset(**dataset_kwargs)

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=self.args.loss_key,
            feature_config=feature_config,
            metrics_keys=["categorical_accuracy", "MRR", "ACR"],
        )

        # Only the overall metrics are needed; the second result is discarded
        overall_metrics, _unused = ranking_model.evaluate(
            test_dataset=relevance_dataset.test,
            logs_dir=self.args.logs_dir,
        )
        return overall_metrics.to_dict()
Beispiel #9
0
    def run_default_pipeline(self, loss_key: str):
        """
        Build the default dataset and a model with the given loss, then
        return the loss measured on the test split.

        No training is performed in this method; the model returned by
        `self.get_ranking_model` is evaluated directly.

        Parameters
        ----------
        loss_key: str
            Loss key forwarded to `self.get_ranking_model`

        Returns
        -------
        The "loss" entry of the Keras evaluation on the test split
        """
        yaml_path = os.path.join(self.root_data_dir, "configs",
                                 self.feature_config_fname)
        tfrecord_type = self.args.tfrecord_type
        config_dict = self.file_io.read_yaml(yaml_path)
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=tfrecord_type,
            feature_config_dict=config_dict,
            logger=self.logger,
        )

        dataset_kwargs = {
            "data_dir": os.path.join(self.root_data_dir, "tfrecord"),
            "data_format": "tfrecord",
            "feature_config": feature_config,
            "tfrecord_type": self.args.tfrecord_type,
            "max_sequence_size": self.args.max_sequence_size,
            "batch_size": self.args.batch_size,
            "preprocessing_keys_to_fns": {},
            "train_pcent_split": self.args.train_pcent_split,
            "val_pcent_split": self.args.val_pcent_split,
            "test_pcent_split": self.args.test_pcent_split,
            "use_part_files": self.args.use_part_files,
            "parse_tfrecord": True,
            "file_io": self.file_io,
            "logger": self.logger,
        }
        relevance_dataset = RelevanceDataset(**dataset_kwargs)

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=loss_key,
            feature_config=feature_config,
            metrics_keys=["MRR"],
        )

        # Evaluate first, then pair the metric names with the values
        metric_values = ranking_model.model.evaluate(relevance_dataset.test)
        metric_names = ranking_model.model.metrics_names
        return dict(zip(metric_names, metric_values))["loss"]
Beispiel #10
0
    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up data, logs, models directories, file handlers used.
        The method also loads and sets up the FeatureConfig for the model training
        pipeline

        Parameters
        ----------
        args: argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID: use the supplied one, otherwise host name + timestamp
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            # Remote locations come from the args; the *_local directories
            # live under the default directories on the local file system
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS,
                                               self.run_id)
            self.data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                               os.path.basename(self.data_dir))
        else:
            self.models_dir_local = os.path.join(self.args.models_dir,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging (the logs directory must exist before the logger is created)
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(
            json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir,
                                        DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file if present from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
                self.file_io.copy_from_hdfs(self.model_file,
                                            DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS,
                    os.path.basename(self.model_file))

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        # metrics_keys is either a single key or a string holding a list literal
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(
                self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except (AttributeError, KeyError):
                # A Namespace raises AttributeError for a missing attribute;
                # catching only KeyError left this fallback unreachable
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        # Normalise an empty model_file to None. The value is deliberately NOT
        # reset to args.model_file: for the SPARK handler that would discard
        # the path of the local copy made above.
        if not self.model_file:
            self.model_file = None

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(
                self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")
Beispiel #11
0
    def __init__(self, args: Namespace):
        """
        Set up the run ID, directories, logging, file handler and
        FeatureConfig of the pipeline from the given arguments.

        Parameters
        ----------
        args: argparse Namespace
            arguments to be used with the pipeline
        """
        self.args = args

        # Run ID: use the supplied one, otherwise host name + timestamp
        supplied_run_id = self.args.run_id
        if len(supplied_run_id) > 0:
            self.run_id: str = supplied_run_id
        else:
            host_name = socket.gethostname()
            time_stamp = time.strftime("%Y%m%d-%H%M%S")
            self.run_id = "-".join([host_name, time_stamp])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = self.logs_dir_hdfs = self.data_dir_hdfs = None

        handler = self.args.file_handler
        if handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            # Local working directories live under the default locations
            models_root = DefaultDirectoryKey.MODELS
            logs_root = DefaultDirectoryKey.LOGS
            data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                          os.path.basename(self.data_dir))
        else:
            models_root = self.args.models_dir
            logs_root = self.args.logs_dir
            data_dir_local = self.args.data_dir
        self.models_dir_local = os.path.join(models_root, self.run_id)
        self.logs_dir_local = os.path.join(logs_root, self.run_id)
        self.data_dir_local = data_dir_local

        # Setup logging (the logs directory must exist before the logger is created)
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info(
            "Logging initialized. Saving logs to : {}".format(self.logs_dir_local)
        )
        self.logger.info("Run ID: {}".format(self.run_id))
        cli_args_json = json.dumps(vars(self.args), indent=4)
        self.logger.debug("CLI args: \n{}".format(cli_args_json))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Pick the file handler and do its specific setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Bring the data directory from HDFS to the local file system
            temp_data_dir = DefaultDirectoryKey.TEMP_DATA
            self.local_io.make_directory(dir_path=temp_data_dir, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir, temp_data_dir)

            # Bring the model file too, when one was given, and point to the copy
            if self.model_file:
                temp_models_dir = DefaultDirectoryKey.TEMP_MODELS
                self.local_io.make_directory(dir_path=temp_models_dir,
                                             clear_dir=True)
                self.file_io.copy_from_hdfs(self.model_file, temp_models_dir)
                self.model_file = os.path.join(
                    temp_models_dir, os.path.basename(self.model_file))

        # Model config YAML location
        self.model_config_file = self.args.model_config

        # Remaining arguments
        self.loss_key: str = self.args.loss_key
        self.optimizer_key: str = self.args.optimizer_key
        # metrics_keys is either a single key or a string holding a list literal
        raw_metrics_keys = self.args.metrics_keys
        if raw_metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(raw_metrics_keys)
        else:
            self.metrics_keys = [raw_metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # Validate args, then set random seeds
        self.validate_args()
        self.set_seeds()

        # Load and parse feature config
        feature_config_dict = self.file_io.read_yaml(self.args.feature_config)
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=feature_config_dict,
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")
Beispiel #12
0
def run_dataset_creation(
    data_dir: str = DATA_DIR,
    out_dir: str = OUT_DIR,
    feature_config_path: str = FEATURE_CONFIG,
    feature_highval: dict = FEATURE_HIGHVAL,
    feature_num_results: str = FEATURE_NUM_RESULTS,
    max_num_records: int = MAX_NUM_RECORDS,
    num_samples: int = NUM_SAMPLES,
    random_state: int = RANDOM_STATE,
):
    """
    Create a synthetic dataset from seed data and write it out as CSV.

    Steps:
    1. Loads example data
    2. Builds specified synthetic data size by sampling from example data
    3. Adds catastrophic failures specifically
    4. For now, write out to CSV. In future could return df directly

    Returns the synthetic dataframe on success. Any exception is logged,
    its traceback printed, and None is returned instead.
    """
    # Logging and file IO setup
    file_io = LocalIO()
    logger: Logger = setup_logging(file_io)
    file_io.set_logger(logger)

    try:
        # Seed the random sources
        set_seeds(random_state)
        logger.info(
            "Set seeds with initial random state {}".format(random_state)
        )

        # Feature config is always parsed as SequenceExample here
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            feature_config_dict=file_io.read_yaml(feature_config_path),
            logger=logger,
        )
        logger.info("Feature config parsed and loaded")

        # Output file is named after the current timestamp
        file_io.make_directory(out_dir)
        timestamp = dt.datetime.now().strftime("%Y%m%d-%H%M%S")
        out_file = os.path.join(
            out_dir, "synthetic_data_{}.csv".format(timestamp)
        )

        # Build the synthetic data from the seed data and persist it
        seed_data = load_seed_data(data_dir, logger, file_io)
        df_synthetic = fill_data(
            seed_data,
            max_num_records,
            feature_config,
            feature_highval,
            feature_num_results,
            num_samples,
            logger,
        )
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info("Synthetic data created! Location: {}".format(out_file))
        return df_synthetic

    except Exception as err:
        # Best-effort entry point: report the failure and return None
        logger.error(
            "!!! Error creating synthetic data: !!!\n{}".format(str(err))
        )
        traceback.print_exc()
        return None