Example 1
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname

        # Make temp output directory
        file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Load model_config
        self.model_config = file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)
Example 2
    def __init__(self, args: Namespace):
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        self.logs_dir: str = os.path.join(self.args.logs_dir, self.run_id)

        # Setup logging
        file_io.make_directory(self.logs_dir, clear_dir=True, log=None)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.info("CLI args: \n{}".format(
            json.dumps(vars(self.args)).replace(",", "\n")))

        # Setup directories
        self.models_dir: str = os.path.join(self.args.models_dir, self.run_id)
        self.data_dir: str = self.args.data_dir
        file_io.make_directory(self.models_dir,
                               clear_dir=False,
                               log=self.logger)

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.optimizer_key: str = self.args.optimizer_key
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(
                self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = parse_config(
            tfrecord_type=self.tfrecord_type,
            feature_config=self.args.feature_config,
            logger=self.logger,
        )
        self.logger.info("Feature config parsed and loaded")

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")
Example 3
def setup_logging():
    run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    logs_dir: str = os.path.join('logs', run_id)
    file_io.make_directory(logs_dir, clear_dir=True, log=None)

    outfile: str = os.path.join(logs_dir, "output_log.csv")
    logger = logging_utils.setup_logging(reset=True,
                                         file_name=outfile,
                                         log_to_file=True)

    logger.info('Logging initialized. Saving logs to : {}'.format(logs_dir))
    logger.info('Run ID: {}'.format(run_id))
    return logger
Example 4
def run_dataset_creation(data_dir: str = DATA_DIR,
                         out_dir: str = OUT_DIR,
                         feature_config: str = FEATURE_CONFIG,
                         feature_highval: dict = FEATURE_HIGHVAL,
                         feature_num_results: str = FEATURE_NUM_RESULTS,
                         max_num_records: int = MAX_NUM_RECORDS,
                         num_samples: int = NUM_SAMPLES,
                         random_state: int = RANDOM_STATE):
    """
    1. Loads example data
    2. Builds a synthetic dataset of the specified size by sampling from the example data
    3. Explicitly adds catastrophic failures
    4. For now, writes out to CSV; in the future, could return the df directly
    """
    # Setup logging
    logger: Logger = setup_logging()

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info(
            'Set seeds with initial random state {}'.format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = parse_config(
            tfrecord_type='', feature_config=feature_config, logger=logger)
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir, log=logger)
        out_file = os.path.join(
            out_dir, 'synthetic_data_{}.csv'.format(
                dt.datetime.now().strftime('%Y%m%d-%H%M%S')))

        # Build data
        seed_data = load_seed_data(data_dir, logger)

        df_synthetic = fill_data(seed_data, max_num_records, feature_config,
                                 feature_highval, feature_num_results,
                                 num_samples, logger)
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info('Synthetic data created! Location: {}'.format(out_file))
        return df_synthetic

    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(
            str(e)))
        traceback.print_exc()
        return
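A minimal usage sketch of run_dataset_creation as defined above; every path, file name, and constant below is a placeholder assumption rather than a value taken from the example:

# Hypothetical invocation; assumes run_dataset_creation from the snippet above
# is in scope. All arguments are illustrative placeholders.
df_synthetic = run_dataset_creation(
    data_dir="data/example",                       # seed CSVs to sample from
    out_dir="data/synthetic",                      # output directory for the synthetic CSV
    feature_config="configs/feature_config.yaml",  # assumed feature config path
    feature_highval={},                            # assumed: no high-value feature overrides
    feature_num_results="num_results",             # assumed name of the results-count feature
    max_num_records=25,
    num_samples=1000,
    random_state=123,
)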
Example 5
    def setUp(self,
              root_data_dir: str = ROOT_DATA_DIR,
              feature_config: str = FEATURE_CONFIG,
              output_dir: str = OUTPUT_DIR,
              log_dir: str = LOG_DIR):
        self.root_data_dir = root_data_dir
        self.feature_config = feature_config
        self.output_dir = output_dir
        self.log_dir = log_dir

        # Set up logging
        file_io.make_directory(self.log_dir, clear_dir=True)
        outfile: str = os.path.join(self.log_dir, "output_log.csv")
        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)
Example 6
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads CSV-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Args:
        - data_dir: Path to directory containing CSV files to read
        - feature_config: ml4ir.config.features.FeatureConfig object extracted from the feature config
        - tfrecord_type: str value specifying the TFRecord protobuf type used for serialization
        - tfrecord_dir: Path to directory where the serialized .tfrecord files will be stored
        - batch_size: int value specifying the size of the batch
        - preprocessing_keys_to_fns: dict mapping preprocessing keys to custom preprocessing functions
        - use_part_files: bool value specifying whether to look for part files
        - max_sequence_size: int value specifying max number of records per query
        - parse_tfrecord: bool value specifying whether to parse the serialized protobufs into feature tensors
        - logger: logging object

    Returns:
        tensorflow TFRecordDataset
    """
    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        logger=logger,
    )

    return dataset
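A minimal usage sketch of read as defined above, assuming a FeatureConfig has already been parsed; the directory paths and the tfrecord_type value are illustrative assumptions, not values from the source:

# Hypothetical call; assumes read from the snippet above and an
# already-parsed FeatureConfig instance named feature_config are in scope.
dataset = read(
    data_dir="data/csv",               # directory containing the CSV files (assumed)
    feature_config=feature_config,     # parsed FeatureConfig instance
    tfrecord_type="sequence_example",  # assumed TFRecord protobuf type
    tfrecord_dir="data/tfrecord",      # where serialized .tfrecord files are written (assumed)
    batch_size=64,
    max_sequence_size=25,
)

# Per the docstring, each element is expected to be an (X, y) style batch
for batch in dataset.take(1):
    print(batch)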