def dict_to_csv(data_dic: dict,
                root_dir: str,
                file_io: FileIO,
                zip_output: bool = True) -> str:
    """Saves input dictionary to a csv file and zips if requested
    Parameters
    ---------
        data_dic: dict
                input dict to be converted to a zipped csv file
        root_dir: str
                path to save the output file
        file_io: FileIO
                file I/O handler objects for reading and writing data
        zip_output: bool
                boolean value indicates whether the output should be zipped
    Returns
    -------
        `str`
         path to the created zip file
    """
    # creating zip dir
    final_dir_path = os.path.join(root_dir, TEMPERATURE_SCALE)
    file_io.make_directory(final_dir_path)

    csv_path = os.path.join(final_dir_path, f'{TEMPERATURE_SCALE}.csv')

    # creating .csv
    pd.DataFrame.from_dict(data_dic).to_csv(csv_path, index=False)
    if zip_output:
        # creating .zip file; make_archive returns the path of the archive it created
        zip_path = shutil.make_archive(final_dir_path, "zip", root_dir, TEMPERATURE_SCALE)

        # removing the dir, keeping only the zip
        shutil.rmtree(final_dir_path)
        return zip_path
    return final_dir_path
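A minimal usage sketch (not part of the library): `local_io` below is a hypothetical concrete `FileIO` instance and `TEMPERATURE_SCALE` is assumed to be defined in the enclosing module.

# Illustrative only; `local_io` stands in for any concrete FileIO implementation
measurements = {"city": ["Lyon", "Oslo"], "temperature": [21.5, 14.0]}
output_path = dict_to_csv(data_dic=measurements,
                          root_dir="/tmp/output",
                          file_io=local_io,
                          zip_output=True)
# with zip_output=True, output_path points to the generated .zip archive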
Example #2
def write_from_files(
    csv_files: List[str],
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    logger: Logger = None,
):
    """
    Converts data from CSV files into tfrecord data.
    Output data protobuf format -> train.SequenceExample

    Args:
        csv_files: list of csv file paths to read data from
        tfrecord_file: tfrecord file path to write the output
        feature_config: str path to YAML feature config or str YAML feature config
        tfrecord_type: TFRecordTypeKey.EXAMPLE or TFRecordTypeKey.SEQUENCE_EXAMPLE
        logger: logging object

    NOTE: This method should be moved out of ml4ir and into the preprocessing pipeline
    """

    # Read CSV data into a pandas dataframe
    df = file_io.read_df_list(csv_files)
    write_from_df(df, tfrecord_file, feature_config, tfrecord_type, logger)
Example #3
def write_from_files(
    csv_files: List[str],
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    logger: Logger = None,
):
    """
    Converts data from CSV files into tfrecord files

    Parameters
    ----------
    csv_files : list of str
        list of csv file paths to read data from
    tfrecord_file : str
        tfrecord file path to write the output
    feature_config : `FeatureConfig`
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    file_io : `FileIO` object
        file I/O handler object for reading and writing data
    logger : `Logger`, optional
        logging handler for status messages
    """

    # Read CSV data into a pandas dataframe
    df = file_io.read_df_list(csv_files)
    write_from_df(df, tfrecord_file, feature_config, tfrecord_type, logger)
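A hedged usage sketch: `feature_config` and `file_io` below are assumed to be pre-built objects, and the file paths are placeholders.

# Illustrative only; all paths and objects are placeholders
write_from_files(
    csv_files=["data/train/part-00000.csv", "data/train/part-00001.csv"],
    tfrecord_file="data/tfrecord/file_0.tfrecord",
    feature_config=feature_config,   # assumed pre-built FeatureConfig
    tfrecord_type="sequence_example",
    file_io=file_io,                 # assumed concrete FileIO instance
)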
Example #4
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         file_io: FileIO,
         max_sequence_size: int = 0,
         batch_size: int = 0,
         preprocessing_keys_to_fns: dict = {},
         parse_tfrecord: bool = True,
         use_part_files: bool = False,
         logger: Logger = None,
         **kwargs) -> data.TFRecordDataset:
    """
    - reads tfrecord data from an input directory
    - selects relevant features
    - creates X and y data

    Args:
        data_dir: path to directory containing tfrecord files to read
        feature_config: FeatureConfig object that defines the features to be loaded
            in the dataset and the preprocessing functions to be applied to each of them
        tfrecord_type: either "example" or "sequence_example"
        file_io: file I/O handler object for reading and writing data
        max_sequence_size: maximum number of records per SequenceExample proto message;
            data is padded or clipped to this size
        batch_size: int value specifying the size of each data batch
        preprocessing_keys_to_fns: dictionary mapping preprocessing keys in the feature_config to functions
        parse_tfrecord: whether to parse the serialized protobufs into features
        use_part_files: whether to load the dataset from part files (checked using the "part-" prefix)
        logger: logging object

    Returns:
        tensorflow TFRecordDataset loaded from `data_dir` and parsed using the FeatureConfig
    """
    parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
    )

    # Get all tfrecord files in directory
    tfrecord_files = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".tfrecord",
        prefix="part-" if use_part_files else "",
    )

    # Parse the protobuf data to create a TFRecordDataset
    dataset = data.TFRecordDataset(tfrecord_files)
    if parse_tfrecord:
        dataset = dataset.map(parse_fn).apply(
            data.experimental.ignore_errors())

    # Create BatchedDataSet
    if batch_size:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if logger:
        logger.info(
            "Created TFRecordDataset from SequenceExample protobufs from {} files : {}"
            .format(len(tfrecord_files),
                    str(tfrecord_files)[:50]))

    return dataset
Example #5
    def from_model_config_file(
        cls,
        model_config_file: str,
        interaction_model: InteractionModel,
        loss: RelevanceLossBase,
        file_io: FileIO,
        output_name: str = "score",
        feature_config: Optional[FeatureConfig] = None,
        logger: Optional[Logger] = None,
    ):
        """
        Get a Scorer object from a YAML model config file

        Parameters
        ----------
        model_config_file : str
            Path to YAML file defining the model layer configuration
        feature_config : `FeatureConfig` object
            FeatureConfig object defining the features and their configurations
        interaction_model : `InteractionModel` object
            InteractionModel that defines the feature transformation layers
            on the input model features
        loss : `RelevanceLossBase` object
            Relevance loss object that defines the final activation layer
            and the loss function for the model
        file_io : `FileIO` object
            FileIO object that handles read and write
        output_name : str, optional
            Name of the output that captures the score computed by the model
        logger : `Logger`, optional
            Logging handler

        Returns
        -------
        `ScorerBase` object
            ScorerBase object that computes the scores from the input features of the model
        """
        model_config = file_io.read_yaml(model_config_file)

        return cls(
            model_config=model_config,
            feature_config=feature_config,
            interaction_model=interaction_model,
            loss=loss,
            file_io=file_io,
            output_name=output_name,
            logger=logger
        )
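A sketch of how this classmethod might be invoked; `MyScorer` is a hypothetical subclass exposing it, and the interaction model, loss, feature config, file I/O and logger objects are assumed to be constructed elsewhere.

# Illustrative only; names below are placeholders
scorer = MyScorer.from_model_config_file(
    model_config_file="configs/model_config.yaml",
    interaction_model=interaction_model,
    loss=relevance_loss,
    file_io=file_io,
    output_name="score",
    feature_config=feature_config,
    logger=logger,
)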
Example #6
    def from_model_config_file(
        cls,
        model_config_file: str,
        interaction_model: InteractionModel,
        loss: RelevanceLossBase,
        output_name: str,
        file_io: FileIO,
        logger: Optional[Logger] = None,
    ):
        """Build an instance from a YAML model config file (variant that does not take a FeatureConfig)"""
        model_config = file_io.read_yaml(model_config_file)

        return cls(
            model_config=model_config,
            interaction_model=interaction_model,
            loss=loss,
            output_name=output_name,
        )
Example #7
def read(
    data_dir: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    max_sequence_size: int = 0,
    batch_size: int = 0,
    preprocessing_keys_to_fns: dict = {},
    parse_tfrecord: bool = True,
    use_part_files: bool = False,
    logger: Logger = None,
    **kwargs
) -> data.TFRecordDataset:
    """
    Extract features by reading and parsing TFRecord data
    and converting into a TFRecordDataset using the FeatureConfig

    Parameters
    ----------
    data_dir : str
        path to the directory containing train, validation and test data
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    file_io : `FileIO` object
        file I/O handler objects for reading and writing data
    max_sequence_size : int, optional
        maximum number of sequence records to be used with a single SequenceExample proto message
        The data will be appropriately padded or clipped to fit the max value specified
    batch_size : int, optional
        size of each data batch
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions
        that can now be used for preprocessing while loading the
        TFRecordDataset to create the RelevanceDataset object
    use_part_files : bool, optional
        load dataset from part files checked using "part-" prefix
    parse_tfrecord : bool, optional
        parse the TFRecord string from the dataset;
        returns strings as is otherwise
    logger : `Logger`, optional
        logging handler for status messages

    Returns
    -------
    `TFRecordDataset`
        TFRecordDataset loaded from the `data_dir` specified using the FeatureConfig
    """
    parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
    )

    # Get all tfrecord files in directory
    tfrecord_files = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".tfrecord",
        prefix="part-" if use_part_files else "",
    )

    # Parse the protobuf data to create a TFRecordDataset
    dataset = data.TFRecordDataset(tfrecord_files)

    if parse_tfrecord:
        # Parallel calls set to AUTOTUNE: improved training performance by 40% with a classification model
        dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).apply(
            data.experimental.ignore_errors()
        )

    # Create BatchedDataSet
    if batch_size:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if logger:
        logger.info(
            "Created TFRecordDataset from SequenceExample protobufs from {} files : {}".format(
                len(tfrecord_files), str(tfrecord_files)[:50]
            )
        )

    # We apply prefetch as it improved train/test/validation throughput by 30% in some real model training.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
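A usage sketch under the assumption that `data_dir` already contains serialized .tfrecord files and that `feature_config` and `file_io` are pre-built objects.

# Illustrative only; paths and objects are placeholders
dataset = read(
    data_dir="data/tfrecord",
    feature_config=feature_config,
    tfrecord_type="sequence_example",
    file_io=file_io,
    max_sequence_size=25,
    batch_size=128,
)
for batch in dataset.take(1):
    print(batch)  # one parsed, batched element of the dataset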
Example #8
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    Create a TFRecordDataset from directory of CSV files using the FeatureConfig

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Parameters
    ----------
    data_dir : str
        path to directory containing csv files to read
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    tfrecord_dir : str
        path to directory where the serialized .tfrecord files will be stored
    file_io : `FileIO` object
        file I/O handler object for reading and writing data
    batch_size : int, optional
        size of each data batch
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions
        that can be used for preprocessing while loading the TFRecordDataset
    use_part_files : bool, optional
        load dataset from part files checked using "part-" prefix
    max_sequence_size : int, optional
        maximum number of records per query
    parse_tfrecord : bool, optional
        parse the TFRecord string from the dataset; returns strings as is otherwise
    logger : `Logger`, optional
        logging handler to print and save status messages

    Returns
    -------
    `TFRecordDataset` object
        tensorflow TFRecordDataset loaded from the CSV file
    """
    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        file_io=file_io,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
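A hedged usage sketch: the call below converts the CSVs under `data_dir` into .tfrecord files inside `tfrecord_dir` and returns the parsed dataset; `feature_config` and `file_io` are assumed to be pre-built objects.

# Illustrative only; paths and objects are placeholders
dataset = read(
    data_dir="data/csv/train",
    feature_config=feature_config,
    tfrecord_type="sequence_example",
    tfrecord_dir="data/tfrecord/train",
    file_io=file_io,
    batch_size=64,
)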
Example #9
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         keep_additional_info=0,
         non_zero_features_only=1,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads ranklib-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Convert ranklib to a dataframe
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Parameters
    ----------
    data_dir : str
        path to directory containing ranklib-formatted files to read
    feature_config : `FeatureConfig` object
        FeatureConfig object extracted from the feature config
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    tfrecord_dir : str
        path to directory where the serialized .tfrecord files will be stored
    file_io : `FileIO` object
        file I/O handler object for reading and writing data
    batch_size : int, optional
        size of each data batch
    use_part_files : bool, optional
        whether to look for part files using the "part-" prefix
    max_sequence_size : int, optional
        maximum number of records per query
    logger : `Logger`, optional
        logging handler for status messages
    keep_additional_info : int, optional
        option to keep additional info (everything after the "#"); 1 to keep, 0 to ignore
    non_zero_features_only : int, optional
        whether to store only non-zero features; 1 for yes, 0 otherwise

    Returns
    -------
    `TFRecordDataset` object
        tensorflow TFRecordDataset loaded and processed from the ranklib input
    """
    ranklib_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".txt",
        prefix="part-" if use_part_files else "",
    )

    gl_2_clicks = False

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Convert input ranklib files to a dataframe
    df = pd.concat([
        ranklib_helper.convert(f, keep_additional_info, gl_2_clicks,
                               non_zero_features_only,
                               feature_config.get_query_key()['name'],
                               feature_config.get_label()['name'])
        for f in ranklib_files
    ])

    # Write tfrecord files
    tfrecord_writer.write_from_df(df=df,
                                  tfrecord_file=os.path.join(
                                      tfrecord_dir, TFRECORD_FILE),
                                  feature_config=feature_config,
                                  tfrecord_type=tfrecord_type,
                                  logger=logger)

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
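A hedged example of reading ranklib-formatted .txt files while keeping the trailing "#" comment block and storing all features (not only the non-zero ones); `feature_config` and `file_io` are assumed to be pre-built objects.

# Illustrative only; paths and objects are placeholders
dataset = read(
    data_dir="data/ranklib",
    feature_config=feature_config,
    tfrecord_type="sequence_example",
    tfrecord_dir="data/tfrecord/ranklib",
    file_io=file_io,
    keep_additional_info=1,       # keep everything after the "#" marker
    non_zero_features_only=0,     # store all features, including zeros
)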
Example #10
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads csv-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Args:
        - data_dir: Path to directory containing csv files to read
        - feature_config: ml4ir.config.features.FeatureConfig object extracted from the feature config
        - tfrecord_type: either "example" or "sequence_example"
        - tfrecord_dir: Path to directory where the serialized .tfrecord files will be stored
        - file_io: file I/O handler object for reading and writing data
        - batch_size: int value specifying the size of the batch
        - use_part_files: bool value specifying whether to look for part files
        - max_sequence_size: int value specifying max number of records per query
        - logger: logging object

    Returns:
        tensorflow TFRecordDataset
    """
    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        file_io=file_io,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset