Code Example #1
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         file_io: FileIO,
         max_sequence_size: int = 0,
         batch_size: int = 0,
         preprocessing_keys_to_fns: dict = {},
         parse_tfrecord: bool = True,
         use_part_files: bool = False,
         logger: Logger = None,
         **kwargs) -> data.TFRecordDataset:
    """
    - reads tfrecord data from an input directory
    - selects relevant features
    - creates X and y data

    Args:
        data_dir: Path to directory containing TFRecord files to read
        feature_config: ml4ir.config.features.FeatureConfig object extracted from the feature config
        tfrecord_type: either example or sequence_example
        file_io: file I/O handler object for reading and writing data
        max_sequence_size: int value specifying the max number of records per SequenceExample; data is padded or clipped to fit
        batch_size: int value specifying the size of the batch
        preprocessing_keys_to_fns: dictionary mapping preprocessing keys in the feature_config to functions
        parse_tfrecord: whether to parse SequenceExamples into features
        use_part_files: bool value specifying whether to look for part files with the "part-" prefix
        logger: logging object

    Returns:
        tensorflow dataset
    """
    parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
    )

    # Get all tfrecord files in directory
    tfrecord_files = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".tfrecord",
        prefix="part-" if use_part_files else "",
    )

    # Parse the protobuf data to create a TFRecordDataset
    dataset = data.TFRecordDataset(tfrecord_files)
    if parse_tfrecord:
        dataset = dataset.map(parse_fn).apply(
            data.experimental.ignore_errors())

    # Create BatchedDataSet
    if batch_size:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if logger:
        logger.info(
            "Created TFRecordDataset from SequenceExample protobufs from {} files : {}"
            .format(len(tfrecord_files),
                    str(tfrecord_files)[:50]))

    return dataset
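
A minimal usage sketch for the reader above. Hedged: the directory path, logger name, and the pre-built feature_config / file_io objects are illustrative assumptions, not ml4ir specifics.

import logging

logger = logging.getLogger("tfrecord_reader_demo")

dataset = read(
    data_dir="/path/to/tfrecords",     # directory of .tfrecord files (assumed path)
    feature_config=feature_config,     # assumed pre-built FeatureConfig object
    tfrecord_type="sequence_example",  # or "example"
    file_io=file_io,                   # assumed pre-built FileIO handler
    max_sequence_size=25,
    batch_size=128,
    logger=logger,
)

# Peek at one parsed, batched element
for batch in dataset.take(1):
    print(batch)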
Code Example #2
def read(
    data_dir: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    max_sequence_size: int = 0,
    batch_size: int = 0,
    preprocessing_keys_to_fns: dict = {},
    parse_tfrecord: bool = True,
    use_part_files: bool = False,
    logger: Logger = None,
    **kwargs
) -> data.TFRecordDataset:
    """
    Extract features by reading and parsing TFRecord data
    and converting into a TFRecordDataset using the FeatureConfig

    Parameters
    ----------
    data_dir : str
        path to the directory containing train, validation and test data
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    file_io : `FileIO` object
        file I/O handler objects for reading and writing data
    max_sequence_size : int, optional
        maximum number of sequence records to be used with a single SequenceExample proto message;
        the data will be appropriately padded or clipped to fit the specified maximum
    batch_size : int, optional
        size of each data batch
    preprocessing_keys_to_fns : dict of (str, function), optional
        dictionary of function names mapped to function definitions
        that can be used for preprocessing while loading the
        TFRecordDataset to create the RelevanceDataset object
    parse_tfrecord : bool, optional
        parse the TFRecord string from the dataset;
        returns strings as-is otherwise
    use_part_files : bool, optional
        load dataset from part files identified by the "part-" prefix
    logger : `Logger`, optional
        logging handler for status messages

    Returns
    -------
    `TFRecordDataset`
        TFRecordDataset loaded from the `data_dir` specified using the FeatureConfig
    """
    parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
    )

    # Get all tfrecord files in directory
    tfrecord_files = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".tfrecord",
        prefix="part-" if use_part_files else "",
    )

    # Parse the protobuf data to create a TFRecordDataset
    dataset = data.TFRecordDataset(tfrecord_files)

    if parse_tfrecord:
        # Parallel calls set to AUTOTUNE: improved training performance by 40% with a classification model
        dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).apply(
            data.experimental.ignore_errors()
        )

    # Create BatchedDataSet
    if batch_size:
        dataset = dataset.batch(batch_size, drop_remainder=True)

    if logger:
        logger.info(
            "Created TFRecordDataset from SequenceExample protobufs from {} files : {}".format(
                len(tfrecord_files), str(tfrecord_files)[:50]
            )
        )

    # We apply prefetch as it improved train/test/validation throughput by 30% in some real model training.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
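
Example #2 differs from #1 in the parallel map and the final prefetch. The same tf.data idiom in isolation, as a self-contained toy sketch (the 40% / 30% throughput figures quoted in the comments above are the original authors' measurements, not something this sketch reproduces):

import tensorflow as tf

ds = tf.data.Dataset.range(1000)

# Let the tf.data runtime choose the map parallelism and prefetch buffer size
ds = ds.map(lambda x: x * 2, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds = ds.batch(32, drop_remainder=True)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)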
Code Example #3
File: csv_reader.py  Project: sureshannapureddy/ml4ir
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    Create a TFRecordDataset from directory of CSV files using the FeatureConfig

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Parameters
    ----------
    data_dir : str
        Path to directory containing csv files to read
    feature_config : FeatureConfig object
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    tfrecord_dir : str
        Path to directory where the serialized .tfrecord files will be stored
    file_io : FileIO object
        file I/O handler object for reading and writing data
    batch_size : int
        value specifying the size of the data batch
    preprocessing_keys_to_fns : dict
        dictionary mapping preprocessing keys in the feature_config to functions
    use_part_files : bool
        load dataset from part files identified by the "part-" prefix
    max_sequence_size : int
        value specifying max number of records per query
    parse_tfrecord : bool
        parse the TFRecord string from the dataset; returns strings as-is otherwise
    logger : Logger object
        logging handler to print and save status messages

    Returns
    -------
    `TFRecordDataset` object
        tensorflow TFRecordDataset loaded from the CSV file
    """
    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        file_io=file_io,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
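
The execution plan above (CSV → SequenceExample → .tfrecord → TFRecordDataset) can be illustrated end to end with plain TensorFlow. A minimal sketch of steps 2-4 with a made-up toy query and feature names; this is not ml4ir's tfrecord_writer implementation:

import tensorflow as tf

# Step 2: one query -> SequenceExample (context = query-level features,
# feature_lists = per-record features); names and values are illustrative
seq_example = tf.train.SequenceExample(
    context=tf.train.Features(
        feature={
            "query_id": tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[b"q1"])
            )
        }
    ),
    feature_lists=tf.train.FeatureLists(
        feature_list={
            "rank_score": tf.train.FeatureList(
                feature=[
                    tf.train.Feature(float_list=tf.train.FloatList(value=[s]))
                    for s in (0.9, 0.4, 0.1)
                ]
            )
        }
    ),
)

# Step 3: write the serialized proto into a .tfrecord file
with tf.io.TFRecordWriter("/tmp/demo.tfrecord") as writer:
    writer.write(seq_example.SerializeToString())

# Step 4: load the file back as a TFRecordDataset of raw proto strings,
# ready to be parsed with a parse_fn as in the readers above
dataset = tf.data.TFRecordDataset(["/tmp/demo.tfrecord"])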
Code Example #4
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         keep_additional_info=0,
         non_zero_features_only=1,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads ranklib-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Convert ranklib to a dataframe
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Parameters
    ----------
    data_dir: str
        Path to directory containing RankLib-formatted files to read
    feature_config: ml4ir.config.features.FeatureConfig object
        FeatureConfig object extracted from the feature config
    tfrecord_dir: str
        Path to directory where the serialized .tfrecord files will be stored
    batch_size: int
        Value specifying the size of the batch
    use_part_files: bool
        Value specifying whether to look for part files
    max_sequence_size: int
        Value specifying max number of records per query
    logger: logging object
        logging handler to print and save status messages
    keep_additional_info: int
        Option to keep additional info (everything after the "#"); 1 to keep, 0 to ignore
    non_zero_features_only: int
        Only non-zero features are stored; 1 for yes, 0 otherwise

    Returns
    -------
    tensorflow TFRecordDataset
        Processed dataset
    """
    ranklib_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".txt",
        prefix="part-" if use_part_files else "",
    )

    # Inferred from the flag name: whether to convert graded relevance
    # labels into binary click labels; disabled here
    gl_2_clicks = False

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Convert input RankLib files to a dataframe
    df = pd.concat([
        ranklib_helper.convert(f, keep_additional_info, gl_2_clicks,
                               non_zero_features_only,
                               feature_config.get_query_key()['name'],
                               feature_config.get_label()['name'])
        for f in ranklib_files
    ])

    # Write tfrecord files
    tfrecord_writer.write_from_df(df=df,
                                  tfrecord_file=os.path.join(
                                      tfrecord_dir, TFRECORD_FILE),
                                  feature_config=feature_config,
                                  tfrecord_type=tfrecord_type,
                                  logger=logger)

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
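
For context on step 1, RankLib files use the LETOR-style line format `<label> qid:<query_id> <feature_id>:<value> ... # additional info`. A hedged sketch of parsing a single such line; this is not ml4ir's ranklib_helper.convert, just an illustration of what it consumes and of the keep_additional_info / non_zero_features_only flags:

def parse_ranklib_line(line: str, keep_additional_info: bool = False) -> dict:
    # Split off the optional "# ..." trailer
    body, _, info = line.partition("#")
    tokens = body.split()
    record = {
        "label": float(tokens[0]),
        "query_id": tokens[1].split(":", 1)[1],  # "qid:<id>" -> "<id>"
        # Features absent from the line are implicitly zero when only
        # non-zero features were stored on the writing side
        "features": {
            int(k): float(v)
            for k, v in (tok.split(":", 1) for tok in tokens[2:])
        },
    }
    if keep_additional_info:
        record["info"] = info.strip()
    return record


parse_ranklib_line("2 qid:10 1:0.03 5:0.84 # doc-123", keep_additional_info=True)
# -> {'label': 2.0, 'query_id': '10', 'features': {1: 0.03, 5: 0.84}, 'info': 'doc-123'}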
Code Example #5
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads csv-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Load CSVs as pandas dataframes
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

    Args:
        - data_dir: Path to directory containing csv files to read
        - feature_config: ml4ir.config.features.FeatureConfig object extracted from the feature config
        - tfrecord_dir: Path to directory where the serialized .tfrecord files will be stored
        - batch_size: int value specifying the size of the batch
        - use_part_files: bool value specifying whether to look for part files
        - max_sequence_size: int value specifying max number of records per query
        - logger: logging object

    Returns:
        tensorflow TFRecordDataset
    """
    csv_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".csv",
        prefix="part-" if use_part_files else "",
    )

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Write tfrecord files
    tfrecord_writer.write_from_files(
        csv_files=csv_files,
        tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        file_io=file_io,
        logger=logger,
    )

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
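
A short usage sketch for this CSV reader. Hedged: the paths are illustrative, and feature_config / file_io are assumed to be constructed elsewhere as in the earlier examples.

dataset = read(
    data_dir="/path/to/csv_dir",           # directory of .csv files (assumed path)
    feature_config=feature_config,         # assumed pre-built FeatureConfig object
    tfrecord_type="sequence_example",
    tfrecord_dir="/path/to/tfrecord_out",  # serialized .tfrecord files land here
    file_io=file_io,                       # assumed pre-built FileIO handler
    batch_size=128,
    max_sequence_size=25,
)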