Example #1
def write_from_files(
    csv_files: List[str],
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    logger: Logger = None,
):
    """
    Converts data from CSV files into tfrecord data.
    Output data protobuf format -> train.Example or train.SequenceExample, depending on tfrecord_type

    Args:
        csv_files: list of csv file paths to read data from
        tfrecord_file: tfrecord file path to write the output
        feature_config: FeatureConfig object defining the features to be loaded
            in the dataset and the preprocessing functions to be applied to each of them
        tfrecord_type: TFRecordTypeKey.EXAMPLE or TFRecordTypeKey.SEQUENCE_EXAMPLE
        file_io: FileIO handler object used to read the CSV files
        logger: logging object

    NOTE: This method should be moved out of ml4ir and into the preprocessing pipeline
    """

    # Read CSV data into a pandas dataframe
    df = file_io.read_df_list(csv_files)
    write_from_df(df, tfrecord_file, feature_config, tfrecord_type, logger)
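

# A minimal verification sketch, not part of ml4ir: it assumes the function
# above was already run with tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE and
# that "data/file_0.tfrecord" (an illustrative path) is the file it produced.
import tensorflow as tf

dataset = tf.data.TFRecordDataset(["data/file_0.tfrecord"])
for serialized in dataset.take(1):
    # Each record in the file is a serialized tf.train.SequenceExample protobuf
    sequence_example = tf.train.SequenceExample.FromString(serialized.numpy())
    print(sequence_example.context)        # context (e.g. query-level) features
    print(sequence_example.feature_lists)  # sequence (e.g. record-level) features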

def write_from_files(
    csv_files: List[str],
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    logger: Logger = None,
):
    """
    Converts data from CSV files into tfrecord files

    Parameters
    ----------
    csv_files : list of str
        list of csv file paths to read data from
    tfrecord_file : str
        tfrecord file path to write the output
    feature_config : `FeatureConfig`
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    file_io : `FileIO` object
        FileIO handler object used to read the CSV files
    logger : `Logger`, optional
        logging handler for status messages
    """

    # Read CSV data into a pandas dataframe
    df = file_io.read_df_list(csv_files)
    write_from_df(df, tfrecord_file, feature_config, tfrecord_type, logger)
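

# A minimal call sketch, not taken from the source: the file paths below are
# illustrative, and `feature_config` / `file_io` are assumed to have been
# constructed elsewhere (a FeatureConfig object and a FileIO implementation,
# e.g. via ml4ir's config and IO helpers) -- their construction is not shown.
import logging

logger = logging.getLogger("tfrecord_writer")

write_from_files(
    csv_files=["data/part-0.csv", "data/part-1.csv"],  # illustrative input paths
    tfrecord_file="data/file_0.tfrecord",              # illustrative output path
    feature_config=feature_config,                     # FeatureConfig defining features to load
    tfrecord_type="sequence_example",                  # or "example"
    file_io=file_io,                                   # FileIO handler used to read the CSVs
    logger=logger,
)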