def get_feature_meta(column, preprocessing_parameters, backend):
    """Compute the string/boolean mapping metadata for a binary feature column.

    Columns whose dtype is not ``object`` need no mapping and yield an empty
    dict. Otherwise every distinct value is converted with
    ``strings_utils.str2bool``, which also receives a fallback "true" label.

    Args:
        column: Column to inspect; must expose ``dtype``, ``name`` and
            ``drop_duplicates()`` (presumably a pandas-like Series).
        preprocessing_parameters: Mapping of preprocessing options. If it
            contains ``"fallback_true_label"`` that value is used as the
            fallback; otherwise the first distinct value in sorted order is
            picked and a warning is logged.
        backend: Object whose ``df_engine.compute`` materializes the
            de-duplicated column.

    Returns:
        ``{}`` for non-object columns; otherwise a dict with the keys
        ``"str2bool"`` (distinct value -> bool), ``"bool2str"`` (distinct
        values ordered by their boolean, false first) and
        ``"fallback_true_label"``.

    Raises:
        ValueError: If the column holds more than two distinct values.
    """
    if column.dtype != object:
        return {}

    unique_values = backend.df_engine.compute(column.drop_duplicates())
    if len(unique_values) > 2:
        raise ValueError(
            f"Binary feature column {column.name} expects 2 distinct values, "
            f"found: {unique_values.values.tolist()}"
        )

    if "fallback_true_label" not in preprocessing_parameters:
        # No explicit label configured: pick one deterministically and tell the user.
        fallback_true_label = sorted(unique_values)[0]
        logger.warning(
            f"In case binary feature {column.name} doesn't have conventional boolean values, "
            f"we will interpret {fallback_true_label} as 1 and the other values as 0. "
            f"If this is incorrect, please use the category feature type or "
            f"manually specify the true value with `preprocessing.fallback_true_label`."
        )
    else:
        fallback_true_label = preprocessing_parameters["fallback_true_label"]

    str2bool = {}
    for value in unique_values:
        str2bool[value] = strings_utils.str2bool(value, fallback_true_label)

    # Stable sort on the mapped boolean keeps insertion order among equal values.
    bool2str = sorted(str2bool, key=str2bool.get)

    return {
        "str2bool": str2bool,
        "bool2str": bool2str,
        "fallback_true_label": fallback_true_label,
    }
def test_str_to_bool():
    """Exercise ``strings_utils.str2bool`` with and without a fallback label."""
    # Global bool mappings are used.
    for truthy_text in ("True", "true"):
        assert strings_utils.str2bool(truthy_text)
    assert not strings_utils.str2bool("0")

    # Error raised if non-mapped value is encountered and no fallback is specified.
    with pytest.raises(Exception):
        strings_utils.str2bool("bot")

    # Fallback label is used.
    fallback_cases = (
        ("bot", "bot", True),
        ("human", "bot", False),
        ("human", "human", True),
        ("human", "Human", False),  # label differing only in case does not match
    )
    for text, label, expected in fallback_cases:
        assert bool(strings_utils.str2bool(text, fallback_true_label=label)) is expected

    # Fallback label is used, strictly as a fallback.
    assert strings_utils.str2bool("True", fallback_true_label="False")
def get_feature_meta(column, preprocessing_parameters, backend):
    """Compute the string/boolean mapping metadata for a binary feature column.

    Columns whose dtype is not ``object`` need no mapping and yield an empty
    dict. Otherwise every distinct value is converted with
    ``strings_utils.str2bool``.

    Args:
        column: Column to inspect; must expose ``dtype``, ``name`` and
            ``drop_duplicates()`` (presumably a pandas-like Series).
        preprocessing_parameters: Accepted for interface compatibility; not
            read by this implementation.
        backend: Object whose ``df_engine.compute`` materializes the
            de-duplicated column.

    Returns:
        ``{}`` for non-object columns; otherwise a dict with the keys
        ``'str2bool'`` (distinct value -> bool) and ``'bool2str'`` (distinct
        values ordered by their boolean, false first).

    Raises:
        ValueError: If the column holds more than two distinct values.
    """
    if column.dtype != object:
        return {}

    unique_values = backend.df_engine.compute(column.drop_duplicates())
    if len(unique_values) > 2:
        raise ValueError(
            f'Binary feature column {column.name} expects 2 distinct values, '
            f'found: {unique_values.values.tolist()}'
        )

    str2bool = {}
    for value in unique_values:
        str2bool[value] = strings_utils.str2bool(value)

    # Stable sort on the mapped boolean keeps insertion order among equal values.
    bool2str = sorted(str2bool, key=str2bool.get)

    return {'str2bool': str2bool, 'bool2str': bool2str}
def t_neuropod(csv_filename):
    """Check that a Neuropod export predicts the same values as the Ludwig model.

    The routine generates a synthetic dataset covering many input and output
    feature types, trains a Ludwig model for two epochs, saves it, exports it
    with ``export_neuropod``, runs inference through the loaded Neuropod and
    compares predictions and probabilities against the Ludwig predictions.

    Args:
        csv_filename: Path of the CSV file the synthetic dataset is written
            to. Its directory also receives the saved model and the Neuropod
            package.

    Raises:
        AssertionError: If any compared prediction or probability differs.
    """
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')
    # Resolved up front so the cleanup in ``finally`` knows every path even
    # when a step fails early.
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    neuropod_path = os.path.join(dir_path, 'neuropod')

    try:
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]
        output_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature(),
        ]

        # Generate test data
        data_csv_path = generate_data(input_features, output_features,
                                      csv_filename)

        #############
        # Train model
        #############
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {'epochs': 2},
        }
        ludwig_model = LudwigModel(model_definition)
        ludwig_model.train(
            data_csv=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )
        original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

        ###################
        # save Ludwig model
        ###################
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ################
        # build neuropod
        ################
        export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path)

        ########################
        # predict using neuropod
        ########################
        data_df = pd.read_csv(data_csv_path)
        # Every input column is fed as a column vector of strings.
        if_dict = {
            input_feature['name']: np.expand_dims(
                np.array(
                    [str(x) for x in data_df[input_feature['name']].tolist()],
                    dtype='str'),
                1)
            for input_feature in input_features
        }

        # Imported lazily, as in the original, so the module can be imported
        # without neuropod installed.
        from neuropod.loader import load_neuropod
        neuropod_model = load_neuropod(neuropod_path)
        preds = neuropod_model.infer(if_dict)
        for key in preds:
            preds[key] = np.squeeze(preds[key])
    finally:
        #########
        # cleanup
        #########
        # Delete the temporary data created, even if a step above failed.
        for path in [
            ludwigmodel_path,
            neuropod_path,
            image_dest_folder,
            audio_dest_folder,
        ]:
            if os.path.exists(path):
                if os.path.isfile(path):
                    os.remove(path)
                else:
                    shutil.rmtree(path, ignore_errors=True)

    ########
    # checks
    ########
    for output_feature in output_features:
        output_feature_name = output_feature['name']
        output_feature_type = output_feature['type']

        predictions_key = output_feature_name + "_predictions"
        if (predictions_key in preds
                and predictions_key in original_predictions_df):
            neuropod_pred = preds[predictions_key].tolist()
            if output_feature_type == BINARY:
                neuropod_pred = [str2bool(x) for x in neuropod_pred]
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_pred = [x.split() for x in neuropod_pred]

            original_pred = original_predictions_df[predictions_key].tolist()

            assert neuropod_pred == original_pred

        probability_key = output_feature_name + "_probability"
        if (probability_key in preds
                and probability_key in original_predictions_df):
            neuropod_prob = preds[probability_key].tolist()
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_prob = [
                    [float(n) for n in x.split()] for x in neuropod_prob
                ]
            # Ragged per-row lists are zero-padded to a rectangular array
            # before the numeric comparison.
            if any(isinstance(el, list) for el in neuropod_prob):
                neuropod_prob = np.array(
                    list(itertools.zip_longest(*neuropod_prob, fillvalue=0))
                ).T

            original_prob = original_predictions_df[probability_key].tolist()
            if any(isinstance(el, list) for el in original_prob):
                original_prob = np.array(
                    list(itertools.zip_longest(*original_prob, fillvalue=0))
                ).T

            assert np.allclose(neuropod_prob, original_prob)

        probabilities_key = output_feature_name + "_probabilities"
        if (probabilities_key in preds
                and probabilities_key in original_predictions_df):
            neuropod_prob = preds[probabilities_key].tolist()
            original_prob = original_predictions_df[probabilities_key].tolist()

            assert np.allclose(neuropod_prob, original_prob)
def test_neuropod(csv_filename):
    """Check that a Neuropod export predicts the same values as the Ludwig model.

    Everything happens inside a temporary directory: a synthetic dataset is
    generated, a Ludwig model is trained for two epochs and saved, the saved
    model is exported with ``export_neuropod``, and the outputs of the loaded
    Neuropod are compared with the predictions of the Ludwig model.

    Args:
        csv_filename: File name joined onto the temporary directory to form
            the path of the generated dataset CSV.

    Raises:
        AssertionError: If any compared prediction or probability differs.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        #######
        # Setup
        #######
        dir_path = tmpdir
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]
        output_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature(),
        ]

        # Generate test data
        data_csv_path = generate_data(
            input_features, output_features, os.path.join(tmpdir, csv_filename)
        )

        #############
        # Train model
        #############
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "training": {"epochs": 2},
        }
        ludwig_model = LudwigModel(config, backend=LocalTestBackend())
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
            output_directory=dir_path,
        )

        data_df = pd.read_csv(data_csv_path)
        original_predictions_df, _ = ludwig_model.predict(dataset=data_df)

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ################
        # build neuropod
        ################
        neuropod_path = os.path.join(dir_path, "neuropod")
        shutil.rmtree(neuropod_path, ignore_errors=True)
        export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path, entrypoint="get_test_model")

        ########################
        # predict using neuropod
        ########################
        # Every input column is fed as a column vector of strings.
        neuropod_inputs = {}
        for feature in input_features:
            column_name = feature["name"]
            as_strings = [str(value) for value in data_df[column_name].tolist()]
            neuropod_inputs[column_name] = np.expand_dims(np.array(as_strings, dtype="str"), 1)

        from neuropod.loader import load_neuropod

        neuropod_model = load_neuropod(neuropod_path, _always_use_native=False)
        preds = neuropod_model.infer(neuropod_inputs)
        for key in preds:
            preds[key] = np.squeeze(preds[key])

        #########
        # cleanup
        #########
        # Delete the temporary data created
        for leftover in (ludwigmodel_path, neuropod_path, image_dest_folder, audio_dest_folder):
            if not os.path.exists(leftover):
                continue
            if os.path.isfile(leftover):
                os.remove(leftover)
            else:
                shutil.rmtree(leftover, ignore_errors=True)

        ########
        # checks
        ########
        for feature in output_features:
            feature_name = feature["name"]
            feature_type = feature["type"]

            predictions_key = feature_name + "_predictions"
            if predictions_key in preds and predictions_key in original_predictions_df:
                neuropod_pred = preds[predictions_key].tolist()
                if feature_type == BINARY:
                    neuropod_pred = [str2bool(item) for item in neuropod_pred]
                if feature_type in {SEQUENCE, TEXT, SET}:
                    neuropod_pred = [item.split() for item in neuropod_pred]

                original_pred = original_predictions_df[predictions_key].tolist()

                assert neuropod_pred == original_pred

            probability_key = feature_name + "_probability"
            if probability_key in preds and probability_key in original_predictions_df:
                neuropod_prob = preds[probability_key].tolist()
                if feature_type in {SEQUENCE, TEXT, SET}:
                    neuropod_prob = [[float(token) for token in item.split()] for item in neuropod_prob]
                # Ragged per-row lists are zero-padded into a rectangular array.
                if any(isinstance(row, list) for row in neuropod_prob):
                    padded = list(itertools.zip_longest(*neuropod_prob, fillvalue=0))
                    neuropod_prob = np.array(padded).T

                original_prob = original_predictions_df[probability_key].tolist()
                if any(isinstance(row, list) for row in original_prob):
                    padded = list(itertools.zip_longest(*original_prob, fillvalue=0))
                    original_prob = np.array(padded).T

                assert np.allclose(neuropod_prob, original_prob)

            probabilities_key = feature_name + "_probabilities"
            if probabilities_key in preds and probabilities_key in original_predictions_df:
                neuropod_prob = preds[probabilities_key].tolist()
                original_prob = original_predictions_df[probabilities_key].tolist()

                assert np.allclose(neuropod_prob, original_prob)
def test_neuropod(csv_filename):
    """Check that a Neuropod export predicts the same values as the Ludwig model.

    A model with a single sequence input and several outputs is trained for
    two epochs on generated data, saved, exported with ``export_neuropod`` and
    loaded back through Neuropod. The Neuropod outputs are then compared with
    the predictions of the Ludwig model.

    Args:
        csv_filename: Path of the CSV file the synthetic dataset is written
            to. Its directory also receives the saved model and the Neuropod
            package.

    Raises:
        AssertionError: If any compared prediction or probability differs.
    """
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)
    # Resolved up front so the cleanup in ``finally`` knows every path even
    # when a step fails early.
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    neuropod_path = os.path.join(dir_path, 'neuropod')

    # Single sequence input, multiple outputs
    input_features = [sequence_feature()]
    input_feature_name = input_features[0]['name']
    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    try:
        # Generate test data
        data_csv_path = generate_data(input_features, output_features,
                                      csv_filename)

        #############
        # Train model
        #############
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {'epochs': 2},
        }
        ludwig_model = LudwigModel(model_definition)
        ludwig_model.train(
            data_csv=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )
        original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

        ###################
        # save Ludwig model
        ###################
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ################
        # build neuropod
        ################
        export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path)

        ########################
        # predict using neuropod
        ########################
        data_df = pd.read_csv(data_csv_path)
        if_vals = data_df[input_feature_name].tolist()

        # Imported lazily, as in the original, so the module can be imported
        # without neuropod installed.
        from neuropod.loader import load_neuropod
        neuropod_model = load_neuropod(neuropod_path)
        preds = neuropod_model.infer(
            {input_feature_name: np.array(if_vals, dtype='str')})
    finally:
        #########
        # cleanup
        #########
        # Remove the saved model and the Neuropod package even if a step
        # above failed.
        for path in [ludwigmodel_path, neuropod_path]:
            if os.path.exists(path):
                if os.path.isfile(path):
                    os.remove(path)
                else:
                    shutil.rmtree(path, ignore_errors=True)

    ########
    # checks
    ########
    for output_feature in output_features:
        output_feature_name = output_feature['name']
        output_feature_type = output_feature['type']

        predictions_key = output_feature_name + "_predictions"
        if (predictions_key in preds
                and predictions_key in original_predictions_df):
            neuropod_pred = preds[predictions_key].tolist()
            if output_feature_type == BINARY:
                neuropod_pred = [str2bool(x) for x in neuropod_pred]
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_pred = [x.split() for x in neuropod_pred]

            original_pred = original_predictions_df[predictions_key].tolist()

            assert neuropod_pred == original_pred

        probability_key = output_feature_name + "_probability"
        if (probability_key in preds
                and probability_key in original_predictions_df):
            neuropod_prob = preds[probability_key].tolist()
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_prob = [
                    [float(n) for n in x.split()] for x in neuropod_prob
                ]
            # Ragged per-row lists are zero-padded to a rectangular array
            # before the numeric comparison.
            if any(isinstance(el, list) for el in neuropod_prob):
                neuropod_prob = np.array(
                    list(itertools.zip_longest(*neuropod_prob, fillvalue=0))
                ).T

            original_prob = original_predictions_df[probability_key].tolist()
            if any(isinstance(el, list) for el in original_prob):
                original_prob = np.array(
                    list(itertools.zip_longest(*original_prob, fillvalue=0))
                ).T

            assert np.allclose(neuropod_prob, original_prob)

        probabilities_key = output_feature_name + "_probabilities"
        if (probabilities_key in preds
                and probabilities_key in original_predictions_df):
            neuropod_prob = preds[probabilities_key].tolist()
            original_prob = original_predictions_df[probabilities_key].tolist()

            assert np.allclose(neuropod_prob, original_prob)