Beispiel #1
0
def test_train_product_classifier_from_embeddings():
    """Train the product classifier from transformed embeddings via the CLI.

    Invokes the ``cli.wtsp`` group with ``train products --model classifier``
    against the test work directory, then checks that the command exits with
    code 0 and that each expected artifact exists. The work directory is
    removed at the end of the test.
    """
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    # The transformed embeddings are the command input; the test assumes the
    # working directory already has an embeddings model trained.
    embeddings_input = common.get_full_path(
        tests_path, common.TRANSFORMED_EMBEDDINGS_PATH)

    train_params = (
        "from_embeddings=True,label_col=categories,doc_col=document,"
        "classes=10,test_size=0.3,lr=0.01,epochs=10,vec_size=300"
    )
    cli_args = [
        "--work-dir", work_dir,
        "train", "products",
        "--model", "classifier",
        "--params", train_params,
        embeddings_input,
    ]

    outcome = CliRunner().invoke(cli.wtsp, cli_args)
    assert outcome.exit_code == 0

    # The classifier output directory must exist...
    classifier_dir = f"{work_dir}/products/models/classifier"
    assert os.path.exists(classifier_dir)

    # ...and so must every artifact written into it.
    expected_artifacts = (
        "category_encoder.model",
        "prod_classifier-def.yaml",
        "prod_classifier-weights.h5",
        "training_history.png",
        "classification_report.png",
    )
    for artifact in expected_artifacts:
        assert os.path.exists(f"{classifier_dir}/{artifact}")

    common.delete_path(work_dir)
Beispiel #2
0
def test_train_product_embeddings():
    """Train the product embeddings model via the CLI.

    Invokes the ``cli.wtsp`` group with ``train products --model embeddings``
    on the raw products data, then checks the exit code and that the
    ``d2v_model.model`` file exists under the embeddings output directory.
    The work directory is removed at the end of the test.
    """
    raw_products = common.get_full_path(tests_path, common.RAW_PRODUCTS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)

    train_params = (
        "label_col=categories,doc_col=document,lr=0.0002,epochs=10,"
        "vec_size=300,alpha=0.025,min_alpha=0.00025,min_count=1"
    )
    cli_args = [
        "--work-dir", work_dir,
        "train", "products",
        "--model", "embeddings",
        "--params", train_params,
        raw_products,
    ]

    outcome = CliRunner().invoke(cli.wtsp, cli_args)
    assert outcome.exit_code == 0

    # The embeddings output directory and the trained model must both exist.
    embeddings_dir = f"{work_dir}/products/models/embeddings"
    assert os.path.exists(embeddings_dir)
    assert os.path.exists(f"{embeddings_dir}/d2v_model.model")

    common.delete_path(work_dir)
Beispiel #3
0
def test_describe_tweets():
    """Describe the raw tweets filtered by ``country_code=US`` via the CLI.

    Invokes ``cli.describe`` with the ``tweets`` argument, then checks that
    the command exits with code 0 and that ``counts.csv`` and
    ``bar_chart.png`` exist under ``<output>/tweets/country_code=US``.
    That result directory is removed at the end of the test.
    """
    runner = CliRunner()
    input_data = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    output_path = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    result = runner.invoke(cli.describe, [
        'tweets', "--filters", "country_code=US", "--output-dir", output_path,
        # Command-line arguments are strings on a real invocation, so the
        # count is passed as "10" rather than as an int.
        "--min-count", "10", input_data
    ])
    assert result.exit_code == 0
    # validate the existence of the output directory
    tweets_describe_result = f"{output_path}/tweets/country_code=US"
    assert os.path.exists(tweets_describe_result)
    # and the content
    assert os.path.exists(f"{tweets_describe_result}/counts.csv")
    assert os.path.exists(f"{tweets_describe_result}/bar_chart.png")
    common.delete_path(tweets_describe_result)
Beispiel #4
0
def test_describe_products_with_explode():
    """Describe the raw products grouped by ``categories`` with ``--explode``.

    Invokes ``cli.describe`` with the ``products`` argument, then checks that
    the command exits with code 0 and that ``counts.csv`` and
    ``bar_chart.png`` exist under ``<output>/documents``. That result
    directory is removed at the end of the test.
    """
    runner = CliRunner()
    input_data = common.get_full_path(tests_path, common.RAW_PRODUCTS_PATH)
    output_path = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    result = runner.invoke(cli.describe, [
        'products', "--output-dir", output_path, "--groupby", "categories",
        # Command-line arguments are strings on a real invocation, so the
        # count is passed as "10" rather than as an int.
        "--min-count", "10", "--explode", input_data
    ])
    assert result.exit_code == 0
    # validate the existence of the output directory
    products_describe_result = f"{output_path}/documents"
    assert os.path.exists(products_describe_result)
    # and the content
    assert os.path.exists(f"{products_describe_result}/counts.csv")
    assert os.path.exists(f"{products_describe_result}/bar_chart.png")
    common.delete_path(products_describe_result)
Beispiel #5
0
def test_train_tweets_n_neighbors():
    """Train the nearest-neighbors model on tweets filtered to Los Angeles.

    Invokes ``cli.train_tweets`` with ``--model nearest-neighbors``, then
    checks that the command exits with code 0 and that both plot images exist
    under ``<output>/tweets/place_name=Los Angeles``. The output directory is
    removed at the end of the test.
    """
    raw_tweets = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    out_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)

    cli_args = [
        "--model", "nearest-neighbors",
        "--filters", "place_name='Los Angeles'",
        "--params", "n_neighbors=10,location_column=location_geometry",
        "--output-dir", out_dir,
        raw_tweets,
    ]

    outcome = CliRunner().invoke(cli.train_tweets, cli_args)
    assert outcome.exit_code == 0

    # The filter-specific output directory must exist...
    neighbors_dir = f"{out_dir}/tweets/place_name=Los Angeles"
    assert os.path.exists(neighbors_dir)

    # ...together with both generated plots.
    for plot_name in ("nearest_neighbors.png", "scatter_plot.png"):
        assert os.path.exists(f"{neighbors_dir}/{plot_name}")

    common.delete_path(out_dir)
Beispiel #6
0
def test_transform_embeddings():
    """Run ``predict embeddings`` on the raw products through ``cli.wtsp``.

    Copies the ``products`` folder from the test assets into
    ``<work dir>/products/models/`` first, invokes the command, then checks
    the exit code and that both output files exist under
    ``<work dir>/embeddings``. The work directory is removed at the end of
    the test.
    """
    raw_products = common.get_full_path(tests_path, common.RAW_PRODUCTS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)

    # The test assumes a trained embeddings model is already in the working
    # directory, so the asset models are copied there before invoking.
    assets_dir = common.get_full_path(tests_path, common.ASSETS_PATH)
    copy_folder_recursively(f"{assets_dir}/products",
                            f"{work_dir}/products/models/")

    cli_args = ["--work-dir", work_dir, "predict", "embeddings", raw_products]
    outcome = CliRunner().invoke(cli.wtsp, cli_args)
    assert outcome.exit_code == 0

    # Both output files must have been written.
    for output_name in ("document_embeddings.npz", "category_encoder.save"):
        assert os.path.exists(f"{work_dir}/embeddings/{output_name}")

    common.delete_path(work_dir)
Beispiel #7
0
def test_transform_where_to_sell_products():
    """Run ``predict clusters`` on tweets filtered to Los Angeles.

    Copies the ``products`` folder from the test assets into
    ``<work dir>/products/models/`` first, invokes ``cli.wtsp``, then checks
    the exit code and that the CSV and HTML results exist under
    ``<work dir>/where_to_sell_in/place_name=Los Angeles``. The work
    directory is removed at the end of the test.
    """
    raw_tweets = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)

    # The test assumes a trained embeddings model is already in the working
    # directory, so the asset models are copied there before invoking.
    assets_dir = common.get_full_path(tests_path, common.ASSETS_PATH)
    copy_folder_recursively(f"{assets_dir}/products",
                            f"{work_dir}/products/models/")

    cluster_params = (
        "center='34;-118',eps=0.04,n_neighbors=2,"
        "location_column=location_geometry,min_score=0.1"
    )
    cli_args = [
        "--work-dir", work_dir,
        "predict", "clusters",
        "--filters", "place_name='Los Angeles'",
        "--params", cluster_params,
        raw_tweets,
    ]

    outcome = CliRunner().invoke(cli.wtsp, cli_args)
    assert outcome.exit_code == 0

    # The filter-specific output directory must exist...
    clusters_dir = f"{work_dir}/where_to_sell_in/place_name=Los Angeles"
    assert os.path.exists(clusters_dir)

    # ...together with both result files.
    for report_name in ("classified_clusters.csv", "classified_clusters.html"):
        assert os.path.exists(f"{clusters_dir}/{report_name}")

    common.delete_path(work_dir)