Example #1
def main():
    args = docopt(__doc__, version=__version__, options_first=True)

    setup_logging()

    if args["<command>"] == "node":
        src.cli.node.main()
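The example above parses the module docstring with docopt, but that docstring is not shown. The following is a minimal, hypothetical usage string (the command and option names are assumptions, not taken from the original project) that `docopt(__doc__, version=__version__, options_first=True)` could parse:

"""Hypothetical module docstring consumed by docopt in the example above.

Usage:
    cli [--version] <command> [<args>...]

Options:
    --version    Show the version and exit.
"""
from docopt import docopt

__version__ = "0.1.0"  # placeholder version for this sketch

if __name__ == "__main__":
    # options_first=True stops option parsing at the first positional
    # argument, so sub-command arguments pass through untouched.
    args = docopt(__doc__, version=__version__, options_first=True)
    print(args["<command>"], args["<args>"])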
Example #2
def test_query_model():
    setup_logging(logging.DEBUG)
    logging.debug("\ntest_query_model")

    path = "integrationtests"
    test_model_path = f"{path}/models/trained.model"

    logging.debug(f"test_model_path: {test_model_path}")
    logging.debug("Creating instance of QueryModel")

    query_model = QueryModel(test_model_path, "legit")

    legit_examples = {"domain": ["richmondfc", "media-allrecipes", "reddit"]}

    legit_result = query_model.predict(legit_examples)

    logging.debug(f"legit_result: {legit_result}")

    try:
        assert legit_result
    except AssertionError:
        message = f"Didn't get expected prediction of all 'True' for: {legit_examples['domain']}"
        logging.exception(message)
        pytest.fail(message)

    dga_examples = {
        "domain":
        ["cgeoiyxoradbymu", "kbcejbpbduxyxrcqzxlxwdwclrqk", "dgrnntdplbrtg"]
    }

    dga_result = query_model.predict(dga_examples)

    logging.debug(f"dga_result: {dga_result}")

    try:
        assert not dga_result
    except AssertionError:
        message = f"Didn't get expected prediction of all 'False' for: {dga_examples['domain']}"
        logging.exception(message)
        pytest.fail(message)

    mixed_examples = {
        "domain":
        ["cgeoiyxoradbymu", "shipspotting", "rweulvobduttpzkbxsenfj"]
    }

    mixed_result = query_model.predict(mixed_examples)

    logging.debug(f"mixed_result: {mixed_result}")

    try:
        assert not mixed_result
    except AssertionError:
        message = f"Didn't get expected prediction of `False` for {mixed_examples['domain']}"
        logging.exception(message)
        pytest.fail(message)
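The try/assert/`logging.exception`/`pytest.fail` pattern above keeps a full traceback in the log while still failing the test. A more compact sketch of the same checks using `pytest.mark.parametrize`, assuming `QueryModel.predict` returns a truthy value only when every domain is classified as legitimate (as the test implies):

import pytest

from src.model import QueryModel


@pytest.mark.parametrize("domains, expected", [
    (["richmondfc", "media-allrecipes", "reddit"], True),                          # all legit
    (["cgeoiyxoradbymu", "kbcejbpbduxyxrcqzxlxwdwclrqk", "dgrnntdplbrtg"], False),  # all DGA
    (["cgeoiyxoradbymu", "shipspotting", "rweulvobduttpzkbxsenfj"], False),         # mixed
])
def test_query_model_parametrized(domains, expected):
    query_model = QueryModel("integrationtests/models/trained.model", "legit")
    result = bool(query_model.predict({"domain": domains}))
    assert result == expected, f"Expected {expected} for {domains}, got {result}"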
Example #3
def test_model(filename="models/trained.model"):
    """
    Load the model specified and pass in some tricky examples

    :param filename:
    :return:
    """
    setup_logging(logging.INFO)

    logging.debug("test_model")
    logging.debug(f"filename: {filename}")

    loaded_model = load_model(filename)

    blind_test = [
        "google",
        "asx",
        "netflix",
        "stan",
        "youtube",
        "facebook",
        "bing",
        "duckduckgo",
        "kjhkhssf",
        "scikit-learn",
        generate_domain(1983, 7, 1),
        "reddit",
        "longestdomains",
        "zwpejkljhdpoqk",
        "pklwllpppqzibn",
        "stackoverflow",
        generate_domain(2019, 12, 17),
        generate_domain(1900, 3, 22),
    ]

    # create some permutations of a date from a long, long time ago in a galaxy far, far away
    test_gen = generate_domain(1983, 7, 1)
    for i in range(1, 6):
        val = test_gen[::i]
        if val:
            blind_test.append(val)
        for j in range(1, 6):
            val = test_gen[j:i]
            if val:
                blind_test.append(val)

    logging.debug(blind_test)

    test_df = pd.DataFrame(blind_test, columns=["domain"])

    logging.debug("About to make predictions on test_df")

    results = loaded_model.predict(test_df.values)

    for i, result in enumerate(results):
        logging.info(f"{blind_test[i]}: {result}")
Example #4
def test_rescale():
    """
    Verify our generated features rescale as we expect

    :return:
    """
    setup_logging(logging.DEBUG)
    logging.debug("\ntest_rescale")

    test_data_path = "integrationtests/test_feature_generation_expected.csv"
    expected_output_path = "integrationtests/test_rescale_expected.csv"

    logging.debug(f"test_data_path: {test_data_path}")
    logging.debug(f"expected_output_path: {expected_output_path}")
    logging.debug("Loading test data")

    in_data_df = pd.read_csv(test_data_path)

    pipeline = Pipeline([rescale()])

    logging.debug("Applying rescale pipeline")

    pipeline_output = pipeline.fit_transform(in_data_df)

    try:
        assert (isinstance(pipeline_output, np.ndarray))
    except AssertionError:
        message = f"`pipeline_output` was not of expected type `np.ndarray`, got: {type(pipeline_output)}"
        logging.exception(message)
        pytest.fail(message)

    with pytest.raises(AttributeError):
        pipeline["rescale"].get_feature_names()

    logging.debug("Converting pipeline output to DataFrame")

    result_df = pd.DataFrame(pipeline_output, columns=in_data_df.columns)

    logging.debug("Loading validation data")

    expected_df = pd.read_csv(expected_output_path)

    try:
        pd.testing.assert_frame_equal(result_df,
                                      expected_df,
                                      check_dtype=False)
    except AssertionError:
        message = "Rescaling Pipeline didn't produce expected results"
        logging.exception(message)
        pytest.fail(message)
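All this test requires of `rescale()` is that it yields a pipeline step named "rescale" whose transform returns an ndarray and which has no `get_feature_names()`. A hypothetical implementation consistent with that behaviour (an assumption, not the project's actual code):

from sklearn.preprocessing import MinMaxScaler


def rescale():
    """Hypothetical helper: a named pipeline step wrapping a plain scaler.
    MinMaxScaler returns an ndarray from fit_transform and, lacking
    get_feature_names(), raises AttributeError as the test expects."""
    return ("rescale", MinMaxScaler())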
Example #5
def main_chart():
    setup_logging()
    data_count = 300
    data = get_history()[-data_count:]
    current = get_current_value()
    X = [d[0] for d in data]
    y = [d[1] for d in data]
    model = SVR_rbf(X, y)
    X.append(max(X) + 1)
    X = np.array(X).reshape(-1, 1)
    y_predicted_all = model.predict(X)
    y_predicted = y_predicted_all[-1]
    print("Current: {}".format(current))
    print("Predicted: {}".format(y_predicted))
    y.append(current)
    return render_template('line_chart.html',
                           labels=list(X),
                           values=y_predicted_all,
                           values_real=y,
                           legend1="Estimated",
                           legend2="Real",
                           estimated=y_predicted)
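`SVR_rbf`, `get_history`, and `get_current_value` are project helpers that are not shown. A plausible sketch of `SVR_rbf` that fits the call above (an assumption, not the project's actual code):

import numpy as np
from sklearn.svm import SVR


def SVR_rbf(X, y):
    """Hypothetical helper: fit an RBF-kernel support vector regressor on
    1-D inputs (e.g. time steps) and return the fitted model."""
    X = np.array(X).reshape(-1, 1)
    return SVR(kernel="rbf", C=1e3, gamma=0.1).fit(X, y)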
Example #6
def test_feature_generation():
    """
    Verify that we generated the expected features after preprocessing

    :return:
    """
    setup_logging(logging.DEBUG)
    logging.debug("test_feature_generation")

    test_data_path = "integrationtests/test_preprocessing_expected.csv"
    expected_output_path = "integrationtests/test_feature_generation_expected.csv"

    logging.debug(f"test_data_path: {test_data_path}")
    logging.debug(f"expected_output_path: {expected_output_path}")
    logging.debug("Loading test data")

    in_data_df = pd.read_csv(test_data_path)

    pipeline = Pipeline([
        feature_generation(["domain"])
    ])

    logging.debug("Applying feature_generation pipeline")

    pipeline_output = pipeline.transform(in_data_df[["domain"]])

    try:
        assert(isinstance(pipeline_output, np.ndarray))
    except AssertionError:
        message = f"`pipeline_output` was not of expected type `np.ndarray`, got: {type(pipeline_output)}"
        logging.exception(message)
        pytest.fail(message)

    column_names = pipeline["feature_generation"].get_feature_names()

    logging.debug(f"column_names: {column_names}")

    expected_column_names = [
        'digit_ratio__domain_digit_ratio',
        'len__domain_len',
        'vowel_distance_mode_ratio__domain_vowel_distance_mode_ratio',
        'vowel_distance_std_ratio__domain_vowel_distance_std_ratio',
        'vowel_ratio__domain_vowel_ratio',
        'consonants_variety_ratio__domain_consonants_variety_ratio',
        'character_pairs__domain_character_pair_12',
        'character_pairs__domain_character_pair_36',
        'character_pairs__domain_character_pair_al',
        'character_pairs__domain_character_pair_an',
        'character_pairs__domain_character_pair_ar',
        'character_pairs__domain_character_pair_ct',
        'character_pairs__domain_character_pair_di',
        'character_pairs__domain_character_pair_en',
        'character_pairs__domain_character_pair_er',
        'character_pairs__domain_character_pair_es',
        'character_pairs__domain_character_pair_ff',
        'character_pairs__domain_character_pair_ga',
        'character_pairs__domain_character_pair_gg',
        'character_pairs__domain_character_pair_he',
        'character_pairs__domain_character_pair_in',
        'character_pairs__domain_character_pair_jj',
        'character_pairs__domain_character_pair_le',
        'character_pairs__domain_character_pair_li',
        'character_pairs__domain_character_pair_ma',
        'character_pairs__domain_character_pair_me',
        'character_pairs__domain_character_pair_ne',
        'character_pairs__domain_character_pair_on',
        'character_pairs__domain_character_pair_oo',
        'character_pairs__domain_character_pair_or',
        'character_pairs__domain_character_pair_pv',
        'character_pairs__domain_character_pair_qq',
        'character_pairs__domain_character_pair_ra',
        'character_pairs__domain_character_pair_re',
        'character_pairs__domain_character_pair_ss',
        'character_pairs__domain_character_pair_st',
        'character_pairs__domain_character_pair_te',
        'character_pairs__domain_character_pair_ti',
        'character_pairs__domain_character_pair_to',
        'character_pairs__domain_character_pair_ve',
        'character_pairs__domain_character_pair_vv',
        'character_pairs__domain_character_pair_wc',
        'character_pairs__domain_character_pair_we',
    ]

    logging.debug(f"expected_column_names: {expected_column_names}")

    try:
        assert(column_names == expected_column_names)
    except AssertionError:
        message = "`pipeline['feature_generation'].get_feature_names()` did not return expected values"
        logging.exception(message)
        pytest.fail(message)

    result_df = pd.DataFrame(
        pipeline_output,
        columns=column_names
    )

    logging.debug("Loading validation DataFrame")

    expected_df = pd.read_csv(expected_output_path)

    try:
        pd.testing.assert_frame_equal(
            result_df, expected_df, check_dtype=False
        )
    except AssertionError:
        message = "Feature Generation pipeline did not produce expected results"
        logging.exception(message)
        pytest.fail(message)
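The `step__column_feature` names checked above follow scikit-learn's ColumnTransformer convention, where each output feature is prefixed with its transformer's name. A minimal illustration, using `get_feature_names_out()` (the newer spelling of the `get_feature_names()` call in the example):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer([("character_pairs", OneHotEncoder(), ["domain"])])
ct.fit(pd.DataFrame({"domain": ["ab", "cd"]}))
# Each name is '<transformer>__<column>_<value>', e.g.
# ['character_pairs__domain_ab', 'character_pairs__domain_cd']
print(ct.get_feature_names_out())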
Example #7
import argparse
from copy import copy

from sanic import Sanic, response
from src.app_utils import generate_predictions, load, preprocess, validate_request
from src.logging import setup_logging

setup_logging()

app = Sanic(__name__)

MODEL_TYPE = None


@app.route("/predict", methods=["POST"])
def predict(request):
    resp = {"model_type": MODEL_TYPE, "predictions": []}

    request_json = request.json
    validate_request(request_json)

    sentences = request_json["sentences"]
    preprocessed_seqs, seqs_len, seqs_oov_pctgs = preprocess(sentences)
    probabilities, sentiments = generate_predictions(preprocessed_seqs)

    for sentence, prob, sent, ps, seq_len, seq_oov_pctg in zip(
            sentences,
            probabilities,
            sentiments,
            preprocessed_seqs,
            seqs_len,
Example #8
def test_preprocessing_pipeline():
    """
    Verify we can read in and preprocess the data as expected.

    :return:
    """
    setup_logging(logging.DEBUG)

    logging.debug("test_preprocessing_pipeline")

    test_data_path = "integrationtests/test_data.csv"
    expected_output_path = "integrationtests/test_preprocessing_expected.csv"

    logging.debug(f"test_data_path: {test_data_path}")
    logging.debug(f"expected_output_path: {expected_output_path}")
    logging.debug("Loading test data")

    df = load_data(test_data_path, ["domain", "class"])

    pipeline = Pipeline([
        preprocess(),
    ])

    logging.debug("Applying pipeline transformations")

    pipeline_output = pipeline.transform(df)
    column_names = pipeline["preprocess"].get_feature_names()

    logging.debug("Pipeline transformation complete")
    logging.debug(f"column_names: {column_names}")

    try:
        assert (column_names == ['class', 'domain'])
    except AssertionError:
        message = f"Didn't get the expected `get_feature_names` from pipeline, got {column_names}"
        logging.exception(message)
        pytest.fail(message)

    try:
        assert (isinstance(pipeline_output, np.ndarray))
    except AssertionError:
        message = f"Didn't get expected type from pipeline, got {type(pipeline_output)}"
        logging.exception(message)
        pytest.fail(message)

    logging.debug(pipeline_output)
    logging.debug("Creating DataFrame from Pipeline output")

    result_df = pd.DataFrame(pipeline_output, columns=column_names)

    logging.debug("Applying `post_process_cleanup`")

    result_df = post_process_cleanup(result_df)

    logging.debug("Loading validation DataFrame")

    expected_df = pd.read_csv(expected_output_path)

    try:
        pd.testing.assert_frame_equal(result_df,
                                      expected_df,
                                      check_dtype=False)
    except AssertionError:
        message = "Data resulting from transformation did not match expected."
        logging.exception(message)
        pytest.fail(message)

    return pipeline
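`load_data` is a project helper whose implementation is not shown; a minimal stand-in consistent with how it is called above (an assumption, not the project's actual code):

import pandas as pd


def load_data(path, columns):
    """Hypothetical stand-in: read the CSV at `path` and keep only the
    requested columns, in the given order."""
    return pd.read_csv(path)[columns]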
Example #9
    ap.add_argument("-t", "--test_size", required=False,
                    help="Ratio of data to holdout for testing e.g. `0.3` for 30% - default `0.3`", default=0.3, type=float)
    ap.add_argument("-cv", "--cross_validation_folds", required=False,
                    help="Number of cross validation folds e.g. 10 for 10 Folds - default `5`", default=5, type=int)
    ap.add_argument("-r", "--random_state", required=False,
                    help="Specify the random state for reproducibility e.g. `42`", default=None, type=int)
    ap.add_argument("-v", "--verbose", required=False,
                    help="Specify verbosity of training process e.g. `0` for no training updates", default=1, type=int)

    args = vars(ap.parse_args())

    if not os.path.exists(args['output']):
        os.makedirs(args['output'])

    log_path = os.path.join(args['output'], "training.log")
    setup_logging(logging.DEBUG, file_name=log_path)

    try:
        logging.info(f"Logging training to: {log_path}")
    except FileNotFoundError:
        print(f"Unable to create log file at {log_path}, do you have the needed file system permissions?")
        sys.exit(1)

    train_model(
        args["path"],
        x_columns=args["input_columns"].split(","),
        y_column=args["target_column"],
        output_path=args["output"],
        test_size=args["test_size"],
        random_state=args["random_state"],
        cross_validation_folds=args["cross_validation_folds"],
Example #10
def test_prepare_model_inputs():
    """
    Verify that our pipeline for creating model inputs works correctly. This pipeline is used both
    in training and prediction

    :return:
    """
    setup_logging(logging.DEBUG)
    logging.debug("test_prepare_model_inputs")

    test_data_path = "integrationtests/test_data.csv"

    logging.debug(f"test_data_path: {test_data_path}")
    logging.debug("Loading test data")

    in_data_df = pd.read_csv(test_data_path)
    print(in_data_df.head())

    logging.debug("Splitting into test and train sets by prepare_model_inputs")

    feature_names, X_train, X_test, y_train, y_test = prepare_model_inputs(
        in_data_df, ["domain"], "class", test_size=0.3, random_state_split=42)

    logging.debug(f"feature_names: {feature_names}")

    expected_feature_names = [
        'digit_ratio__domain_digit_ratio', 'len__domain_len',
        'vowel_distance_mode_ratio__domain_vowel_distance_mode_ratio',
        'vowel_distance_std_ratio__domain_vowel_distance_std_ratio',
        'vowel_ratio__domain_vowel_ratio',
        'consonants_variety_ratio__domain_consonants_variety_ratio',
        'character_pairs__domain_character_pair_12',
        'character_pairs__domain_character_pair_36',
        'character_pairs__domain_character_pair_al',
        'character_pairs__domain_character_pair_an',
        'character_pairs__domain_character_pair_ar',
        'character_pairs__domain_character_pair_ct',
        'character_pairs__domain_character_pair_di',
        'character_pairs__domain_character_pair_en',
        'character_pairs__domain_character_pair_er',
        'character_pairs__domain_character_pair_es',
        'character_pairs__domain_character_pair_ff',
        'character_pairs__domain_character_pair_ga',
        'character_pairs__domain_character_pair_gg',
        'character_pairs__domain_character_pair_he',
        'character_pairs__domain_character_pair_in',
        'character_pairs__domain_character_pair_jj',
        'character_pairs__domain_character_pair_le',
        'character_pairs__domain_character_pair_li',
        'character_pairs__domain_character_pair_ma',
        'character_pairs__domain_character_pair_me',
        'character_pairs__domain_character_pair_ne',
        'character_pairs__domain_character_pair_on',
        'character_pairs__domain_character_pair_oo',
        'character_pairs__domain_character_pair_or',
        'character_pairs__domain_character_pair_pv',
        'character_pairs__domain_character_pair_qq',
        'character_pairs__domain_character_pair_ra',
        'character_pairs__domain_character_pair_re',
        'character_pairs__domain_character_pair_ss',
        'character_pairs__domain_character_pair_st',
        'character_pairs__domain_character_pair_te',
        'character_pairs__domain_character_pair_ti',
        'character_pairs__domain_character_pair_to',
        'character_pairs__domain_character_pair_ve',
        'character_pairs__domain_character_pair_vv',
        'character_pairs__domain_character_pair_wc',
        'character_pairs__domain_character_pair_we'
    ]

    logging.debug(f"expected_feature_names: {expected_feature_names}")

    try:
        assert (feature_names == expected_feature_names)
    except AssertionError:
        message = "`prepare_model_inputs` did not return expected feature names"
        logging.exception(message)
        pytest.fail(message)

    testing_column = None
    try:
        for x in [X_train, X_test, y_train, y_test]:
            testing_column = x
            assert (isinstance(x, np.ndarray))
    except AssertionError:
        message = f"`prepare_model_inputs` didn't return expected types, got: {type(testing_column)}"
        logging.exception(message)
        pytest.fail(message)

    test_prepare_model_inputs_X_train = "integrationtests/test_prepare_model_inputs_X_train.csv"
    test_prepare_model_inputs_X_test = "integrationtests/test_prepare_model_inputs_X_test.csv"
    test_prepare_model_inputs_y_train = "integrationtests/test_prepare_model_inputs_y_train.csv"
    test_prepare_model_inputs_y_test = "integrationtests/test_prepare_model_inputs_y_test.csv"

    logging.debug("Loading validation split data sets")
    logging.debug(
        f"test_prepare_model_inputs_X_train: {test_prepare_model_inputs_X_train}"
    )
    logging.debug(
        f"test_prepare_model_inputs_X_test: {test_prepare_model_inputs_X_test}"
    )
    logging.debug(
        f"test_prepare_model_inputs_y_train: {test_prepare_model_inputs_y_train}"
    )
    logging.debug(
        f"test_prepare_model_inputs_y_test: {test_prepare_model_inputs_y_test}"
    )

    expected_X_train = np.loadtxt(test_prepare_model_inputs_X_train,
                                  delimiter=',',
                                  dtype=np.float64)
    expected_X_test = np.loadtxt(test_prepare_model_inputs_X_test,
                                 delimiter=',',
                                 dtype=np.float64)
    expected_y_train = np.loadtxt(test_prepare_model_inputs_y_train,
                                  delimiter=',',
                                  dtype=str)
    expected_y_test = np.loadtxt(test_prepare_model_inputs_y_test,
                                 delimiter=',',
                                 dtype=str)

    try:
        assert (np.all(X_train.ravel() == expected_X_train))
    except AssertionError:
        message = "Didn't produce expected `X_train`"
        logging.exception(message)
        pytest.fail(message)

    try:
        assert (np.all(X_test.ravel() == expected_X_test))
    except AssertionError:
        message = "Didn't produce expected `X_test`"
        logging.exception(message)
        pytest.fail(message)

    try:
        assert (np.all(y_train.ravel() == expected_y_train))
    except AssertionError:
        message = "Didn't produce expected `y_train`"
        logging.exception(message)
        pytest.fail(message)

    try:
        assert (np.all(y_test.ravel() == expected_y_test))
    except AssertionError:
        message = "Didn't produce expected `y_test`"
        logging.exception(message)
        pytest.fail(message)
Example #11
import argparse
import sys
from src.model import QueryModel
import logging
from src.logging import setup_logging

if __name__ == "__main__":
    setup_logging(logging.INFO)

    try:
        model_path = "models/trained.model"

        logging.info(f"Loading trained model from: {model_path}")

        query_model = QueryModel(model_path, "legit")

        logging.debug("Parsing arguments")

        ap = argparse.ArgumentParser()
        ap.add_argument("domain", nargs = '*', help="Domain(s) to be test. Either single domain or comma separated")
        ap.add_argument("-i", "--interactive", action='store_true',
                        help="Enter interactive mode to type in many domains")

        args = vars(ap.parse_args())

        logging.debug(f"Arguments parsed: {args}")

        given_domain = args["domain"]
        result = None

        if args["interactive"]: