def main():
    args = docopt(__doc__, version=__version__, options_first=True)
    setup_logging()
    if args["<command>"] == "node":
        src.cli.node.main()
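# A hypothetical module docstring that the docopt dispatch above could parse,
# assuming a single "node" sub-command; the program name and usage text are
# placeholders, not the project's actual __doc__.
"""mytool

Usage:
  mytool <command> [<args>...]
  mytool (-h | --help)
  mytool --version

Commands:
  node  Run the node sub-command.
"""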
def test_query_model():
    setup_logging(logging.DEBUG)
    logging.debug("\ntest_query_model")
    path = "integrationtests"
    test_model_path = f"{path}/models/trained.model"
    logging.debug(f"test_model_path: {test_model_path}")
    logging.debug("Creating instance of QueryModel")
    query_model = QueryModel(test_model_path, "legit")
    legit_examples = {"domain": ["richmondfc", "media-allrecipes", "reddit"]}
    legit_result = query_model.predict(legit_examples)
    logging.debug(f"legit_result: {legit_result}")
    try:
        assert legit_result
    except AssertionError:
        message = f"Didn't get expected prediction of all 'True' for: {legit_examples['domain']}"
        logging.exception(message)
        pytest.fail(message)
    dga_examples = {
        "domain": ["cgeoiyxoradbymu", "kbcejbpbduxyxrcqzxlxwdwclrqk", "dgrnntdplbrtg"]
    }
    dga_result = query_model.predict(dga_examples)
    logging.debug(f"dga_result: {dga_result}")
    try:
        assert not dga_result
    except AssertionError:
        message = f"Didn't get expected prediction of all 'False' for: {dga_examples['domain']}"
        logging.exception(message)
        pytest.fail(message)
    mixed_examples = {
        "domain": ["cgeoiyxoradbymu", "shipspotting", "rweulvobduttpzkbxsenfj"]
    }
    mixed_result = query_model.predict(mixed_examples)
    logging.debug(f"mixed_result: {mixed_result}")
    try:
        assert not mixed_result
    except AssertionError:
        message = f"Didn't get expected prediction of 'False' for: {mixed_examples['domain']}"
        logging.exception(message)
        pytest.fail(message)
def test_model(filename="models/trained.model"):
    """
    Load the model specified and pass in some tricky examples
    :param filename:
    :return:
    """
    setup_logging(logging.INFO)
    logging.debug("test_model")
    logging.debug(f"filename: {filename}")
    loaded_model = load_model(filename)
    blind_test = [
        "google",
        "asx",
        "netflix",
        "stan",
        "youtube",
        "facebook",
        "bing",
        "duckduckgo",
        "kjhkhssf",
        "scikit-learn",
        generate_domain(1983, 7, 1),
        "reddit",
        "longestdomains",
        "zwpejkljhdpoqk",
        "pklwllpppqzibn",
        "stackoverflow",
        generate_domain(2019, 12, 17),
        generate_domain(1900, 3, 22),
    ]
    # create some permutations of a date a long, long time ago in a galaxy far, far away
    test_gen = generate_domain(1983, 7, 1)
    for i in range(1, 6):
        val = test_gen[::i]
        if val:
            blind_test.append(test_gen[::i])
        for j in range(1, 6):
            val = test_gen[j:i]
            if val:
                blind_test.append(test_gen[j:i])
    logging.debug(blind_test)
    test_df = pd.DataFrame(blind_test, columns=["domain"])
    logging.debug("About to make predictions on test_df")
    results = loaded_model.predict(test_df.values)
    for i, result in enumerate(results):
        logging.info(f"{blind_test[i]}: {result}")
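# A minimal sketch of what `generate_domain` might look like, assuming it
# follows the well-known date-seeded example DGA; the project's real
# implementation may differ.
def generate_domain(year: int, month: int, day: int) -> str:
    """Deterministically derive a pseudo-random domain name from a date."""
    domain = ""
    for _ in range(16):
        # Mix the date components and emit one lowercase letter per round.
        year = ((year ^ 8 * year) >> 11) ^ ((year & 0xFFFFFFF0) << 17)
        month = ((month ^ 4 * month) >> 25) ^ 16 * (month & 0xFFFFFFF8)
        day = ((day ^ (day << 13)) >> 19) ^ ((day & 0xFFFFFFFE) << 12)
        domain += chr(((year ^ month ^ day) % 25) + 97)
    return domain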
def test_rescale():
    """
    Verify our generated features rescale as we expect
    :return:
    """
    setup_logging(logging.DEBUG)
    logging.debug("\ntest_rescale")
    test_data_path = "integrationtests/test_feature_generation_expected.csv"
    expected_output_path = "integrationtests/test_rescale_expected.csv"
    logging.debug(f"test_data_path: {test_data_path}")
    logging.debug(f"expected_output_path: {expected_output_path}")
    logging.debug("Loading test data")
    in_data_df = pd.read_csv(test_data_path)
    pipeline = Pipeline([rescale()])
    logging.debug("Applying rescale pipeline")
    pipeline_output = pipeline.fit_transform(in_data_df)
    try:
        assert isinstance(pipeline_output, np.ndarray)
    except AssertionError:
        message = f"`pipeline_output` was not of expected type `np.ndarray`, got: {type(pipeline_output)}"
        logging.exception(message)
        pytest.fail(message)
    with pytest.raises(AttributeError):
        pipeline["rescale"].get_feature_names()
    logging.debug("Converting pipeline output to DataFrame")
    result_df = pd.DataFrame(pipeline_output, columns=in_data_df.columns)
    logging.debug("Loading validation data")
    expected_df = pd.read_csv(expected_output_path)
    try:
        pd.testing.assert_frame_equal(result_df, expected_df, check_dtype=False)
    except AssertionError:
        message = "Rescaling Pipeline didn't produce expected results"
        logging.exception(message)
        pytest.fail(message)
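# A minimal sketch of what the `rescale()` step factory might return, assuming
# it wraps scikit-learn's MinMaxScaler (a transformer without
# `get_feature_names`, which matches the AttributeError the test expects);
# the project's real transformer may differ.
from sklearn.preprocessing import MinMaxScaler


def rescale():
    # sklearn Pipeline steps are (name, transformer) tuples, so the factory
    # can be dropped straight into Pipeline([rescale()]).
    return ("rescale", MinMaxScaler())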
def main_chart():
    setup_logging()
    data_count = 300
    data = get_history()[-data_count:]
    current = get_current_value()
    X = [d[0] for d in data]
    y = [d[1] for d in data]
    model = SVR_rbf(X, y)
    X.append(max(X) + 1)
    X = np.array(X).reshape(-1, 1)
    y_predicted_all = model.predict(X)
    y_predicted = y_predicted_all[-1]
    print("Current: {}".format(current))
    print("Predicted: {}".format(y_predicted))
    y.append(current)
    return render_template(
        'line_chart.html',
        labels=list(X),
        values=y_predicted_all,
        values_real=y,
        legend1="Estimated",
        legend2="Real",
        estimated=y_predicted,
    )
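# A minimal sketch of what `SVR_rbf` might do, assuming it wraps
# scikit-learn's SVR with an RBF kernel; the hyperparameter values here are
# placeholders, not the project's actual settings.
import numpy as np
from sklearn.svm import SVR


def SVR_rbf(X, y, C=1e3, gamma=0.1):
    # scikit-learn expects a 2-D feature matrix, so reshape the 1-D inputs
    # that main_chart() passes in before fitting.
    X = np.array(X).reshape(-1, 1)
    model = SVR(kernel="rbf", C=C, gamma=gamma)
    model.fit(X, y)
    return model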
def test_feature_generation():
    """
    Verify that we generated the expected features after preprocessing
    :return:
    """
    setup_logging(logging.DEBUG)
    logging.debug("test_feature_generation")
    test_data_path = "integrationtests/test_preprocessing_expected.csv"
    expected_output_path = "integrationtests/test_feature_generation_expected.csv"
    logging.debug(f"test_data_path: {test_data_path}")
    logging.debug(f"expected_output_path: {expected_output_path}")
    logging.debug("Loading test data")
    in_data_df = pd.read_csv(test_data_path)
    pipeline = Pipeline([
        feature_generation(["domain"])
    ])
    logging.debug("Applying feature_generation pipeline")
    pipeline_output = pipeline.transform(in_data_df[["domain"]])
    try:
        assert isinstance(pipeline_output, np.ndarray)
    except AssertionError:
        message = f"`pipeline_output` was not of expected type `np.ndarray`, got: {type(pipeline_output)}"
        logging.exception(message)
        pytest.fail(message)
    column_names = pipeline["feature_generation"].get_feature_names()
    logging.debug(f"column_names: {column_names}")
    expected_column_names = [
        'digit_ratio__domain_digit_ratio',
        'len__domain_len',
        'vowel_distance_mode_ratio__domain_vowel_distance_mode_ratio',
        'vowel_distance_std_ratio__domain_vowel_distance_std_ratio',
        'vowel_ratio__domain_vowel_ratio',
        'consonants_variety_ratio__domain_consonants_variety_ratio',
        'character_pairs__domain_character_pair_12',
        'character_pairs__domain_character_pair_36',
        'character_pairs__domain_character_pair_al',
        'character_pairs__domain_character_pair_an',
        'character_pairs__domain_character_pair_ar',
        'character_pairs__domain_character_pair_ct',
        'character_pairs__domain_character_pair_di',
        'character_pairs__domain_character_pair_en',
        'character_pairs__domain_character_pair_er',
        'character_pairs__domain_character_pair_es',
        'character_pairs__domain_character_pair_ff',
        'character_pairs__domain_character_pair_ga',
        'character_pairs__domain_character_pair_gg',
        'character_pairs__domain_character_pair_he',
        'character_pairs__domain_character_pair_in',
        'character_pairs__domain_character_pair_jj',
        'character_pairs__domain_character_pair_le',
        'character_pairs__domain_character_pair_li',
        'character_pairs__domain_character_pair_ma',
        'character_pairs__domain_character_pair_me',
        'character_pairs__domain_character_pair_ne',
        'character_pairs__domain_character_pair_on',
        'character_pairs__domain_character_pair_oo',
        'character_pairs__domain_character_pair_or',
        'character_pairs__domain_character_pair_pv',
        'character_pairs__domain_character_pair_qq',
        'character_pairs__domain_character_pair_ra',
        'character_pairs__domain_character_pair_re',
        'character_pairs__domain_character_pair_ss',
        'character_pairs__domain_character_pair_st',
        'character_pairs__domain_character_pair_te',
        'character_pairs__domain_character_pair_ti',
        'character_pairs__domain_character_pair_to',
        'character_pairs__domain_character_pair_ve',
        'character_pairs__domain_character_pair_vv',
        'character_pairs__domain_character_pair_wc',
        'character_pairs__domain_character_pair_we',
    ]
    logging.debug(f"expected_column_names: {expected_column_names}")
    try:
        assert column_names == expected_column_names
    except AssertionError:
        message = "`pipeline['feature_generation'].get_feature_names()` did not return expected values"
        logging.exception(message)
        pytest.fail(message)
    result_df = pd.DataFrame(
        pipeline_output,
        columns=column_names
    )
    logging.debug("Loading validation DataFrame")
    expected_df = pd.read_csv(expected_output_path)
    try:
        pd.testing.assert_frame_equal(
            result_df,
            expected_df,
            check_dtype=False
        )
    except AssertionError:
        message = "Feature Generation pipeline did not produce expected results"
        logging.exception(message)
        pytest.fail(message)
import argparse
from copy import copy

from sanic import Sanic, response

from src.app_utils import generate_predictions, load, preprocess, validate_request
from src.logging import setup_logging

setup_logging()
app = Sanic(__name__)
MODEL_TYPE = None


@app.route("/predict", methods=["POST"])
def predict(request):
    resp = {"model_type": MODEL_TYPE, "predictions": []}
    request_json = request.json
    validate_request(request_json)
    sentences = request_json["sentences"]
    preprocessed_seqs, seqs_len, seqs_oov_pctgs = preprocess(sentences)
    probabilities, sentiments = generate_predictions(preprocessed_seqs)
    for sentence, prob, sent, ps, seq_len, seq_oov_pctg in zip(
        sentences, probabilities, sentiments, preprocessed_seqs, seqs_len,
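# A hypothetical client call against the /predict route above, assuming the
# Sanic app is served on localhost:8000; the host, port, and example
# sentences are placeholders.
import requests

payload = {"sentences": ["the service was great", "terrible experience"]}
resp = requests.post("http://localhost:8000/predict", json=payload)
print(resp.json())  # expected shape: {"model_type": ..., "predictions": [...]}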
def test_preprocessing_pipeline():
    """
    Verify we can read in and preprocess the data as expected.
    :return:
    """
    setup_logging(logging.DEBUG)
    logging.debug("test_preprocessing_pipeline")
    test_data_path = "integrationtests/test_data.csv"
    expected_output_path = "integrationtests/test_preprocessing_expected.csv"
    logging.debug(f"test_data_path: {test_data_path}")
    logging.debug(f"expected_output_path: {expected_output_path}")
    logging.debug("Loading test data")
    df = load_data(test_data_path, ["domain", "class"])
    pipeline = Pipeline([
        preprocess(),
    ])
    logging.debug("Applying pipeline transformations")
    pipeline_output = pipeline.transform(df)
    column_names = pipeline["preprocess"].get_feature_names()
    logging.debug("Pipeline transformation complete")
    logging.debug(f"column_names: {column_names}")
    try:
        assert column_names == ['class', 'domain']
    except AssertionError:
        message = f"Didn't get the expected `get_feature_names` from pipeline, got {column_names}"
        logging.exception(message)
        pytest.fail(message)
    try:
        assert isinstance(pipeline_output, np.ndarray)
    except AssertionError:
        message = f"Didn't get expected type from pipeline, got {type(pipeline_output)}"
        logging.exception(message)
        pytest.fail(message)
    logging.debug(pipeline_output)
    logging.debug("Creating DataFrame from Pipeline output")
    result_df = pd.DataFrame(pipeline_output, columns=column_names)
    logging.debug("Applying `post_process_cleanup`")
    result_df = post_process_cleanup(result_df)
    logging.debug("Loading validation DataFrame")
    expected_df = pd.read_csv(expected_output_path)
    try:
        pd.testing.assert_frame_equal(result_df, expected_df, check_dtype=False)
    except AssertionError:
        message = "Data resulting from transformation did not match expected."
        logging.exception(message)
        pytest.fail(message)
    return pipeline
ap.add_argument("-t", "--test_size", required=False, help="Ratio of data to holdout for testing e.g. `0.3` for 30% - default `0.3`", default=0.3, type=float) ap.add_argument("-cv", "--cross_validation_folds", required=False, help="Number of cross validation folds e.g. 10 for 10 Folds - default `5`", default=5, type=int) ap.add_argument("-r", "--random_state", required=False, help="Specify the random state for reproducibility e.g. `42`", default=None, type=int) ap.add_argument("-v", "--verbose", required=False, help="Specify verbosity of training process e.g. `0` for no training updates", default=1, type=int) args = vars(ap.parse_args()) if not os.path.exists(args['output']): os.makedirs(args['output']) log_path = os.path.join(args['output'], "training.log") setup_logging(logging.DEBUG, file_name=log_path) try: logging.info(f"Logging training to: {log_path}") except FileNotFoundError: print(f"Unable to create log file at {log_path}, do you have the needed file system permissions?") sys.exit(1) train_model( args["path"], x_columns=args["input_columns"].split(","), y_column=args["target_column"], output_path=args["output"], test_size=args["test_size"], random_state=args["random_state"], cross_validation_folds=args["cross_validation_folds"],
def test_prepare_model_inputs():
    """
    Verify that our pipeline for creating model inputs works correctly.
    This pipeline is used both in training and prediction
    :return:
    """
    setup_logging(logging.DEBUG)
    logging.debug("test_prepare_model_inputs")
    test_data_path = "integrationtests/test_data.csv"
    logging.debug(f"test_data_path: {test_data_path}")
    logging.debug("Loading test data")
    in_data_df = pd.read_csv(test_data_path)
    print(in_data_df.head())
    logging.debug("Splitting into test and train sets by prepare_model_inputs")
    feature_names, X_train, X_test, y_train, y_test = prepare_model_inputs(
        in_data_df, ["domain"], "class", test_size=0.3, random_state_split=42)
    logging.debug(f"feature_names: {feature_names}")
    expected_feature_names = [
        'digit_ratio__domain_digit_ratio',
        'len__domain_len',
        'vowel_distance_mode_ratio__domain_vowel_distance_mode_ratio',
        'vowel_distance_std_ratio__domain_vowel_distance_std_ratio',
        'vowel_ratio__domain_vowel_ratio',
        'consonants_variety_ratio__domain_consonants_variety_ratio',
        'character_pairs__domain_character_pair_12',
        'character_pairs__domain_character_pair_36',
        'character_pairs__domain_character_pair_al',
        'character_pairs__domain_character_pair_an',
        'character_pairs__domain_character_pair_ar',
        'character_pairs__domain_character_pair_ct',
        'character_pairs__domain_character_pair_di',
        'character_pairs__domain_character_pair_en',
        'character_pairs__domain_character_pair_er',
        'character_pairs__domain_character_pair_es',
        'character_pairs__domain_character_pair_ff',
        'character_pairs__domain_character_pair_ga',
        'character_pairs__domain_character_pair_gg',
        'character_pairs__domain_character_pair_he',
        'character_pairs__domain_character_pair_in',
        'character_pairs__domain_character_pair_jj',
        'character_pairs__domain_character_pair_le',
        'character_pairs__domain_character_pair_li',
        'character_pairs__domain_character_pair_ma',
        'character_pairs__domain_character_pair_me',
        'character_pairs__domain_character_pair_ne',
        'character_pairs__domain_character_pair_on',
        'character_pairs__domain_character_pair_oo',
        'character_pairs__domain_character_pair_or',
        'character_pairs__domain_character_pair_pv',
        'character_pairs__domain_character_pair_qq',
        'character_pairs__domain_character_pair_ra',
        'character_pairs__domain_character_pair_re',
        'character_pairs__domain_character_pair_ss',
        'character_pairs__domain_character_pair_st',
        'character_pairs__domain_character_pair_te',
        'character_pairs__domain_character_pair_ti',
        'character_pairs__domain_character_pair_to',
        'character_pairs__domain_character_pair_ve',
        'character_pairs__domain_character_pair_vv',
        'character_pairs__domain_character_pair_wc',
        'character_pairs__domain_character_pair_we',
    ]
    logging.debug(f"expected_feature_names: {expected_feature_names}")
    try:
        assert feature_names == expected_feature_names
    except AssertionError:
        message = "`prepare_model_inputs` did not return expected feature names"
        logging.exception(message)
        pytest.fail(message)
    testing_column = None
    try:
        for x in [X_train, X_test, y_train, y_test]:
            testing_column = x
            assert isinstance(x, np.ndarray)
    except AssertionError:
        message = f"`prepare_model_inputs` didn't return expected types, got: {type(testing_column)}"
        logging.exception(message)
        pytest.fail(message)
    test_prepare_model_inputs_X_train = "integrationtests/test_prepare_model_inputs_X_train.csv"
    test_prepare_model_inputs_X_test = "integrationtests/test_prepare_model_inputs_X_test.csv"
    test_prepare_model_inputs_y_train = "integrationtests/test_prepare_model_inputs_y_train.csv"
    test_prepare_model_inputs_y_test = "integrationtests/test_prepare_model_inputs_y_test.csv"
    logging.debug("Loading validation split data sets")
    logging.debug(
        f"test_prepare_model_inputs_X_train: {test_prepare_model_inputs_X_train}"
    )
    logging.debug(
        f"test_prepare_model_inputs_X_test: {test_prepare_model_inputs_X_test}"
    )
    logging.debug(
        f"test_prepare_model_inputs_y_test: {test_prepare_model_inputs_y_test}"
    )
    expected_X_train = np.loadtxt(test_prepare_model_inputs_X_train, delimiter=',', dtype=np.float64)
    expected_X_test = np.loadtxt(test_prepare_model_inputs_X_test, delimiter=',', dtype=np.float64)
    expected_y_train = np.loadtxt(test_prepare_model_inputs_y_train, delimiter=',', dtype=str)
    expected_y_test = np.loadtxt(test_prepare_model_inputs_y_test, delimiter=',', dtype=str)
    try:
        assert np.all(X_train.ravel() == expected_X_train)
    except AssertionError:
        message = "Didn't produce expected `X_train`"
        logging.exception(message)
        pytest.fail(message)
    try:
        assert np.all(X_test.ravel() == expected_X_test)
    except AssertionError:
        message = "Didn't produce expected `X_test`"
        logging.exception(message)
        pytest.fail(message)
    try:
        assert np.all(y_train.ravel() == expected_y_train)
    except AssertionError:
        message = "Didn't produce expected `y_train`"
        logging.exception(message)
        pytest.fail(message)
    try:
        assert np.all(y_test.ravel() == expected_y_test)
    except AssertionError:
        message = "Didn't produce expected `y_test`"
        logging.exception(message)
        pytest.fail(message)
import argparse
import logging
import sys

from src.logging import setup_logging
from src.model import QueryModel

if __name__ == "__main__":
    setup_logging(logging.INFO)
    try:
        model_path = "models/trained.model"
        logging.info(f"Loading trained model from: {model_path}")
        query_model = QueryModel(model_path, "legit")
        logging.debug("Parsing arguments")
        ap = argparse.ArgumentParser()
        ap.add_argument("domain", nargs='*',
                        help="Domain(s) to be tested. Either a single domain or comma separated")
        ap.add_argument("-i", "--interactive", action='store_true',
                        help="Enter interactive mode to type in many domains")
        args = vars(ap.parse_args())
        logging.debug(f"Arguments parsed: {args}")
        given_domain = args["domain"]
        result = None
        if args["interactive"]: