def accept_collection(db, col, col_temp, **kwargs):
    """Promote the temp collection to live, archiving the current live one.

    The live collection `col` is renamed to `<col>_<YYYY_MM_DD>` (date taken
    from the DAG execution date), then `col_temp` is renamed to take its place.
    """
    session = authenticate.prediction_login(**kwargs)
    run_date = parse_datetime(str(kwargs["execution_date"]))
    archive_name = f"{col}_{run_date.strftime('%Y_%m_%d')}"
    # Archive the current live collection, then promote the temp one.
    rename_collection(session, db, col, archive_name)
    rename_collection(session, db, col_temp, col)
def pull_offer_matrix(**kwargs):
    """Refresh the temp offer-matrix collection from the remote source."""
    session = authenticate.prediction_login(**kwargs)
    # Drop any stale temp copy before pulling a fresh one from the remote.
    delete_table(session,
                 config.offer_matrix_connection,
                 config.offer_matrix_db,
                 config.offer_matrix_col_temp)
    pull_table(session,
               config.offer_matrix_connection,
               config.offer_matrix_db,
               config.offer_matrix_col_temp,
               config.offer_matrix_connection_remote,
               config.offer_matrix_db_remote,
               config.offer_matrix_col_remote)
def add_date_enrich(**kwargs):
    """Run the date-enrichment on TREATMENT_DATE in medscheme.claims_2019_raw."""
    session = authenticate.prediction_login(**kwargs)
    result = data_munging_engine.enrich_date2(
        session,
        "medscheme",                               # database
        "claims_2019_raw",                         # collection
        "TREATMENT_DATE=yyyy-mm-dd hh:mm:ss.SSS",  # attribute + format spec
        "{}",                                      # find: match-all filter
    )
    print(result)
def get_db_list(**kwargs):
    """Print the list of document databases visible to this login."""
    session = authenticate.prediction_login(**kwargs)
    print(data_management_engine.get_document_db_list(session))
def get_prediction_project(**kwargs):
    """Fetch a prediction project, repoint its first deployment step, and save.

    NOTE(review): the replacement deployment id is hard-coded — confirm it
    should not instead come from the DAG run conf.
    """
    session = authenticate.prediction_login(**kwargs)
    proj_id = kwargs["dag_run"].conf.get("project_id")
    project = prediction_engine.get_prediction_project(session, proj_id)
    print(project)
    first_step = project["deployment_step"][0]
    first_step["deployment_id"] = "mood_recommender_dynamic_changed_automated"
    prediction_engine.save_prediction_project(session, project)
def pull_lookup_feature_store(**kwargs):
    """Refresh the temp lookup-feature-store collection from the remote source."""
    session = authenticate.prediction_login(**kwargs)
    # Drop any stale temp copy before pulling a fresh one from the remote.
    delete_table(session,
                 config.lookup_feature_store_connection,
                 config.lookup_feature_store_db,
                 config.lookup_feature_store_col_temp)
    pull_table(session,
               config.lookup_feature_store_connection,
               config.lookup_feature_store_db,
               config.lookup_feature_store_col_temp,
               config.lookup_feature_store_connection_remote,
               config.lookup_feature_store_db_remote,
               config.lookup_feature_store_col_remote)
def read_data(**kwargs):
    """Print the documents fetched from nlp_examples.nlp_example_text."""
    session = authenticate.prediction_login(**kwargs)
    docs = data_management_engine.get_data(
        session,
        "nlp_examples",      # database
        "nlp_example_text",  # collection
        "{}",                # field: match-all query
        0,                   # limit
        "{}",                # projections
        0,                   # skip
    )
    print(docs)
def process_ecogenetic_network(**kwargs):
    """Run the eco-genetic network algorithm with parameters from the DAG conf."""
    session = authenticate.prediction_login(**kwargs)
    params = kwargs["dag_run"].conf.get("enrichment")["__var"]["en"]["__var"]
    result = algorithm_client_pulse.process_ecogenetic_network(
        session,
        params["collection"],
        params["collectionOut"],
        params["database"],
        params["find"],
        params["graphMeta"],
        params["graphParam"],
    )
    print(result)
def personality_enrich(**kwargs):
    """Run the personality enrichment with parameters from the DAG conf."""
    session = authenticate.prediction_login(**kwargs)
    params = kwargs["dag_run"].conf.get("enrichment")["__var"]["enrich"]["__var"]
    result = data_munging_engine.personality_enrich(
        session,
        params["category"],
        params["collection"],
        params["collectionOut"],
        params["database"],
        params["find"],
        params["groupby"],
    )
    print(result)
def process_client_pulse_reliability(**kwargs):
    """Run the client-pulse reliability step with parameters from the DAG conf."""
    session = authenticate.prediction_login(**kwargs)
    params = kwargs["dag_run"].conf.get("enrichment")["__var"]["cpr"]["__var"]
    result = data_munging_engine.process_client_pulse_reliability(
        session,
        params["collection"],
        params["collectionOut"],
        params["database"],
        params["find"],
        params["groupby"],
        params["mongoAttribute"],
        params["type"],
    )
    print(result)
def generate_time_series_features(**kwargs):
    """Generate time-series features using parameters from the DAG conf."""
    session = authenticate.prediction_login(**kwargs)
    params = kwargs["dag_run"].conf.get("enrichment")["__var"]["tsf"]["__var"]
    result = data_munging_engine.generate_time_series_features(
        session,
        params["categoryfield"],
        params["collection"],
        params["database"],
        params["datefield"],
        params["featureset"],
        params["find"],
        params["groupby"],
        params["numfield"],
    )
    print(result)
def generate_forecast(**kwargs):
    """Run the forecast generation with parameters from the DAG conf."""
    session = authenticate.prediction_login(**kwargs)
    params = kwargs["dag_run"].conf.get("enrichment")["__var"]["fore"]["__var"]
    result = algorithm_client_pulse.generate_forecast(
        session,
        params["attribute"],
        params["collection"],
        params["collectionOut"],
        params["database"],
        params["dateattribute"],
        params["find"],
        params["historicsteps"],
        params["steps"],
    )
    print(result)
def pull_training_feature_store(**kwargs):
    """Refresh the temp training-feature-store collection from the remote source."""
    print(parse_datetime(str(kwargs["execution_date"])))
    session = authenticate.prediction_login(**kwargs)
    # Drop any stale temp copy before pulling a fresh one from the remote.
    delete_table(session,
                 config.training_feature_store_connection,
                 config.training_feature_store_db,
                 config.training_feature_store_col_temp)
    pull_table(session,
               config.training_feature_store_connection,
               config.training_feature_store_db,
               config.training_feature_store_col_temp,
               config.training_feature_store_connection_remote,
               config.training_feature_store_db_remote,
               config.training_feature_store_col_remote)
def prediction_enrich(**kwargs):
    """Run the fast prediction enrichment with parameters from the DAG conf."""
    session = authenticate.prediction_login(**kwargs)
    params = kwargs["dag_run"].conf.get("enrichment")["__var"]["pred"]["__var"]
    # Argument order follows prediction_enrich_fast's signature, not the
    # alphabetical order of the conf keys.
    result = data_munging_engine.prediction_enrich_fast(
        session,
        params["mongodb"],
        params["collection"],
        params["search"],
        params["sort"],
        params["predictor"],
        params["predictor_label"],
        params["attributes"],
        params["skip"],
        params["limit"],
    )
    print(result)
def get_multi_offer_csv(**kwargs):
    """Export the training-feature-store collection to a CSV file."""
    session = authenticate.prediction_login(**kwargs)
    csv_name = f"{config.training_feature_store_col}.csv"
    data_management_engine.export_documents(
        session,
        csv_name,
        "csv",
        config.training_feature_store_db,
        config.training_feature_store_col,
        "{}",  # field: match-all
        "",    # sort
        "{}",  # projection
        0,     # limit
    )
    # Presumably gives the export time to complete before downstream tasks
    # read the file — TODO confirm whether export_documents is asynchronous.
    time.sleep(5)
def process_apriori(**kwargs):
    """Run the apriori algorithm with parameters from the DAG conf."""
    session = authenticate.prediction_login(**kwargs)
    params = kwargs["dag_run"].conf.get("enrichment")["__var"]["apr"]["__var"]
    result = algorithm_client_pulse.process_apriori(
        session,
        params["colItem"],
        params["collection"],
        params["collectionOut"],
        params["custField"],
        params["database"],
        params["dbItem"],
        params["find"],
        params["itemField"],
        params["supportCount"],
    )
    print(result)
def get_distinct_offers(**kwargs):
    """Return the distinct offer ids from the remote training feature store.

    Bug fix: the SQL template had four ``{}`` placeholders
    (``select distinct {} from {}.{}.{}``) but only three arguments were
    supplied to ``str.format``, so the call raised ``IndexError`` before the
    query ever ran. The template now matches its arguments:
    ``select distinct <key> from <db>.<collection>``.
    """
    auth = authenticate.prediction_login(**kwargs)
    connection = "{}/master?user=admin".format(
        config.training_feature_store_connection_remote)
    sql = "select distinct {} from {}.{}".format(
        config.training_feature_store_key_id,
        config.training_feature_store_db_remote,
        config.training_feature_store_col_remote)
    print(sql)
    result = data_management_engine.get_presto_sql(auth, connection, sql)
    # NOTE(review): the result column is hard-coded to "offer_id"; presumably
    # config.training_feature_store_key_id is "offer_id" — confirm, or key on
    # the config value instead.
    offer_ids = [entry["offer_id"] for entry in result["data"]]
    print(offer_ids)
    return offer_ids
def list_collections(**kwargs):
    """Print the collections of the nlp_examples document database."""
    session = authenticate.prediction_login(**kwargs)
    collections = data_management_engine.get_document_db_collections(
        session, "nlp_examples")
    print(collections)
def preprocess_generate_time_series_features(**kwargs):
    """Log the 'tsf' preparation payload from the DAG run conf."""
    authenticate.prediction_login(**kwargs)
    print(kwargs["dag_run"].conf.get("prepare")["__var"]["tsf"])
def preprocess_personality_enrich(**kwargs):
    """Log the 'enrich' preparation payload from the DAG run conf."""
    authenticate.prediction_login(**kwargs)
    print(kwargs["dag_run"].conf.get("prepare")["__var"]["enrich"])
def push_offer_matrix(**kwargs):
    """Invoke pull_table on the offer matrix collection.

    NOTE(review): the source and destination are the identical local
    connection/db/collection triple; for a "push" one would expect the
    destination to be the remote config values — confirm intent.
    """
    session = authenticate.prediction_login(**kwargs)
    triple = (config.offer_matrix_connection,
              config.offer_matrix_db,
              config.offer_matrix_col)
    pull_table(session, *triple, *triple)
def train_multi_models(**kwargs):
    """Train one H2O AutoML model per distinct offer id and deploy the best.

    Steps: ingest the exported training-feature-store CSV as an H2O user
    frame, split it into training/validation frames, train an AutoML model
    per offer (each with its own response column), then save each offer's
    best model and copy its zip into /data/deployed.
    """
    auth = authenticate.prediction_login(**kwargs)
    data_type_changes = config.training_feature_store_data_type_changes
    offer_ids = get_distinct_offers(**kwargs)
    # Map offer id -> trained model id; filled in by the training loop below.
    offer_to_models = {}
    for offer in offer_ids:
        offer_to_models[offer] = None
    # Ingest csv as userframe
    file_name = "{}.csv".format(config.training_feature_store_col)
    featurestore_name = "fs_{}".format(config.training_feature_store_col)
    # NOTE(review): `username` is not defined anywhere in this function and
    # will raise NameError at runtime — presumably it should come from the
    # DAG conf or config; confirm and fix.
    hexframename, imp = multimodel_functions.save_file_as_userframe(
        auth, file_name, featurestore_name, username, data_type_changes)
    # Split frame into training and validation frames
    split_ratio = config.training_split_ratio
    inv_split_ratio = 1 - split_ratio
    # Ratio strings are embedded in the .hex frame names (e.g. 0.8 -> "0_8").
    str_sr = "_".join(str(split_ratio).split("."))
    inv_str_sr = "_".join(str(inv_split_ratio).split("."))
    worker_h2o.split_frame(auth, hexframename, split_ratio)
    # hexframename[:-4] strips the ".hex" suffix before appending the ratio.
    hexframename_train = "{}{}.hex".format(hexframename[:-4], inv_str_sr)
    hexframename_score = "{}{}.hex".format(hexframename[:-4], str_sr)
    for offer in offer_ids:
        # Train Model
        version = config.model_version
        # NOTE(review): model_id does not include the offer id, so every
        # iteration produces the same id and offer_to_models maps all offers
        # to it — later each offer's training appears to overwrite the
        # previous model. Confirm whether the offer should be part of
        # model_id.
        model_id = featurestore_name + version
        offer_to_models[str(offer)] = model_id
        model_purpose = config.model_purpose
        description = config.model_description
        # Model registration payload; "model_parms" is also passed verbatim
        # to worker_h2o.train_model below as the AutoML configuration.
        model_params = {
            "predict_id": featurestore_name,
            "description": description,
            "model_id": model_id,
            "model_type": "AUTOML",
            "frame_name": hexframename,
            "frame_name_desc": description,
            "model_purpose": model_purpose,
            "version": version,
            "model_parms": {
                "algo": "H2O-AUTOML",
                "training_frame": hexframename_train,
                "validation_frame": hexframename_score,
                # One response column per offer, e.g. a per-offer label.
                "response_column": config.response_column_template.format(str(offer)),
                "max_models": config.max_models,
                "stopping_tolerance": config.stopping_tolerance,
                "note_stop": "stopping_tolerance of 0.001 for 1m rows and 0.004 for 100k rows",
                "max_runtime_secs": config.max_runtime_secs,
                "stopping_rounds": config.stopping_rounds,
                "stopping_rounds_desc": "Early stopping based on convergence of stopping_metric. Stop if simple moving average of length k of the stopping_metric does not improve for k:=stopping_rounds scoring events (0 to disable).",
                "sort_metric": config.sort_metric,
                "stopping_metric": config.stopping_metric,
                "balance_classes": config.balance_classes,
                "balance_classes_desc": "Balance training data class counts via over/under-sampling (for imbalanced data).",
                "keep_cross_validation_predictions": config.keep_cross_validation_predictions,
                "keep_cross_validation_predictions_desc": "Whether to keep the predictions of the cross-validation predictions. This needs to be set to TRUE if running the same AutoML object for repeated runs because CV predictions are required to build additional Stacked Ensemble models in AutoML.",
                "keep_cross_validation_models": config.keep_cross_validation_models,
                "keep_cross_validation_models_desc": "Whether to keep the cross-validated models. Keeping cross-validation models may consume significantly more memory in the H2O cluster.",
                "keep_cross_validation_fold_assignment": config.keep_cross_validation_fold_assignment,
                "nfolds": config.nfolds,
                "note_folds": "nfolds=0 will disable the stacked ensemble creation process",
                "ignored_columns": config.ignored_columns,
                # NOTE(review): config attribute is spelled "exlude_algos"
                # (missing 'c') — confirm this matches the config module.
                "exclude_algos": config.exlude_algos,
                "hidden": ["1"]
            }
        }
        prediction_engine.save_prediction(auth, model_params)
        worker_h2o.train_model(auth, model_id, "automl",
                               json.dumps(model_params["model_parms"]))
    # Deployment pass: pick each offer's best AutoML model and copy its zip
    # into the deployed directory, named after the offer.
    for offer in offer_ids:
        model_data = worker_h2o.get_train_model(auth, offer_to_models[str(offer)], "AUTOML")
        df = multimodel_functions.list_automl_models(model_data)
        h2o_name = multimodel_functions.save_best_model(
            auth, df, rename="offer_{}".format(offer))
        from_path = "/data/models/{}.zip".format(h2o_name)
        to_path = "/data/deployed/{}.zip".format(offer)
        result = copy_file(auth, from_path, to_path, user="")
        print(result)
def preprocess_client_pulse_reliability(**kwargs):
    """Log the 'cpr' preparation payload from the DAG run conf."""
    authenticate.prediction_login(**kwargs)
    print(kwargs["dag_run"].conf.get("prepare")["__var"]["cpr"])
def verify_offer_matrix(**kwargs):
    """Verify the temp offer-matrix collection has the required columns."""
    session = authenticate.prediction_login(**kwargs)
    verify_table(
        session,
        config.offer_matrix_connection,
        config.offer_matrix_db,
        config.offer_matrix_col_temp,
        config.offer_matrix_must_exist_columns,
    )
def preprocess_ecogenetic_network(**kwargs):
    """Log the 'en' preparation payload from the DAG run conf."""
    authenticate.prediction_login(**kwargs)
    print(kwargs["dag_run"].conf.get("prepare")["__var"]["en"])
def combine_provider_data(**kwargs):
    """Build one combined provider collection per speciality.

    For each speciality: group the filtered claims into per-practice provider
    documents, merge in the normalized roll-up demographics, join 60 PCA
    factors for each of two check-fields across four roll-up variants
    (lines/pay/patients/claims), flag providers present in fraud_working,
    and export the resulting collection.
    """
    mongo_connect = kwargs["dag_run"].conf.get("mongo_connect")
    database = kwargs["dag_run"].conf.get("database")
    from_date = kwargs["dag_run"].conf.get("from_date")
    to_date = kwargs["dag_run"].conf.get("to_date")
    speciality_list = kwargs["dag_run"].conf.get("speciality_list")
    field_to_check_one = kwargs["dag_run"].conf.get("field_check_one")
    field_to_check_two = kwargs["dag_run"].conf.get("field_check_two")
    p_auth = authenticate.prediction_login(**kwargs)
    client = pymongo.MongoClient(mongo_connect)
    db = client[database]
    # NOTE(review): from_date/to_date are mutated inside this loop (dashes
    # stripped after the first pipeline), so the collection names computed at
    # the top of later iterations differ from the first iteration's — confirm
    # this is intended.
    for i in speciality_list:
        # Sanitize the speciality label into a collection-name-safe token.
        speciality_name = i[0].replace("-","").replace(" ","").replace(",","").replace("&","").lower()
        #Generate the name for the filtered claim collection
        claims_data_to_process = "claims_"
        claims_data_to_process = claims_data_to_process+speciality_name
        claims_data_to_process = claims_data_to_process+"_"+from_date+to_date
        claims_data_to_process = claims_data_to_process.replace("-","").replace(" ","").replace(",","").replace("&","").lower()
        # Name of the output provider collection for this speciality/range.
        provider_collection = "provider_"
        provider_collection = provider_collection+speciality_name
        provider_collection = provider_collection+"_"+from_date+to_date
        provider_collection = provider_collection.replace("-","").replace(" ","").replace(",","").replace("&","").lower()
        # Group claims by practice number into one provider document each,
        # keeping speciality/location arrays plus their first element and
        # counts, and summing the time fields; $out writes the result to
        # provider_collection.
        provider_details = [
            {
                "$group":{
                    "_id":"$PRACTICE_NUMBER"
                    ,"speciality_array":{"$addToSet":"$SPECIALITY_DESCRIPTION"}
                    ,"PROVINCE_STATE":{"$addToSet":"$PROVINCE_STATE"}
                    ,"city_array":{"$addToSet":"$peer_location"}
                    ,"min_time":{"$sum":"$min_time"}
                    ,"max_time":{"$sum":"$max_time"}
                }
            }
            ,{
                "$addFields":{
                    "PRACTICE_NUMBER":"$_id"
                    ,"count_speciality":{"$size":"$speciality_array"}
                    ,"SPECIALITY":{"$arrayElemAt":["$speciality_array",0]}
                    ,"count_city":{"$size":"$city_array"}
                    ,"peer_location":{"$arrayElemAt":["$city_array",0]}
                    ,"PROVINCE_STATE":{"$arrayElemAt":["$PROVINCE_STATE",0]}
                }
            }
            ,{"$unset":"speciality_array"}
            ,{"$unset":"city_array"}
            ,{"$out":provider_collection}
        ]
        db[claims_data_to_process].aggregate(provider_details)
        # Roll-up collection names use the undashed date strings.
        from_date = from_date.replace("-","")
        to_date = to_date.replace("-","")
        claims_rollup_all_norm = "rollup_all_claims_"+speciality_name+"_"+from_date+to_date+"_fill_norm"
        print("Here",claims_rollup_all_norm)
        # Left-join the normalized all-claims roll-up by practice number and
        # merge its fields into each provider document (provider fields win
        # on key collisions because $$ROOT is merged last).
        demographic_pipeline = [
            {
                "$lookup":{
                    "from":claims_rollup_all_norm
                    ,"localField":"PRACTICE_NUMBER"
                    ,"foreignField":"_id"
                    ,"as":"subs"
                }
            }
            ,{
                "$replaceWith":{
                    "$mergeObjects":[{"$arrayElemAt":["$subs",0]},"$$ROOT"]
                }
            }
            ,{"$unset":"subs"}
            ,{"$out":provider_collection}
        ]
        db[provider_collection].aggregate(demographic_pipeline)
        #Add PCA factors for codeDiagnosis
        # One label prefix per roll-up variant: lines/pay/patients/claims.
        labels = ['li','pa','pe','cl']
        tabs = [
            "rollup_lines_claims_"+speciality_name+"_"+from_date+to_date+"_"+field_to_check_one+"_fill_norm"
            ,"rollup_pay_claims_"+speciality_name+"_"+from_date+to_date+"_"+field_to_check_one+"_fill_norm"
            ,"rollup_patients_claims_"+speciality_name+"_"+from_date+to_date+"_"+field_to_check_one+"_fill_norm"
            ,"rollup_claims_claims_"+speciality_name+"_"+from_date+to_date+"_"+field_to_check_one+"_fill_norm"
        ]
        print(tabs)
        for i in range(len(labels)):
            # Join the variant's "<tab>_PCA" collection and copy its first 60
            # principal components onto fields like "lipo0T".."lipo59T".
            pca_join_pipeline = [
                {
                    "$lookup":{
                        "from":tabs[i]+"_PCA"
                        ,"localField":"PRACTICE_NUMBER"
                        ,"foreignField":"_id"
                        ,"as":"subs"
                    }
                }
            ]
            addDict = {"$addFields":{}}
            for j in range(60):
                addDict["$addFields"][labels[i]+"po"+str(j)+"T"] = {"$arrayElemAt":["$subs.PC"+str(j) ,0]}
            pca_join_pipeline.append(addDict)
            pca_join_pipeline.append({"$unset":"subs"})
            pca_join_pipeline.append({"$out":provider_collection})
            db[provider_collection].aggregate(pca_join_pipeline)
        #Add PCA factors for ageDiagnosis
        labels = ['li','pa','pe','cl']
        tabs = [
            "rollup_lines_claims_"+speciality_name+"_"+from_date+to_date+"_"+field_to_check_two+"_fill_norm"
            ,"rollup_pay_claims_"+speciality_name+"_"+from_date+to_date+"_"+field_to_check_two+"_fill_norm"
            ,"rollup_patients_claims_"+speciality_name+"_"+from_date+to_date+"_"+field_to_check_two+"_fill_norm"
            ,"rollup_claims_claims_"+speciality_name+"_"+from_date+to_date+"_"+field_to_check_two+"_fill_norm"
        ]
        print(tabs)
        for i in range(len(labels)):
            # Same PCA join as above but for the second check-field; fields
            # are named with "pt" instead of "po" (e.g. "lipt0T").
            pca_join_pipeline = [
                {
                    "$lookup":{
                        "from":tabs[i]+"_PCA"
                        ,"localField":"PRACTICE_NUMBER"
                        ,"foreignField":"_id"
                        ,"as":"subs"
                    }
                }
            ]
            addDict = {"$addFields":{}}
            for j in range(60):
                addDict["$addFields"][labels[i]+"pt"+str(j)+"T"] = {"$arrayElemAt":["$subs.PC"+str(j) ,0]}
            pca_join_pipeline.append(addDict)
            pca_join_pipeline.append({"$unset":"subs"})
            pca_join_pipeline.append({"$out":provider_collection})
            db[provider_collection].aggregate(pca_join_pipeline)
        #Add fraud indicators
        # provider_fraudulent = 1 if the practice appears in fraud_working,
        # else 0 (empty $lookup result array means no match).
        fraud_pipeline = [
            {
                "$lookup":{
                    "from":"fraud_working"
                    ,"localField":"PRACTICE_NUMBER"
                    ,"foreignField":"PRACTICE_NUMBER"
                    ,"as":"subs"
                }
            }
            ,{
                "$addFields":{
                    "provider_fraudulent":{"$cond":[{"$eq":["$subs",[]]},0,1]}
                }
            }
            ,{
                "$unset":"subs"
            }
            ,{"$out":provider_collection}
        ]
        db[provider_collection].aggregate(fraud_pipeline)
        ru.export_collection(provider_collection,db,p_auth)
def preprocess_apriori(**kwargs):
    """Log the 'apr' preparation payload from the DAG run conf."""
    authenticate.prediction_login(**kwargs)
    print(kwargs["dag_run"].conf.get("prepare")["__var"]["apr"])
def verify_lookup_feature_store(**kwargs):
    """Verify the temp lookup-feature-store collection has the required columns."""
    session = authenticate.prediction_login(**kwargs)
    verify_table(
        session,
        config.lookup_feature_store_connection,
        config.lookup_feature_store_db,
        config.lookup_feature_store_col_temp,
        config.lookup_feature_store_must_exist_columns,
    )
def preprocess_generate_forecast(**kwargs):
    """Log the 'fore' preparation payload from the DAG run conf."""
    authenticate.prediction_login(**kwargs)
    print(kwargs["dag_run"].conf.get("prepare")["__var"]["fore"])
def rollup_by_fc_and_pca(**kwargs):
    """Roll up claims per check-field and speciality, then run PCA.

    For each of the two check-fields and each speciality: index the common
    claims collection, build four roll-up collections
    (lines/pay/patients/claims) via the ru helpers, fill and normalize them,
    drop any previous PCA output collections, and run PCA on each normalized
    roll-up.
    """
    mongo_connect = kwargs["dag_run"].conf.get("mongo_connect")
    database = kwargs["dag_run"].conf.get("database")
    from_date = kwargs["dag_run"].conf.get("from_date")
    to_date = kwargs["dag_run"].conf.get("to_date")
    speciality_list = kwargs["dag_run"].conf.get("speciality_list")
    rollUpFieldsAbbreviation = kwargs["dag_run"].conf.get("rollUpFieldsAbbreviation")
    field_check_one = kwargs["dag_run"].conf.get("field_check_one")
    field_check_two = kwargs["dag_run"].conf.get("field_check_two")
    p_auth = authenticate.prediction_login(**kwargs)
    client = pymongo.MongoClient(mongo_connect)
    db = client[database]
    for field_to_check in [field_check_one,field_check_two]:
        for i in speciality_list:
            # Sanitize the speciality label into a collection-name-safe token.
            speciality_name = i[0].replace("-","").replace(" ","").replace(",","").replace("&","").lower()
            #Generate the name for the filtered claim collection
            claims_data_to_process = "claims_"
            claims_data_to_process = claims_data_to_process+speciality_name
            claims_data_to_process = claims_data_to_process+"_"+from_date+to_date
            claims_data_to_process = claims_data_to_process.replace("-","").replace(" ","").replace(",","").replace("&","").lower()
            # The "_common_<field>" collection is the roll-up input.
            claims_data_to_process_common_one = claims_data_to_process+"_common_"+field_to_check
            rollUpFields = [field_to_check]
            # Index the grouping field(s) before the roll-up aggregations.
            for k in rollUpFields:
                db[claims_data_to_process_common_one].create_index(k)
            # Output names for the four roll-up variants.
            claims_data_rollup_lines_one = "rollup_lines_"+claims_data_to_process+"_"+field_to_check
            claims_data_rollup_pay_one = "rollup_pay_"+claims_data_to_process+"_"+field_to_check
            claims_data_rollup_patients_one = "rollup_patients_"+claims_data_to_process+"_"+field_to_check
            claims_data_rollup_claims_one = "rollup_claims_"+claims_data_to_process+"_"+field_to_check
            ru.rollup_by_lines(claims_data_to_process_common_one,claims_data_rollup_lines_one,db,rollUpFields,rollUpFieldsAbbreviation)
            ru.rollup_by_payment(claims_data_to_process_common_one,claims_data_rollup_pay_one,db,rollUpFields,rollUpFieldsAbbreviation)
            ru.rollup_by_patients(claims_data_to_process_common_one,claims_data_rollup_patients_one,db,rollUpFields,rollUpFieldsAbbreviation)
            ru.rollup_by_claims(claims_data_to_process_common_one,claims_data_rollup_claims_one,db,rollUpFields,rollUpFieldsAbbreviation)
            # fill_collection/norm_collection return the name of the
            # collection they produced; the variables are rebound so later
            # steps operate on the filled/normalized versions.
            claims_data_rollup_lines_one = ru.fill_collection(claims_data_rollup_lines_one,db)
            claims_data_rollup_pay_one = ru.fill_collection(claims_data_rollup_pay_one,db)
            claims_data_rollup_patients_one = ru.fill_collection(claims_data_rollup_patients_one,db)
            claims_data_rollup_claims_one = ru.fill_collection(claims_data_rollup_claims_one,db)
            claims_data_rollup_lines_norm_one = ru.norm_collection(claims_data_rollup_lines_one,"liT",db)
            claims_data_rollup_pay_norm_one = ru.norm_collection(claims_data_rollup_pay_one,"paT",db)
            claims_data_rollup_patients_norm_one = ru.norm_collection(claims_data_rollup_patients_one,"peT",db)
            claims_data_rollup_claims_norm_one = ru.norm_collection(claims_data_rollup_claims_one,"clT",db)
            # Drop stale PCA outputs so run_pca writes fresh collections.
            pca_table = claims_data_rollup_lines_norm_one+"_PCA"
            db[pca_table].drop()
            pca_table = claims_data_rollup_pay_norm_one+"_PCA"
            db[pca_table].drop()
            pca_table = claims_data_rollup_patients_norm_one+"_PCA"
            db[pca_table].drop()
            pca_table = claims_data_rollup_claims_norm_one+"_PCA"
            db[pca_table].drop()
            contrib_table = "pca_norm_"+speciality_name.lower()+field_to_check+"_contrib_oot"
            db[contrib_table].drop()
            # PCA per variant, excluding id/total/count columns from the input.
            ru.run_pca(claims_data_rollup_lines_norm_one,"lines",["_id","liT","countProviders"],field_to_check,db,p_auth,speciality_name)
            ru.run_pca(claims_data_rollup_pay_norm_one,"pay",["_id","paT","countProviders"],field_to_check,db,p_auth,speciality_name)
            ru.run_pca(claims_data_rollup_patients_norm_one,"patients",["_id","peT","countProviders"],field_to_check,db,p_auth,speciality_name)
            ru.run_pca(claims_data_rollup_claims_norm_one,"claims",["_id","clT","countProviders"],field_to_check,db,p_auth,speciality_name)