def train_model(
    prob_thresh=load_config()["machine_learning"]["prboability_thresholds"]["general"],
    use_smote=load_config()["machine_learning"]["use_smote"],
):
    """Train a scikit-learn RandomForestClassifier and save it with `save_model`.

    Reads the training data from `./train_set.csv` (typically produced by
    function ml.build_train_set), evaluates the classifier with 3-fold
    cross-validation, then refits it on the whole data set and saves it.

    Parameters:
     - `prob_thresh` (float): probability threshold which the classifier will
     use to determine whether or not there is a match. Scikit-learn's default
     threshold is 0.5 but this is being disregarded. Note that this threshold
     doesn't impact the actual training of the model - only its custom
     predictions and performance metrics. Default loads from config file.
     - `use_smote` (boolean): whether or not the SMOTE algorithm should be
     applied to the labeled data before training the model. Default loads
     from config file.

    Returns:
     - rc_cum (list of float): recall of each cross-validation fold
     - pr_cum (list of float): precision of each cross-validation fold
     - f1_cum (list of float): f1 score of each cross-validation fold
    """
    logger.info("training random forest classifier")
    df = pd.read_csv("./train_set.csv")
    exclude_features = load_config()["machine_learning"]["exclude_features"]
    X = df[
        [x for x in df.columns if x.endswith("_score") and x not in exclude_features]
    ]
    save_feature_list(X.columns)
    feature_list = list(X.columns)
    update_results({'features': feature_list})
    y = df[["ground_truth"]]
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    sm = SMOTE(random_state=42, sampling_strategy=1)
    # `fit_sample` was renamed to `fit_resample` in imbalanced-learn and later
    # removed; prefer the new name and fall back for old installations.
    resample = getattr(sm, "fit_resample", None) or sm.fit_sample
    kf = KFold(n_splits=3, shuffle=True, random_state=41)
    rc_cum, pr_cum, f1_cum = [], [], []
    for split_no, (train_index, test_index) in enumerate(kf.split(X), start=1):
        logger.info(f"K-Split #{split_no}...")
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]
        if use_smote:
            X_train_final, y_train_final = resample(X_train, y_train)
        else:
            X_train_final, y_train_final = X_train, y_train
        clf.fit(X_train_final, y_train_final)
        # Predict once and reuse the probabilities for both the thresholded
        # prediction and the results table.
        prob = clf.predict_proba(X_test)
        pred = [1 if x >= prob_thresh else 0 for x in prob[:, 1]]
        # `y` is a one-column DataFrame, so its values are shaped (n, 1);
        # flatten them before building the results table.
        y_test = y_test.reshape(y_test.shape[0])
        results = pd.DataFrame(
            {
                "truth": y_test,
                # presumably the last feature column is the total score - confirm
                "total_score": X_test[:, -1],
                "prob": prob[:, 1],
                "pred": pred,
            }
        )
        true_positives = len(results[(results.truth == 1) & (results.pred == 1)])
        actual_positives = len(results[results.truth == 1])
        predicted_positives = len(results[results.pred == 1])
        # A fold without actual or predicted positives scores 0.0 instead of
        # raising ZeroDivisionError.
        rc = true_positives / actual_positives if actual_positives else 0.0
        pr = true_positives / predicted_positives if predicted_positives else 0.0
        f1 = f1_score(y_test, pred)
        logger.debug(
            f"number of truthes to learn from: {len([x for x in y_train if x==1])} out of {len(y_train)}"
        )
        logger.debug(f"number of tests: {actual_positives}")
        logger.debug(f"recall: {round(rc, 3)}")
        logger.debug(f"precision: {round(pr, 3)}")
        logger.debug(f"f1 score: {round(f1, 3)}")
        rc_cum.append(rc)
        pr_cum.append(pr)
        f1_cum.append(f1)
    logger.debug(f"average recall: {round(sum(rc_cum)/len(rc_cum), 3)}")
    logger.debug(f"average precision: {round(sum(pr_cum)/len(pr_cum), 3)}")
    logger.debug(f"avergae f1 score: {round(sum(f1_cum)/len(f1_cum), 3)}")
    # Final model: refit on all of the labeled data (resampled if requested).
    if use_smote:
        X_final, y_final = resample(X, y)
    else:
        X_final, y_final = X, y
    clf.fit(X_final, y_final)
    feat_imp = pd.DataFrame(
        {"feat": X.columns, "imp": clf.feature_importances_}
    ).sort_values("imp", ascending=False)
    logger.info("top features are:")
    for _, row in feat_imp.iterrows():
        logger.info(f"\t{row['feat']:<25}\t{round(row['imp'] * 100, 1)}\t")
    save_model(clf)
    return rc_cum, pr_cum, f1_cum
def _ratio_or_zero(numerator, denominator):
    """Return `numerator / denominator`, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _confusion_counts(df, pred_col):
    """Return (tp, fp, tn, fn) of column `pred_col` against `df.ground_truth`."""
    predicted_yes = df[pred_col] == 1
    predicted_no = df[pred_col] == 0
    actual_yes = df.ground_truth == 1
    actual_no = df.ground_truth == 0
    return (
        int((predicted_yes & actual_yes).sum()),
        int((predicted_yes & actual_no).sum()),
        int((predicted_no & actual_no).sum()),
        int((predicted_no & actual_yes).sum()),
    )


def _score_match_results(results, validation_projects):
    """Join match results with the ground truth, log and return the metrics.

    Returns a tuple `(analysis_df, fp, recall, precision, min_prob, avg_prob)`
    where `min_prob` and `avg_prob` are the minimum and the mean predicted
    probability over the true matches.
    """
    analysis_df = pd.merge(
        results[['job_number', 'cert_id', 'pred_prob', 'pred_match', 'total_score']],
        validation_projects[['job_number', 'cert_id', 'ground_truth']],
        how='left',
        on=['job_number', 'cert_id'],
    )
    # Pairs missing from the validation set come out of the left join as NaN;
    # treat anything that is not exactly 1 as "not a match".
    analysis_df['ground_truth'] = (analysis_df.ground_truth == 1.0).astype(int)
    tp, fp, tn, fn = _confusion_counts(analysis_df, 'pred_match')
    if fn:
        missed = analysis_df[
            (analysis_df.pred_match == 0) & (analysis_df.ground_truth == 1)
        ]
        logger.warning(
            f"match for project #{list(missed['job_number'])} was not detected."
        )
    logger.info(f"true postives: {tp}")
    logger.info(f"false postives: {fp}")
    logger.info(f"true negatives: {tn}")
    logger.info(f"false negatives: {fn}")
    if tp + fn == 0:
        raise ValueError(
            "validation data contains no ground-truth matches, "
            "so recall cannot be computed"
        )
    recall = tp / (tp + fn)
    # A model that predicts no match at all has no precision to speak of;
    # report 0.0 rather than dividing by zero.
    precision = _ratio_or_zero(tp, tp + fp)
    logger.info(f"recall: {recall}")
    logger.info(f"precision: {precision}")
    true_match_probs = analysis_df[analysis_df.ground_truth == 1.0]['pred_prob']
    min_prob = min(true_match_probs)
    logger.info(f"minimum probability threshhold to acheive 100% recall: {min_prob}")
    analysis_df['adj_pred_match'] = analysis_df.pred_prob.apply(
        lambda x: x >= min_prob
    )
    avg_prob = mean(true_match_probs)
    logger.debug(analysis_df[analysis_df.adj_pred_match])
    return analysis_df, fp, recall, precision, min_prob, avg_prob


def _plot_precision_spread(analysis_df, prob_thresh, save):
    """Draw stacked bars of true/false matches per 0.1-wide probability bin."""
    signal_and_noise = analysis_df[analysis_df.pred_prob > -0.1]
    is_true_match = signal_and_noise.ground_truth == 1.0
    signal = signal_and_noise[is_true_match]['pred_prob']
    noise = signal_and_noise[~is_true_match]['pred_prob']
    interval = 0.1
    bottom_ranges = np.arange(0, 1, interval)
    ground_truths, false_matches = [], []
    for bottom_range in bottom_ranges:
        lower = round(bottom_range, 1)
        upper = round((lower + interval), 1)
        if lower == 0.0:
            # capture all the false matches scored at exactly 0
            lower = -0.1
        ground_truths.append(int(((signal > lower) & (signal <= upper)).sum()))
        false_matches.append(int(((noise > lower) & (noise <= upper)).sum()))
    df = pd.DataFrame({
        'probability score': bottom_ranges,
        'true match': ground_truths,
        'false match': false_matches,
    })
    p1 = plt.bar(
        df['probability score'], df['true match'],
        width=0.07, align='edge', color=(112/255, 94/255, 204/255, 1),
    )
    p2 = plt.bar(
        df['probability score'], df['false match'],
        width=0.07, align='edge', bottom=df['true match'],
        color=(112/255, 94/255, 134/255, 1),
    )
    t = plt.axvline(x=prob_thresh, color=(70/255, 70/255, 80/255, 1), linestyle='--')
    plt.ylabel('# of matches')
    plt.xlabel('predicted probability of match')
    # Use the axes the bars were drawn on; `plt.axes()` would add a new,
    # empty Axes on top of the plot in current matplotlib versions.
    ax = plt.gca()
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xticks([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
    plt.title('Precision Spread on Validation Data\n')
    legend = plt.legend(
        (p1[0], p2[0], t),
        ('true match', 'false match', 'decision threshold'),
        frameon=1,
    )
    legend.get_frame().set_alpha(0)
    if save:
        # will also display inside jupyter notebook regardless (if %matplotlib inline)
        plt.savefig('static/precision_spread.png', transparent=True, dpi=300)


def validate_model(
    prob_thresh=load_config()["machine_learning"]["prboability_thresholds"]["general"],
    test=False,
):
    """Compares new model with status quo production model and compiles/reports
    the results. Based on results, will either replace model and archive old
    one or just maintain status quo.

    The new model is adopted only when it reaches 100% recall on the
    validation matches; otherwise its files are moved to `model_archive/`.
    When no status quo model files exist, the new model is adopted by default
    (or, in test mode, used as its own baseline).

    Parameters:
     - `prob_thresh` (float): probability threshold which the classifier will
     use to determine whether or not there is a match.
     - `test` (bool): whether in testing or not, will determine flow of
     operations: no plot is saved and no model files are moved in test mode.

    Raises:
     - ValueError: if the validation data contains no ground-truth matches.
    """
    match_query = """
        SELECT
            company_projects.job_number,
            company_projects.city,
            company_projects.address,
            company_projects.title,
            company_projects.owner,
            company_projects.contractor,
            company_projects.engineer,
            company_projects.address_lat,
            company_projects.address_lng,
            company_projects.receiver_emails_dump,
            web_certificates.url_key,
            web_certificates.cert_id,
            attempted_matches.ground_truth,
            attempted_matches.multi_phase,
            web_certificates.pub_date,
            web_certificates.source,
            CONCAT(base_urls.base_url, web_certificates.url_key) AS link
        FROM
            web_certificates
        LEFT JOIN
            attempted_matches
        ON
            web_certificates.cert_id = attempted_matches.cert_id
        LEFT JOIN
            company_projects
        ON
            attempted_matches.project_id = company_projects.project_id
        LEFT JOIN
            base_urls
        ON
            base_urls.source = web_certificates.source
        WHERE
            company_projects.closed=1
        AND
            attempted_matches.ground_truth=1
        AND
            attempted_matches.multi_phase=0
        AND
            attempted_matches.validate=1
    """
    corr_web_certs_query = """
        SELECT
            web_certificates.*
        FROM
            web_certificates
        LEFT JOIN
            attempted_matches
        ON
            web_certificates.cert_id = attempted_matches.cert_id
        LEFT JOIN
            company_projects
        ON
            attempted_matches.project_id = company_projects.project_id
        LEFT JOIN
            base_urls
        ON
            base_urls.source = web_certificates.source
        WHERE
            company_projects.closed=1
        AND
            attempted_matches.ground_truth=1
        AND
            attempted_matches.multi_phase=0
        AND
            attempted_matches.validate=1
    """
    with create_connection() as conn:
        validate_company_projects = pd.read_sql(match_query, conn)
        validate_web_df = pd.read_sql(corr_web_certs_query, conn)

    # --- score the newly trained model ---
    new_results = match(
        version="new",
        company_projects=validate_company_projects,
        df_web=validate_web_df,
        test=True,
        prob_thresh=prob_thresh,
    )
    analysis_df, fp, recall, precision, min_prob, avg_prob = _score_match_results(
        new_results, validate_company_projects
    )
    _plot_precision_spread(analysis_df, prob_thresh, save=not test)

    if recall < 1.0:
        # Report what lowering the threshold to `min_prob` would have cost.
        adj_tp, adj_fp, adj_tn, adj_fn = _confusion_counts(
            analysis_df, 'adj_pred_match'
        )
        logger.info(f"adjusted true postives: {adj_tp}")
        logger.info(f"adjusted false postives: {adj_fp}")
        logger.info(f"adjusted true negatives: {adj_tn}")
        logger.info(f"adjusted false negatives: {adj_fn}")
        adj_recall = _ratio_or_zero(adj_tp, adj_tp + adj_fn)
        adj_precision = _ratio_or_zero(adj_tp, adj_tp + adj_fp)
        logger.info(f"adjusted recall: {adj_recall}")
        logger.info(f"adjusted precision: {adj_precision}")
        # `adj_precision` is a fraction, so format it as a percentage.
        logger.info(
            f"Would have had {adj_fp} false positives ({adj_precision:.1%} precision) "
            f"if threshold was adjusted down to acheive 100%"
        )

    # --- score the status quo model on the same data ---
    try:
        sq_results = match(
            version="status_quo",
            company_projects=validate_company_projects,
            df_web=validate_web_df,
            test=True,
            prob_thresh=prob_thresh,
        )
    except FileNotFoundError:
        logger.info(
            "could not find any status quo models to use for baseline validation."
        )
        if not test:
            logger.info("adopting new model by default and skipping rest of validation")
            for filename in ["rf_model.pkl", "rf_features.pkl"]:
                os.rename("new_" + filename, filename)
            return  # exit function because there is no baseline to validate against
        logger.info(
            "will keep testing validation using new model as baseline. Just for testing purposes."
        )
        sq_results = match(
            version="new",
            company_projects=validate_company_projects,
            df_web=validate_web_df,
            test=True,
            prob_thresh=prob_thresh,
        )
    _, sq_fp, _, _, sq_min_prob, sq_avg_prob = _score_match_results(
        sq_results, validate_company_projects
    )

    update_results({
        "probability threshold": prob_thresh,
        "SMOTE": load_config()["machine_learning"]["use_smote"],
        "100% recall acheived": recall >= 1.0,
        'minimum probability required for status quo model': sq_min_prob,
        'minimum probability required for new model': min_prob,
        'average probability required for status quo model': sq_avg_prob,
        'average probability required for new model': avg_prob,
        'false positives with status quo': sq_fp,
        'false positives with new': fp,
        'precision': precision,
    })

    archive_stamp = datetime.datetime.now().date()
    if recall < 1.0:
        logger.warning(
            "100% recall not acheived with new model - archiving it "
            "and maintaining status quo!"
        )
        if test:
            logger.info("skipping files transfers because running in test mode")
        else:
            os.makedirs("model_archive", exist_ok=True)
            for artifact in ["model", "features"]:
                os.rename(
                    f"new_rf_{artifact}.pkl",
                    f"model_archive/rf_new_{artifact}-{archive_stamp}.pkl",
                )
        return

    logger.info("100% recall acheived! Adopting new model and archiving old one.")
    if test:
        logger.info("skipping files transfers because running in test mode")
    else:
        os.makedirs("model_archive", exist_ok=True)
        for artifact in ["model", "features"]:
            os.rename(
                f"rf_{artifact}.pkl",
                f"model_archive/rf_{artifact}-{archive_stamp}.pkl",
            )
            os.rename(f"new_rf_{artifact}.pkl", f"rf_{artifact}.pkl")
    for metric, new, sq in zip(
        ("false positive(s)", "max threshold", "average prediction probability"),
        (fp, min_prob, avg_prob),
        (sq_fp, sq_min_prob, sq_avg_prob),
    ):
        # Fewer false positives is better; for the probabilities, higher is better.
        good_outcome = new <= sq if metric == "false positive(s)" else new >= sq
        if good_outcome:
            logger.info(
                f"New model produced {new} {metric}, "
                f"which is better or equal to status quo of {sq}."
            )
        else:
            logger.warning(
                f"Might want to investigate new model - new model produced "
                f"{new} {metric}, compared to status quo of {sq}"
            )
def _extract_iso_date(value, param_name):
    """Return the first `yyyy-mm-dd` substring of `value` or raise ValueError."""
    found = re.search(r"\d{4}-\d{2}-\d{2}", value)
    if found is None:
        raise ValueError(
            f"`{param_name}` parameter should be in the format yyyy-mm-dd if not a key_word"
        )
    return found.group(0)


def match(
    company_projects=False,
    df_web=False,
    test=False,
    since="today",
    until="now",
    prob_thresh=load_config()["machine_learning"]["prboability_thresholds"]["general"],
    multi_phase_proned_thresh=load_config()["machine_learning"][
        "prboability_thresholds"
    ]["multi_phase"],
    version="status_quo",
):
    """Combines company projects and web CSP certificates in all-to-all join,
    wrangles the rows, scores the rows as potential matches, runs each row
    through Random Forest model, and communicates results via log and, if
    deemed successful, email as well.

    TODO: THIS FUNCTION IS TOO LONG AND DOES WAY TOO MANY THINGS. MUST BE
    REFACTORED ASAP.

    Parameters:
     - `company_projects` (pd.DataFrame): specify dataframe of company projects
     to match instead of default, which is to retrieve all open projects from
     `company_projects` table in database.
     - `df_web` (pd.DataFrame): specify dataframe of CSP certificates to match
     instead of default, which is to retrieve all certificates from
     `web_certificates` table in database according to specified timeframe.
     - `test` (bool): whether in testing or not, will determine flow of
     operations and mute emails appropriately.
     - `since` (str of format `"yyyy-mm-dd"`): used in conjunction with `until`
     to specify timeframe to query database for `df_web`. Only used if `df_web`
     not specified. Special strings `"week_ago"`, `"day_ago"`, or `"today"` can
     be used instead. Range is inclusive of date specified.
     - `until` (str of format `"yyyy-mm-dd"`): used in conjunction with `since`
     to specify timeframe to query database for `df_web`. Only used if `df_web`
     not specified. Special string `"now"` can be used instead. Range is
     inclusive of date specified.
     - `prob_thresh` (float): probability threshold for decision boundary.
     - `multi_phase_proned_thresh` (float): probability threshold for projects
     which are identified as being at risk of having multiple phases, which
     will override the standard prob_thresh. This value should be set higher
     than prob_thresh.
     - `version` (str): default is `status_quo` but `new` can also be used for
     validating newly-trained models.

    Returns:
     - a Pandas DataFrame containing all of certificate info, project number it
     was attempted to be matched with, and score results. Length of dataframe
     should be the length of `company_projects` x `df_web`. Mostly used for
     testing purposes.
     - `False` if there were no CSP certificates available for timeframe
     specified through `since` and `until`.

    Raises:
     - ValueError: if `since` or `until` is neither a special keyword nor
     contains a `yyyy-mm-dd` date.
    """
    logger.info("matching...")
    if not isinstance(company_projects, pd.DataFrame):  # company_projects == False
        open_query = "SELECT * FROM company_projects WHERE closed=0"
        with create_connection() as conn:
            company_projects = pd.read_sql(open_query, conn)
    company_projects = wrangle(company_projects)
    if not isinstance(df_web, pd.DataFrame):  # df_web == False
        if since == "today":
            since = datetime.datetime.now().date()
        elif since == "day_ago":
            since = (datetime.datetime.now() - datetime.timedelta(1)).date()
        elif since == "week_ago":
            since = (datetime.datetime.now() - datetime.timedelta(7)).date()
        else:
            since = _extract_iso_date(since, "since")
        if until == "now":
            until = datetime.datetime.now()
        else:
            until = _extract_iso_date(until, "until")
        hist_query = """
            SELECT *
            FROM web_certificates
            WHERE pub_date>=%s AND pub_date<=%s
            ORDER BY pub_date
        """
        with create_connection() as conn:
            df_web = pd.read_sql(hist_query, conn, params=[since, until])
        if len(df_web) == 0:
            # SQL query returned nothing so no point of going any further
            logger.info(
                f"No new CSP's have been collected since last time `match()` was called ({since}). "
                f"Breaking out of match function."
            )
            update_results({
                'match summary': 'nothing new to match',
                'noteworthy matches': {},
            })
            return False
    df_web = wrangle(df_web)
    comm_count = 0
    per_project_results = []
    for _, company_project_row in company_projects.iterrows():
        # .iterrows returns a pd.Series for every row so this turns it back
        # into a dataframe to avoid breaking any methods downstream
        results = build_match_score(
            company_project_row.to_frame().transpose(),
            df_web,
            fresh_cert_limit=(not test),
        )
        logger.info(
            f"searching for potential match for project #{company_project_row['job_number']}..."
        )
        results["job_number"] = company_project_row.job_number
        results["multi_phase_proned"] = results.apply(
            lambda row: 1
            if re.search(
                "campus|hospital|university|college",
                "".join(row[["city", "title"]].apply(str)),
            )
            else 0,
            axis=1,
        )
        results["pred_prob"] = results.apply(
            lambda row: predict_prob(row, version=version), axis=1
        )
        results["pred_match"] = results.apply(
            lambda row: predict_match(
                row.pred_prob,
                prob_thresh,
                row.multi_phase_proned,
                multi_phase_proned_thresh,
            ),
            axis=1,
        )
        results = results.sort_values("pred_prob", ascending=False)
        top_probs = ', '.join(str(round(x, 5)) for x in results.head(5).pred_prob)
        logger.info(
            f"top 5 probabilities for project #{company_project_row['job_number']}: "
            f"{top_probs}"
        )
        matches = results[results.pred_match == 1]
        if len(matches) > 0:
            logger.info(
                f"found {len(matches)} match{'' if len(matches)==1 else 'es'}! with "
                f"probability as high as {matches.iloc[0].pred_prob}"
            )
            if not test:
                logger.info("getting ready to send notification...")
                communicate(
                    matches.drop(matches.index[1:]),  # sending only top result for now
                    company_project_row,
                    test=test,
                )
                comm_count += 1
        else:
            logger.info("didn't find any matches")
        per_project_results.append(results)
    logger.info(
        f"Done looping through {len(company_projects)} open projects. Sent {comm_count} "
        f"e-mails to communicate matches as a result."
    )
    summary_cols = ['cert_id', 'job_number', 'pred_prob', 'pred_match']
    if per_project_results:
        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        results_master = pd.concat(per_project_results)
    else:
        # No projects to match: return an empty frame rather than failing on
        # an undefined variable.
        results_master = pd.DataFrame(columns=summary_cols).astype(
            {"pred_prob": float}
        )
    project_count = len(company_projects)
    certs_per_project = (
        int(len(results_master) / project_count) if project_count else 0
    )
    update_results({
        'match summary': f"matched {comm_count} out of {project_count} projects and {certs_per_project} CSP's",
        'noteworthy matches': results_master[results_master.pred_prob > 0.5][
            summary_cols
        ].to_dict(),
    })
    return results_master
def get(self):
    """Set up the lookup tables on the instance and register the "Status" result.

    Stores `game_id`, `champ_dict` and `summoner_dict` on `self`, then calls
    `update_results` with the key "Status", the sort key "order" and the bound
    method `self.get_status` (passed uncalled).
    """
    # Both tables map an id given as a string to a display name.
    champions = {
        '154': 'Zac', '133': 'Quinn', '131': 'Diana', '134': 'Syndra',
        '24': 'Jax', '25': 'Morgana', '26': 'Zilean', '27': 'Singed',
        '20': 'Nunu', '21': 'Miss Fortune', '22': 'Ashe', '23': 'Tryndamere',
        '28': 'Evelynn', '29': 'Twitch', '4': 'Twisted Fate', '8': 'Vladimir',
        '120': 'Hecarim', '121': "Kha'Zix", '122': 'Darius', '267': 'Nami',
        '126': 'Jayce', '59': 'Jarvan IV', '58': 'Renekton', '55': 'Katarina',
        '54': 'Malphite', '57': 'Maokai', '56': 'Nocturne', '51': 'Caitlyn',
        '50': 'Swain', '53': 'Blitzcrank', '412': 'Thresh', '115': 'Ziggs',
        '114': 'Fiora', '117': 'Lulu', '89': 'Leona', '111': 'Nautilus',
        '110': 'Varus', '113': 'Sejuani', '112': 'Viktor', '82': 'Mordekaiser',
        '83': 'Yorick', '80': 'Pantheon', '81': 'Ezreal', '86': 'Garen',
        '84': 'Akali', '85': 'Kennen', '3': 'Galio', '7': 'LeBlanc',
        '102': 'Shyvana', '103': 'Ahri', '101': 'Xerath', '106': 'Volibear',
        '107': 'Rengar', '104': 'Graves', '105': 'Fizz', '39': 'Irelia',
        '38': 'Kassadin', '33': 'Rammus', '32': 'Amumu', '31': "Chogath",
        '30': 'Karthus', '37': 'Sona', '36': 'Dr. Mundo', '35': 'Shaco',
        '34': 'Anivia', '60': 'Elise', '61': 'Orianna', '62': 'Wukong',
        '63': 'Brand', '64': 'Lee Sin', '67': 'Vayne', '68': 'Rumble',
        '69': 'Cassiopeia', '254': 'Vi', '2': 'Olaf', '6': 'Urgot',
        '99': 'Lux', '98': 'Shen', '91': 'Talon', '90': 'Malzahar',
        '92': 'Riven', '96': "Kog'Maw", '11': 'Master Yi', '10': 'Kayle',
        '13': 'Ryze', '12': 'Alistar', '15': 'Sivir', '14': 'Sion',
        '17': 'Teemo', '16': 'Soraka', '19': 'Warwick', '18': 'Tristana',
        '238': 'Zed', '48': 'Trundle', '119': 'Draven', '44': 'Taric',
        '45': 'Veigar', '42': 'Corki', '43': 'Karma', '40': 'Janna',
        '41': 'Gangplank', '1': 'Annie', '5': 'Xin Zhao', '9': 'Fiddlesticks',
        '143': 'Zyra', '77': 'Udyr', '76': 'Nidalee', '75': 'Nasus',
        '74': 'Heimerdinger', '72': 'Skarner', '79': 'Gragas', '78': 'Poppy',
    }
    summoner_spells = {
        '1': 'Boost', '2': 'Clairvoyance', '3': 'Exhaust', '4': 'Flash',
        '5': 'Fortify', '6': 'Haste', '7': 'Heal', '9': 'Rally',
        '10': 'Revive', '11': 'Smite', '12': 'Teleport', '14': 'Dot',
        '16': 'BattleCry', '17': 'OdinGarrison', '20': 'Promote',
        '21': 'Barrier',
    }
    self.game_id = 1
    self.champ_dict = champions
    self.summoner_dict = summoner_spells
    update_results("Status", "order", self.get_status)
def get(self):
    """Register the "IP" result, sorted by "ip", produced by `self.get_ip`."""
    producer = self.get_ip  # passed uncalled; update_results receives the method
    update_results("IP", "ip", producer)
def get(self):
    """Register the five honor results through `update_results`.

    Each entry is (key, sort key, function). The total uses the bound method
    `self.get_honor`; the four per-category entries pass the string "HONOR"
    in the function slot.
    """
    honor_entries = (
        ("Honor", "total", self.get_honor),
        ("Honor_Friendly", "friendly", "HONOR"),
        ("Honor_Helpful", "helpful", "HONOR"),
        ("Honor_Teamwork", "teamwork", "HONOR"),
        ("Honor_Honorable", "honorable", "HONOR"),
    )
    for key, sort_key, function in honor_entries:
        update_results(key, sort_key, function)
def get(self):
    """Register the "Leagues" result, sorted by "score", produced by `self.get_league`."""
    producer = self.get_league  # passed uncalled; update_results receives the method
    update_results("Leagues", "score", producer)