import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import load_model
from sklearn.metrics import confusion_matrix, roc_curve, auc


def get_predictions(path, rescaled, original, LSTM_ind=False, threshold_fixed=0.5):
    """Load a trained autoencoder and flag each sample as anomalous (1)
    when its reconstruction error exceeds the fixed threshold."""
    auto_encoder = load_model(path)
    # threshold_fixed = chose_weights_test_results(negative_weight, positive_weight,
    #                                              path, rescaled, original, LSTM_ind)
    valid_x_predictions = auto_encoder.predict(rescaled)
    if LSTM_ind:
        # LSTM inputs are 3-D (samples, timesteps, features); collapse them
        # to 2-D first (see the `flatten` sketch below).
        mse = np.mean(np.power(flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })
    pred_y = [
        1 if e > threshold_fixed else 0
        for e in error_df.Reconstruction_error.values
    ]
    return pred_y
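# `flatten` is used by the LSTM branches in this module but is not defined
# here. A minimal sketch, assuming the common LSTM-autoencoder convention of
# keeping only the last timestep of each sequence; the original helper may
# differ:
def flatten(X):
    """Collapse a 3-D array (samples, timesteps, features) to 2-D
    (samples, features) by keeping each sequence's final timestep."""
    flattened_X = np.empty((X.shape[0], X.shape[2]))
    for i in range(X.shape[0]):
        flattened_X[i] = X[i, X.shape[1] - 1, :]
    return flattened_X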
def chose_weights_test_results(negative_weight, positive_weight, path, rescaled,
                               original, LSTM_ind=False):
    """Sweep candidate thresholds on the validation set and return the one
    that minimises the weighted cost of false positives and false negatives."""
    auto_encoder = load_model(path)
    # Predictions on validation set
    valid_x_predictions = auto_encoder.predict(rescaled)
    if LSTM_ind:
        mse = np.mean(np.power(flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })
    prob = []
    cost_list = []
    fp_values = []
    fn_values = []
    # Choose the threshold based on the validation set: try 0.00, 0.01, ..., 0.39.
    for i in [x / 100.0 for x in range(0, 40, 1)]:
        pred_y = [
            1 if e > i else 0 for e in error_df.Reconstruction_error.values
        ]
        true_y = list(map(int, error_df.True_class.values))
        # c1 marks false positives, c2 marks false negatives.
        c1 = [p == 1 and t == 0 for p, t in zip(pred_y, true_y)]
        c2 = [p == 0 and t == 1 for p, t in zip(pred_y, true_y)]
        fp_values.append(sum(c1))
        fn_values.append(sum(c2))
        # Calculate cost based on weights.
        cost = np.sum(
            np.array(c1) * negative_weight + np.array(c2) * positive_weight)
        prob.append(i)
        cost_list.append(cost)
    return prob[cost_list.index(min(cost_list))]
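# A toy, self-contained sketch of the same threshold sweep, to make the cost
# rule explicit: cost = (#false positives) * negative_weight
#                     + (#false negatives) * positive_weight.
# The reconstruction errors, labels, and weights below are all made up.
def _threshold_sweep_demo():
    errors = np.array([0.02, 0.05, 0.08, 0.20, 0.31])
    labels = np.array([0, 0, 0, 1, 1])
    negative_weight, positive_weight = 1, 5
    best_t, best_cost = None, np.inf
    for t in [x / 100.0 for x in range(0, 40)]:
        preds = (errors > t).astype(int)
        fp = int(np.sum((preds == 1) & (labels == 0)))
        fn = int(np.sum((preds == 0) & (labels == 1)))
        cost = fp * negative_weight + fn * positive_weight
        if cost < best_cost:
            best_t, best_cost = t, cost
    return best_t  # 0.08 here: every normal sample falls below, every break above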
def test_metrics_print(path, rescaled, original, LSTM_ind=False, threshold_fixed=0.5):
    """Print sensitivity, specificity, precision and accuracy at the given
    threshold, and save a confusion-matrix heatmap."""
    auto_encoder = load_model(path)
    # threshold_fixed = chose_weights_test_results(negative_weight, positive_weight,
    #                                              path, rescaled, original, LSTM_ind)
    valid_x_predictions = auto_encoder.predict(rescaled)
    if LSTM_ind:
        mse = np.mean(np.power(flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })
    pred_y = [
        1 if e > threshold_fixed else 0
        for e in error_df.Reconstruction_error.values
    ]
    predictions = pd.DataFrame({
        'true': error_df.True_class,
        'predicted': pred_y
    })
    conf_matrix = confusion_matrix(error_df.True_class, pred_y)
    fig = plt.figure(figsize=(8, 8))
    LABELS = ["Normal", "Break"]
    sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS,
                annot=True, fmt="d")
    plt.title("Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    # Forward slashes avoid the invalid '\F', '\D' and '\c' escape sequences
    # in the original Windows-style path.
    fig.savefig('../results/FC_results/DL_results/conf.png')
    plt.show()
    tn, fp, fn, tp = conf_matrix.ravel()
    print('sensitivity', tp / (tp + fn) * 100)
    print('specificity', tn / (tn + fp) * 100)
    print('precision', tp / (tp + fp) * 100)
    print('accuracy', (tp + tn) / (tp + tn + fp + fn) * 100)
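# A quick sanity check of the metric formulas above with a made-up confusion
# matrix (tn=90, fp=10, fn=5, tp=95):
#   sensitivity = 95 / (95 + 5)  * 100 = 95.0
#   specificity = 90 / (90 + 10) * 100 = 90.0
#   precision   = 95 / (95 + 10) * 100 ~ 90.5
#   accuracy    = (95 + 90) / 200 * 100 = 92.5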
def roc_curve_plot(rescaled, original, path, LSTM_ind=False):
    """Plot and save the ROC curve of the reconstruction error against the
    true class labels."""
    auto_encoder = load_model(path)
    valid_x_predictions = auto_encoder.predict(rescaled)
    if LSTM_ind:
        mse = np.mean(np.power(flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })
    false_pos_rate, true_pos_rate, thresholds = roc_curve(
        error_df.True_class, error_df.Reconstruction_error)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    fig = plt.figure(figsize=(8, 8))
    plt.plot(false_pos_rate, true_pos_rate, linewidth=5,
             label='AUC = %0.3f' % roc_auc)
    plt.plot([0, 1], [0, 1], linewidth=5)
    plt.xlim([-0.01, 1])
    plt.ylim([0, 1.01])
    plt.legend(loc='lower right')
    plt.title('Receiver operating characteristic curve (ROC) on Validation set')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    # Forward slashes avoid the invalid escape sequences in the original path.
    fig.savefig('../results/FC_results/DL_results/roc.png')
    plt.show()
def calculate_auc(rescaled, original, path, LSTM_ind=False):
    model = load_model(path)
    valid_x_predictions = model.predict(rescaled)
    if LSTM_ind:
        mse = np.mean(np.power(flatten(rescaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': list(original)
        })
    else:
        mse = np.mean(np.power(rescaled - valid_x_predictions, 2), axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': original['y']
        })
    false_pos_rate, true_pos_rate, thresholds = roc_curve(
        error_df.True_class, error_df.Reconstruction_error)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    return roc_auc
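# A minimal sketch of how the helpers above chain together. The function
# name, the model path, and the data-split arguments are hypothetical; for
# the default non-LSTM branch, the `original` arguments must be DataFrames
# with a binary 'y' column.
def evaluate_autoencoder(model_path, x_valid, y_valid, x_test, y_test,
                         negative_weight=1, positive_weight=5, LSTM_ind=False):
    # Pick the cost-optimal threshold on the validation split...
    threshold = chose_weights_test_results(negative_weight, positive_weight,
                                           model_path, x_valid, y_valid, LSTM_ind)
    # ...then report test-set metrics, the ROC curve, and the AUC at it.
    test_metrics_print(model_path, x_test, y_test, LSTM_ind, threshold)
    roc_curve_plot(x_test, y_test, model_path, LSTM_ind)
    return calculate_auc(x_test, y_test, model_path, LSTM_ind)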
# Imports for the episode scraper below.
import itertools
import re
from collections import defaultdict
from typing import Any, Dict, List

import requests
from bs4 import BeautifulSoup


def get_question_info(url: str) -> List[Dict[str, Any]]:
    """Take in an episode and return a list of each question and its
    relevant information."""
    # `flatten`, `value_sign_dict` and `season_dict` are assumed to be
    # defined elsewhere in the project; see the sketches after this function.
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    ep_id = int(re.findall('#(.*),', soup.title.text)[0])
    rounds = soup.find_all(id=re.compile(
        'jeopardy_round|double_jeopardy_round|final_jeopardy_round'))
    if len(rounds) == 0:
        return []
    category_list = {
        _round['id']: [
            category.text
            for category in _round.find_all('td', class_='category_name')
        ]
        for _round in rounds
    }
    # Reveal order of the clues within each round (0-based); clues without
    # an order number keep their positional index.
    round_clue_orders = [[
        int(clue.find('td', class_='clue_order_number').text) - 1
        if clue.find('td', class_='clue_order_number') else i
        for i, clue in enumerate(clue_sect.find_all('div'))
    ] for clue_sect in rounds]
    clue_order = [[
        index[0] for index in sorted(enumerate(_round), key=lambda x: x[1])
    ] for _round in round_clue_orders]
    order_iter = itertools.chain.from_iterable(clue_order)
    clue_chunks = [
        list(zip(_round.find_all('div'), [
            clue.text for clue in _round.find_all('td', class_='clue_text')
        ])) for _round in rounds
    ]
    # Flatten across rounds while re-ordering the clues into reveal order.
    clue_chunks = [
        clue_set[next(order_iter)] for clue_set in clue_chunks
        for _ in clue_set
    ]
    # Had to re-soupify the div tags to get unicode out of quotes.
    # If I learn a better way to do this I'll change it.
    div_tag_soup = [
        BeautifulSoup(clue[0]['onmouseover'], 'html.parser')
        for clue in clue_chunks
    ]
    clue_rounds = [clue[0].find_parent('div')['id'] for clue in clue_chunks]
    is_fj = [_round == 'final_jeopardy_round' for _round in clue_rounds]
    contestants = itertools.chain.from_iterable([
        div.find_all('td', class_=re.compile('wrong|right'))
        for div in div_tag_soup
    ])
    contestants = list({
        answerer.text
        for answerer in contestants
        if 'Triple Stumper' not in answerer.text
        and 'Quadruple Stumper' not in answerer.text
    })
    # A dictionary per clue describing who answered it and whether they were
    # right or wrong.
    answerer_dicts = [
        defaultdict(lambda: 'neither',
                    [[contestant.text, contestant['class'][0]]
                     for contestant in div.find_all(
                         'td', class_=re.compile('wrong|right'))
                     if 'Triple Stumper' not in contestant.text])
        for div in div_tag_soup
    ]
    value_tags = [
        clue[0].find('td', class_=lambda text: 'clue_value' in text).text
        for i, clue in enumerate(clue_chunks) if not is_fj[i]
    ]
    # clue_values = [int(value.split('$')[1].replace(',', '')) for value in value_tags]
    clue_values = [
        int(re.compile(r'[^\d,](?=\d)').split(value)[-1].replace(',', ''))
        for value in value_tags
    ]
    fj_index = [i for i, _ in enumerate(clue_chunks) if is_fj[i]]
    fj_values = []
    fj_contestants = []
    if len(fj_index) > 0:
        fj_values = [
            [int(value.replace(',', '').replace('$', '').split('.')[0])] +
            [0] * (len(fj_index) - 1)
            for value in div_tag_soup[fj_index[0]].find_all(
                string=re.compile(r'^[\$\d][\d,]+$'))
        ]
        fj_contestants = [
            contestant.text
            for contestant in div_tag_soup[fj_index[0]].find_all(
                'td', class_=re.compile('wrong|right'))
        ]
    fj_dict = dict(zip(fj_contestants, fj_values))
    contestant_value_dict = {
        contestant: (clue_values + fj_dict[contestant]
                     if fj_dict.get(contestant) is not None else
                     (clue_values + [0] if len(fj_dict) > 0 else clue_values))
        for contestant in contestants
    }
    contestant_score_dict = {
        contestant: [
            value * value_sign_dict[answerer_dicts[i][contestant]]
            for i, value in enumerate(contestant_value_dict[contestant])
        ]
        for contestant in contestants
    }
    contestant_scores = [
        flatten([[contestant, contestant_score_dict[contestant][i]]
                 for contestant in contestant_score_dict])
        for i, _ in enumerate(contestant_score_dict[contestants[0]])
    ]
    if len(contestant_scores[0]) < 8:
        # Pad to four contestants: one (name, score) pair each = 8 columns.
        contestant_scores = [
            clue + [''] * (8 - len(clue)) for clue in contestant_scores
        ]
    score_keys = []
    for i in range(4):
        score_keys.append('contestant_{}'.format(i + 1))
        score_keys.append('c{}_score_update'.format(i + 1))
    # Difficulty (board row) and board column, parsed from the clue cell's
    # underscore-separated id.
    difficulty = [
        int(clue[0].find('td', class_='clue_unstuck')['id'].split('_')[-2])
        if clue[0].find('td', class_='clue_unstuck') is not None else 0
        for clue in clue_chunks
    ]
    clue_columns = [
        int(clue[0].find('td', class_='clue_unstuck')['id'].split('_')[-3]) - 1
        if clue[0].find('td', class_='clue_unstuck') is not None else 0
        for clue in clue_chunks
    ]
    clue_cats = [
        category_list[_round][clue_columns[i]]
        for i, _round in enumerate(clue_rounds)
    ]
    season = season_dict[url]
    questions = [clue[1] for clue in clue_chunks]
    answers = [
        clue.find('em', class_=lambda text: 'correct_response' in text).text
        for clue in div_tag_soup
    ]
    # Final Jeopardy clues can never be Daily Doubles; use the boolean False
    # rather than the string 'False' so the column has a single type.
    daily_double = [
        'DD' in value if not is_fj[i] else False
        for i, value in enumerate(value_tags)
    ] + [False] * len(fj_index)
    clues = [
        dict(
            **{
                'season': season,
                'ep_id': ep_id,
                'question_id': i + 1,
                'round': clue_rounds[i],
                'category': clue_cats[i],
                'difficulty': difficulty[i],
                'question': questions[i],
                'answer': answers[i],
                'DD': daily_double[i]
            },
            **dict(zip(score_keys, contestant_scores[i])))
        for i, _ in enumerate(clue_chunks)
    ]
    return clues
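# `flatten`, `value_sign_dict` and `season_dict` are not defined in this
# file. Minimal sketches consistent with how they are used above; the
# originals may differ. Note that this list-level `flatten` is a different
# helper from the array-flattening one used by the autoencoder code, so in
# the original project the two presumably live in separate modules.
def flatten(nested):
    """Flatten one level of nesting: [[a, b], [c, d]] -> [a, b, c, d]."""
    return [item for sub in nested for item in sub]


# Maps each answer class ('right'/'wrong', or the defaultdict's 'neither')
# to the sign applied to the clue's dollar value.
value_sign_dict = {'right': 1, 'wrong': -1, 'neither': 0}

# `season_dict` would map each episode URL to its season number, e.g.
# season_dict = {'https://j-archive.com/showgame.php?game_id=7000': 36}
# (the URL and season here are only hypothetical examples of the format).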