import copy
import json
import os

import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from tqdm import tqdm

# Project-local helpers (record_class, get_serialID_to_entityID,
# get_trained_classifier, execute_with_input, execute_without_input, utils)
# are assumed to be defined or imported elsewhere in this module.


def get_data():
    global anomalies_pos_fpath
    global anomalies_neg_fpath
    global domain_dims
    global explantions_file_path
    global embedding_data_path
    global serialID_mapping_loc
    # ============================================
    anom_pos_df = pd.read_csv(anomalies_pos_fpath, index_col=None)
    anom_neg_df = pd.read_csv(anomalies_neg_fpath, index_col=None)
    serialID_to_entityID = get_serialID_to_entityID()
    print('Setting up record class embedding...', embedding_data_path)
    record_class.__setup_embedding__(embedding_data_path, serialID_to_entityID, _normalize=True)
    # main_data_df holds the records with entity ids
    main_data_df = pd.concat([anom_pos_df, anom_neg_df], axis=0)

    obj_list = []
    for i in tqdm(range(anom_neg_df.shape[0])):
        obj = record_class(anom_neg_df.iloc[i].to_dict(), -1)
        obj.calc_features()
        obj_list.append(obj)
    for i in tqdm(range(anom_pos_df.shape[0])):
        obj = record_class(anom_pos_df.iloc[i].to_dict(), 1)
        obj.calc_features()
        obj_list.append(obj)

    # Read in the explanations
    with open(explantions_file_path, 'rt') as fh:
        explanations = json.load(fh)
    explanations = {int(k): [sorted(_) for _ in v] for k, v in explanations.items()}

    data_x = []
    data_x_features = []
    data_id = []
    data_label = []
    data_ID_to_matrix = {}
    for _obj in obj_list:
        data_x.append(_obj.x)
        data_id.append(_obj.id)
        data_label.append(_obj.label)
        data_ID_to_matrix[_obj.id] = _obj.features
        data_x_features.append(_obj.features)
    data_x = np.stack(data_x)
    data_label = np.array(data_label)
    data_id = np.array(data_id)
    return main_data_df, explanations, data_id, data_x, data_label, data_x_features, data_ID_to_matrix
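
# Illustrative call pattern for get_data() -- a sketch only; it assumes the
# module-level path globals above have already been populated by the caller:
#
#   main_df, expl, ids, X, y, X_feat, id2mat = get_data()
#   assert X.shape[0] == y.shape[0] == ids.shape[0]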
def obtain_normal_samples():
    global test_data_serialized_loc
    normal_data = pd.read_csv(test_data_serialized_loc, index_col=None)
    _df = normal_data.sample(5000)
    obj_list = []
    for i in tqdm(range(_df.shape[0])):
        obj = record_class(_df.iloc[i].to_dict(), -1)
        obj_list.append(obj)
    data_x = []
    for _obj in obj_list:
        data_x.append(_obj.x)
    data_x = np.stack(data_x)
    return data_x
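
# Note: normal_data.sample(5000) draws a fresh random subset of nominal
# records on every call. For reproducible experiments one could pass a seed,
# e.g. normal_data.sample(5000, random_state=0); the seed value here is
# purely illustrative.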
def main_executor():
    global explantions_file_path
    global embedding_data_path
    global serialID_mapping_loc
    global anomalies_pos_fpath
    global anomalies_neg_fpath
    global domain_dims
    global test_data_serialized_loc
    global feedback_batch_size
    global top_K_count
    # ============================================
    anom_pos_df = pd.read_csv(anomalies_pos_fpath, index_col=None)
    anom_neg_df = pd.read_csv(anomalies_neg_fpath, index_col=None)
    # ============================================
    # Set up objects
    serialID_to_entityID = get_serialID_to_entityID()
    record_class.__setup_embedding__(embedding_data_path, serialID_to_entityID, _normalize=True)
    emb_dim = record_class.embedding['HSCode'].shape[1]
    # -------------------------------------------
    obj_list = []
    for i in tqdm(range(anom_neg_df.shape[0])):
        obj = record_class(anom_neg_df.iloc[i].to_dict(), -1)
        obj_list.append(obj)
    for i in tqdm(range(anom_pos_df.shape[0])):
        obj = record_class(anom_pos_df.iloc[i].to_dict(), 1)
        obj_list.append(obj)

    print(explantions_file_path)
    print(os.getcwd())
    # Read in the explanations
    with open(explantions_file_path, 'rt') as fh:
        explanations = json.load(fh)
    explanations = {int(k): [sorted(_) for _ in v] for k, v in explanations.items()}

    num_domains = len(domain_dims)
    domain_idx = {e[0]: e[1] for e in enumerate(domain_dims.keys())}
    domain_list = list(domain_dims.keys())

    # Index each unordered pair of domains (domain interaction)
    domainInteraction_index = {}
    k = 0
    for i in range(num_domains):
        for j in range(i + 1, num_domains):
            domainInteraction_index['_'.join((domain_idx[i], domain_idx[j]))] = k
            k += 1

    data_x = []
    data_id = []
    data_label = []
    data_ID_to_matrix = {}
    for _obj in obj_list:
        data_x.append(_obj.x)
        data_id.append(_obj.id)
        data_label.append(_obj.label)
        data_ID_to_matrix[_obj.id] = _obj.x
    data_x = np.stack(data_x)
    data_label = np.array(data_label)
    data_id = np.array(data_id)

    idx = np.arange(len(data_id), dtype=int)
    np.random.shuffle(idx)
    data_x = data_x[idx]
    data_label = data_label[idx]
    data_id = data_id[idx]

    X_0 = data_x                   # Relevant anomalies
    X_1 = obtain_normal_samples()  # Nominal samples
    y_0 = np.ones(X_0.shape[0])
    y_1 = -1 * np.ones(X_1.shape[0])
    y = np.hstack([y_0, y_1])
    X = np.vstack([X_0, X_1])
    num_coeff = len(domainInteraction_index)
    classifier_obj = get_trained_classifier(X, y, num_domains, emb_dim)
    W = classifier_obj.W.cpu().data.numpy()
    emb_dim = W.shape[-1]

    # Create a reference dataframe :: data_reference_df
    working_df = pd.DataFrame(
        data=np.vstack([data_id, data_label]).transpose(),
        columns=['PanjivaRecordID', 'label']
    )
    working_df['baseID'] = working_df['PanjivaRecordID'].apply(lambda x: str(x)[:-3])
    working_df['expl_1'] = -1
    working_df['expl_2'] = -1
    working_df['original_score'] = 1

    for i, row in working_df.iterrows():
        _id = int(row['PanjivaRecordID'])
        if _id in explanations.keys():
            entry = explanations[_id]
            domain_1 = entry[0][0]
            domain_2 = entry[0][1]
            working_df.loc[i, 'expl_1'] = domainInteraction_index['_'.join(sorted([domain_1, domain_2]))]
            domain_1 = entry[1][0]
            domain_2 = entry[1][1]
            working_df.loc[i, 'expl_2'] = domainInteraction_index['_'.join(sorted([domain_1, domain_2]))]
            _x = data_ID_to_matrix[_id]
            working_df.loc[i, 'original_score'] = classifier_obj.predict_score_op(np.array([_x]))[0]
    working_df['cur_score'] = working_df['original_score'].values
    data_reference_df = working_df.copy()

    results_with_input = pd.DataFrame(columns=['idx', 'acc'])
    results_no_input = pd.DataFrame(columns=['idx', 'acc'])
    cur_df = data_reference_df.copy()
    # Randomize the record order
    cur_df = cur_df.sample(frac=1).reset_index(drop=True)

    acc = execute_with_input(
        clf_obj=copy.deepcopy(classifier_obj),
        working_df=cur_df,
        domainInteraction_index=domainInteraction_index,
        num_coeff=num_coeff,
        emb_dim=emb_dim,
        data_ID_to_matrix=data_ID_to_matrix,
        check_next=top_K_count,
        batch_size=feedback_batch_size
    )
    _tmpdf = pd.DataFrame(list(enumerate(acc)), columns=['idx', 'acc'])
    results_with_input = pd.concat([results_with_input, _tmpdf], ignore_index=True)

    acc = execute_without_input(
        working_df=cur_df,
        check_next=top_K_count,
        batch_size=feedback_batch_size
    )
    _tmpdf = pd.DataFrame(list(enumerate(acc)), columns=['idx', 'acc'])
    results_no_input = pd.concat([results_no_input, _tmpdf], ignore_index=True)
    return results_with_input, results_no_input
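
# Sketch of how the two result frames could be compared (column names as
# defined above; the merge/plot choice is illustrative, not part of the
# original pipeline):
#
#   merged = results_with_input.merge(results_no_input, on='idx',
#                                     suffixes=('_feedback', '_random'))
#   merged.plot(x='idx', y=['acc_feedback', 'acc_random'])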
# NOTE: this second definition replaces the main_executor above when the
# module is loaded; it reports precision@next-K and recall per feedback
# round instead of a single accuracy series.
def main_executor():
    global explantions_file_path
    global embedding_data_path
    global serialID_mapping_loc
    global anomalies_pos_fpath
    global anomalies_neg_fpath
    global domain_dims
    global test_data_serialized_loc
    global feedback_batch_size
    global DIR
    # ============================================
    anom_pos_df = pd.read_csv(anomalies_pos_fpath, index_col=None)
    anom_neg_df = pd.read_csv(anomalies_neg_fpath, index_col=None)
    # ============================================
    # Set up objects
    serialID_to_entityID = get_serialID_to_entityID()
    record_class.__setup_embedding__(embedding_data_path, serialID_to_entityID, _normalize=True)
    emb_dim = record_class.embedding['HSCode'].shape[1]
    # main_data_df holds the records with entity ids
    main_data_df = pd.concat([anom_pos_df, anom_neg_df], axis=0)
    main_data_df = utils.convert_to_UnSerializedID_format(main_data_df, DIR)
    # -------------------------------------------
    obj_list = []
    for i in tqdm(range(anom_neg_df.shape[0])):
        obj = record_class(anom_neg_df.iloc[i].to_dict(), -1)
        obj_list.append(obj)
    for i in tqdm(range(anom_pos_df.shape[0])):
        obj = record_class(anom_pos_df.iloc[i].to_dict(), 1)
        obj_list.append(obj)

    # Read in the explanations
    with open(explantions_file_path, 'rt') as fh:
        explanations = json.load(fh)
    explanations = {int(k): [sorted(_) for _ in v] for k, v in explanations.items()}

    num_domains = len(domain_dims)
    domain_idx = {e[0]: e[1] for e in enumerate(domain_dims.keys())}

    # Index each unordered pair of domains (domain interaction)
    domainInteraction_index = {}
    k = 0
    for i in range(num_domains):
        for j in range(i + 1, num_domains):
            domainInteraction_index['_'.join((domain_idx[i], domain_idx[j]))] = k
            k += 1

    data_x = []
    data_id = []
    data_label = []
    data_ID_to_matrix = {}
    for _obj in obj_list:
        data_x.append(_obj.x)
        data_id.append(_obj.id)
        data_label.append(_obj.label)
        data_ID_to_matrix[_obj.id] = _obj.x
    data_x = np.stack(data_x)
    data_label = np.array(data_label)
    data_id = np.array(data_id)

    idx = np.arange(len(data_id), dtype=int)
    np.random.shuffle(idx)
    data_x = data_x[idx]
    data_label = data_label[idx]
    data_id = data_id[idx]

    X_0 = data_x                   # Relevant anomalies
    X_1 = obtain_normal_samples()  # Nominal samples
    y_0 = np.ones(X_0.shape[0])
    y_1 = -1 * np.ones(X_1.shape[0])
    y = np.hstack([y_0, y_1])
    X = np.vstack([X_0, X_1])
    num_coeff = len(domainInteraction_index)
    classifier_obj = get_trained_classifier(X, y, num_domains, emb_dim)
    W = classifier_obj.W.cpu().data.numpy()
    emb_dim = W.shape[-1]

    # Create a reference dataframe :: data_reference_df
    data_reference_df = pd.DataFrame(
        data=np.vstack([data_id, data_label]).transpose(),
        columns=['PanjivaRecordID', 'label']
    )
    data_reference_df['baseID'] = data_reference_df['PanjivaRecordID'].apply(lambda x: str(x)[:-3])
    data_reference_df['expl_1'] = -1
    data_reference_df['expl_2'] = -1
    data_reference_df['original_score'] = 1

    for i, row in data_reference_df.iterrows():
        _id = int(row['PanjivaRecordID'])
        if _id in explanations.keys():
            entry = explanations[_id]
            domain_1 = entry[0][0]
            domain_2 = entry[0][1]
            data_reference_df.loc[i, 'expl_1'] = domainInteraction_index['_'.join(sorted([domain_1, domain_2]))]
            domain_1 = entry[1][0]
            domain_2 = entry[1][1]
            data_reference_df.loc[i, 'expl_2'] = domainInteraction_index['_'.join(sorted([domain_1, domain_2]))]
            _x = data_ID_to_matrix[_id]
            data_reference_df.loc[i, 'original_score'] = classifier_obj.predict_score_op(np.array([_x]))[0]
    data_reference_df['cur_score'] = data_reference_df['original_score'].values

    # Randomize the record order
    cur_df = data_reference_df.copy()
    cur_df = cur_df.sample(frac=1).reset_index(drop=True)
    cur_df = shuffle(cur_df).reset_index(drop=True)

    check_next_values = [10, 20, 30, 40, 50]
    next_K_precision_wI, recall_wI = execute_with_input(
        clf_obj=copy.deepcopy(classifier_obj),
        working_df=cur_df,
        ref_data_df=main_data_df,
        domainInteraction_index=domainInteraction_index,
        num_coeff=num_coeff,
        emb_dim=emb_dim,
        data_ID_to_matrix=data_ID_to_matrix,
        check_next_values=check_next_values,
        batch_size=feedback_batch_size
    )
    next_K_precision_nI, recall_nI = execute_without_input(
        working_df=cur_df,
        batch_size=feedback_batch_size
    )

    def aux_create_result_df(next_K_precision, recall, check_next_values):
        next_K_precision = np.array(next_K_precision)
        recall = np.array(recall)
        idx = np.arange(max(recall.shape[0], next_K_precision.shape[0]))
        # Zero-pad the precision matrix so that it aligns with the recall series
        if next_K_precision.shape[0] < idx.shape[0]:
            pad = np.zeros([idx.shape[0] - next_K_precision.shape[0], next_K_precision.shape[1]])
            next_K_precision = np.concatenate([next_K_precision, pad], axis=0)
        columns = ['idx', 'recall'] + ['Prec@next_' + str(_) for _ in check_next_values]
        _data = np.concatenate([
            idx.reshape([-1, 1]),
            recall.reshape([-1, 1]),
            next_K_precision,
        ], axis=-1)
        result_df = pd.DataFrame(_data, columns=columns)
        return result_df

    results_with_input = aux_create_result_df(next_K_precision_wI, recall_wI, check_next_values)
    results_no_input = aux_create_result_df(next_K_precision_nI, recall_nI, check_next_values)
    return results_with_input, results_no_input
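
# A minimal driver sketch, assuming the module is executed directly and that
# the path/config globals referenced above are set elsewhere in this file.
# The output file names are hypothetical.
if __name__ == '__main__':
    results_with_input, results_no_input = main_executor()
    results_with_input.to_csv('results_with_input.csv', index=False)
    results_no_input.to_csv('results_no_input.csv', index=False)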