Example no. 1
def get_data():
    global anomalies_pos_fpath
    global anomalies_neg_fpath
    global domain_dims
    global explantions_file_path
    global embedding_data_path
    global serialID_mapping_loc

    # ============================================

    anom_pos_df = pd.read_csv(anomalies_pos_fpath, index_col=None)
    anom_neg_df = pd.read_csv(anomalies_neg_fpath, index_col=None)
    serialID_to_entityID = get_serialID_to_entityID()
    print('Setting up record class embedding...', embedding_data_path)
    record_class.__setup_embedding__(embedding_data_path,
                                     serialID_to_entityID,
                                     _normalize=True)
    main_data_df = pd.concat([anom_pos_df, anom_neg_df], axis=0)
    # main_data_df has the records with entity ids

    obj_list = []
    for i in tqdm(range(anom_neg_df.shape[0])):
        obj = record_class(anom_neg_df.iloc[i].to_dict(), -1)
        obj.calc_features()
        obj_list.append(obj)

    for i in tqdm(range(anom_pos_df.shape[0])):
        obj = record_class(anom_pos_df.iloc[i].to_dict(), 1)
        obj.calc_features()
        obj_list.append(obj)

    # Read in the explanations
    with open(explantions_file_path, 'rb') as fh:
        explanations = json.load(fh)

    explanations = {
        int(k): [sorted(_) for _ in v]
        for k, v in explanations.items()
    }
    data_x = []
    data_x_features = []
    data_id = []
    data_label = []
    data_ID_to_matrix = {}

    for _obj in obj_list:
        data_x.append(_obj.x)
        data_id.append(_obj.id)
        data_label.append(_obj.label)
        data_ID_to_matrix[_obj.id] = _obj.features
        data_x_features.append(_obj.features)
    data_x = np.stack(data_x)
    data_label = np.array(data_label)
    data_id = np.array(data_id)
    return main_data_df, explanations, data_id, data_x, data_label, data_x_features, data_ID_to_matrix
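
All of these examples read their inputs from module-level globals and assume a common import block. A minimal preamble sketch follows; every concrete path and value below is a hypothetical placeholder, and only the variable names (including the original 'explantions' spelling) are taken from the functions themselves:

import copy
import json
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

# Hypothetical configuration; the real values come from the project's config.
# Only the variable names are taken from the code above.
DIR = 'us_import1'
anomalies_pos_fpath = os.path.join(DIR, 'anomalies_pos.csv')
anomalies_neg_fpath = os.path.join(DIR, 'anomalies_neg.csv')
explantions_file_path = os.path.join(DIR, 'explanations.json')
embedding_data_path = os.path.join(DIR, 'embeddings.npy')
serialID_mapping_loc = os.path.join(DIR, 'serialID_mapping.csv')
test_data_serialized_loc = os.path.join(DIR, 'test_serialized.csv')
domain_dims = {'HSCode': 100, 'Carrier': 50}  # hypothetical domain -> cardinality map
feedback_batch_size = 10
top_K_count = 5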
Example no. 2
def obtain_normal_samples():
    global test_data_serialized_loc
    normal_data = pd.read_csv(test_data_serialized_loc, index_col=None)

    _df = normal_data.sample(5000)
    obj_list = []
    for i in tqdm(range(_df.shape[0])):
        obj = record_class(_df.iloc[i].to_dict(), -1)
        obj_list.append(obj)
    data_x = []
    for _obj in obj_list:
        data_x.append(_obj.x)
    data_x = np.stack(data_x)
    return data_x
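
One caveat worth noting: DataFrame.sample(5000) raises a ValueError when the file holds fewer than 5000 rows, since sampling without replacement cannot exceed the population size. A defensive variant of the sampling line (the cap and the fixed seed are assumptions, not from the source):

n = min(5000, normal_data.shape[0])           # never request more rows than exist
_df = normal_data.sample(n, random_state=42)  # fixed seed for reproducibility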
Example no. 3
def main_executor():
    global explantions_file_path
    global embedding_data_path
    global serialID_mapping_loc
    global anomalies_pos_fpath
    global anomalies_neg_fpath
    global domain_dims
    global test_data_serialized_loc
    global feedback_batch_size
    global top_K_count
    # ============================================
    anom_pos_df = pd.read_csv(anomalies_pos_fpath, index_col=None)
    anom_neg_df = pd.read_csv(anomalies_neg_fpath, index_col=None)
    # ============================================
    # setup objects

    serialID_to_entityID = get_serialID_to_entityID()
    record_class.__setup_embedding__(embedding_data_path,
                                     serialID_to_entityID,
                                     _normalize=True)
    emb_dim = record_class.embedding['HSCode'].shape[1]

    # -------------------------------------------
    obj_list = []
    for i in tqdm(range(anom_neg_df.shape[0])):
        obj = record_class(anom_neg_df.iloc[i].to_dict(), -1)
        obj_list.append(obj)

    for i in tqdm(range(anom_pos_df.shape[0])):
        obj = record_class(anom_pos_df.iloc[i].to_dict(), 1)
        obj_list.append(obj)

    print(explantions_file_path)
    print(os.getcwd())
    # Read in the explanations
    with open(explantions_file_path, 'rb') as fh:
        explanations = json.load(fh)
    explanations = {
        int(k): [sorted(_) for _ in v]
        for k, v in explanations.items()
    }

    num_domains = len(domain_dims)
    domain_idx = {e[0]: e[1] for e in enumerate(domain_dims.keys())}
    domain_list = list(domain_dims.keys())
    domainInteraction_index = {}
    k = 0
    for i in range(num_domains):
        for j in range(i + 1, num_domains):
            domainInteraction_index['_'.join(
                (domain_idx[i], domain_idx[j]))] = k
            k += 1

    data_x = []
    data_id = []
    data_label = []
    data_ID_to_matrix = {}
    for _obj in obj_list:
        data_x.append(_obj.x)
        data_id.append(_obj.id)
        data_label.append(_obj.label)
        data_ID_to_matrix[_obj.id] = _obj.x
    data_x = np.stack(data_x)
    data_label = np.array(data_label)
    data_id = np.array(data_id)

    idx = np.arange(len(data_id), dtype=int)
    np.random.shuffle(idx)

    data_x = data_x[idx]
    data_label = data_label[idx]
    data_id = data_id[idx]

    X_0 = data_x  # Relevant anomalies
    X_1 = obtain_normal_samples()  # Nominal
    y_0 = np.ones(X_0.shape[0])
    y_1 = -1 * np.ones(X_1.shape[0])
    y = np.hstack([y_0, y_1])
    X = np.vstack([X_0, X_1])
    num_coeff = len(domainInteraction_index)
    classifier_obj = get_trained_classifier(X, y, num_domains, emb_dim)
    W = classifier_obj.W.cpu().data.numpy()
    emb_dim = W.shape[-1]

    # classifier_obj.predict_score_op(X_0)
    # Create a reference dataframe :: data_reference_df
    working_df = pd.DataFrame(data=np.vstack([data_id,
                                              data_label]).transpose(),
                              columns=['PanjivaRecordID', 'label'])
    working_df['baseID'] = working_df['PanjivaRecordID'].apply(
        lambda x: str(x)[:-3])
    working_df['expl_1'] = -1
    working_df['expl_2'] = -1
    working_df['original_score'] = 1

    for i, row in working_df.iterrows():
        _id = int(row['PanjivaRecordID'])
        if _id in explanations.keys():
            entry = explanations[_id]
            domain_1 = entry[0][0]
            domain_2 = entry[0][1]
            working_df.loc[i, 'expl_1'] = domainInteraction_index['_'.join(
                sorted([domain_1, domain_2]))]
            domain_1 = entry[1][0]
            domain_2 = entry[1][1]
            working_df.loc[i, 'expl_2'] = domainInteraction_index['_'.join(
                sorted([domain_1, domain_2]))]
        _x = data_ID_to_matrix[_id]
        working_df.loc[i, 'original_score'] = classifier_obj.predict_score_op(
            np.array([_x]))[0]

    working_df['cur_score'] = working_df['original_score'].values
    data_reference_df = working_df.copy()

    # Collect results over a randomized ordering of the records
    results_with_input = pd.DataFrame(columns=['idx', 'acc'])
    results_no_input = pd.DataFrame(columns=['idx', 'acc'])

    cur_df = data_reference_df.copy()
    # Randomization
    cur_df = cur_df.sample(frac=1).reset_index(drop=True)
    acc = execute_with_input(clf_obj=copy.deepcopy(classifier_obj),
                             working_df=cur_df,
                             domainInteraction_index=domainInteraction_index,
                             num_coeff=num_coeff,
                             emb_dim=emb_dim,
                             data_ID_to_matrix=data_ID_to_matrix,
                             check_next=top_K_count,
                             batch_size=feedback_batch_size)
    _tmpdf = pd.DataFrame(list(enumerate(acc)), columns=['idx', 'acc'])
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    results_with_input = pd.concat([results_with_input, _tmpdf], ignore_index=True)

    acc = execute_without_input(working_df=cur_df,
                                check_next=top_K_count,
                                batch_size=feedback_batch_size)
    _tmpdf = pd.DataFrame(list(enumerate(acc)), columns=['idx', 'acc'])
    results_no_input = pd.concat([results_no_input, _tmpdf], ignore_index=True)
    return results_with_input, results_no_input
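
Both returned frames share the ['idx', 'acc'] schema, so repeated randomized trials can be averaged per feedback round with a groupby. A hypothetical driver sketch:

# Hypothetical driver: average per-round accuracy over several randomized runs.
runs = [main_executor() for _ in range(5)]
mean_with_input = pd.concat([r[0] for r in runs]).groupby('idx')['acc'].mean()
mean_no_input = pd.concat([r[1] for r in runs]).groupby('idx')['acc'].mean()
print(mean_with_input.head())
print(mean_no_input.head())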
Example no. 4
# ============================================
anom_pos_df = pd.read_csv(anomalies_pos_fpath, index_col=None)
anom_neg_df = pd.read_csv(anomalies_neg_fpath, index_col=None)
# ============================================
# setup objects

serialID_to_entityID = get_serialID_to_entityID()
record_class.__setup_embedding__(embedding_data_path, serialID_to_entityID, _normalize=True)
emb_dim = record_class.embedding['HSCode'].shape[1]

# -------------------------------------------
obj_list = []
for i in tqdm(range(anom_neg_df.shape[0])):
    obj = record_class(anom_neg_df.iloc[i].to_dict(), -1)
    obj_list.append(obj)

for i in tqdm(range(anom_pos_df.shape[0])):
    obj = record_class(anom_pos_df.iloc[i].to_dict(), 1)
    obj_list.append(obj)

# Read in the explanations
with open(explantions_file_path, 'rt') as fh:
    explanations = json.load(fh)
explanations = {int(k): [sorted(_) for _ in v] for k, v in explanations.items()}

num_domains = len(domain_dims)
domain_idx = {e[0]: e[1] for e in enumerate(domain_dims.keys())}
domain_list = list(domain_dims.keys())
domainInteraction_index = {}
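
Example no. 4 stops right after initializing domainInteraction_index; the pairwise index it goes on to build appears verbatim in Examples no. 3 and no. 5:

# Assign each unordered domain pair (i < j) a sequential coefficient index,
# exactly as in Examples no. 3 and no. 5.
k = 0
for i in range(num_domains):
    for j in range(i + 1, num_domains):
        domainInteraction_index['_'.join((domain_idx[i], domain_idx[j]))] = k
        k += 1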
Example no. 5
def main_executor():
    global explantions_file_path
    global embedding_data_path
    global serialID_mapping_loc
    global anomalies_pos_fpath
    global anomalies_neg_fpath
    global domain_dims
    global test_data_serialized_loc
    global feedback_batch_size
    global DIR

    # ============================================

    anom_pos_df = pd.read_csv(anomalies_pos_fpath, index_col=None)
    anom_neg_df = pd.read_csv(anomalies_neg_fpath, index_col=None)

    # ============================================
    # setup objects

    serialID_to_entityID = get_serialID_to_entityID()
    record_class.__setup_embedding__(embedding_data_path, serialID_to_entityID, _normalize=True)
    emb_dim = record_class.embedding['HSCode'].shape[1]

    # main_data_df has the records with entity ids
    main_data_df = pd.concat([anom_pos_df, anom_neg_df], axis=0)
    main_data_df = utils.convert_to_UnSerializedID_format(main_data_df, DIR)
    # -------------------------------------------
    obj_list = []
    for i in tqdm(range(anom_neg_df.shape[0])):
        obj = record_class(anom_neg_df.iloc[i].to_dict(), -1)
        obj_list.append(obj)

    for i in tqdm(range(anom_pos_df.shape[0])):
        obj = record_class(anom_pos_df.iloc[i].to_dict(), 1)
        obj_list.append(obj)

    # Read in the explanations
    with open(explantions_file_path, 'rb') as fh:
        explanations = json.load(fh)

    explanations = {int(k): [sorted(_) for _ in v] for k, v in explanations.items()}
    num_domains = len(domain_dims)
    domain_idx = {e[0]: e[1] for e in enumerate(domain_dims.keys())}

    domainInteraction_index = {}
    k = 0
    for i in range(num_domains):
        for j in range(i + 1, num_domains):
            domainInteraction_index['_'.join((domain_idx[i], domain_idx[j]))] = k
            k += 1

    data_x = []
    data_id = []
    data_label = []
    data_ID_to_matrix = {}

    for _obj in obj_list:
        data_x.append(_obj.x)
        data_id.append(_obj.id)
        data_label.append(_obj.label)
        data_ID_to_matrix[_obj.id] = _obj.x

    data_x = np.stack(data_x)
    data_label = np.array(data_label)
    data_id = np.array(data_id)

    idx = np.arange(len(data_id), dtype=int)
    np.random.shuffle(idx)

    data_x = data_x[idx]
    data_label = data_label[idx]
    data_id = data_id[idx]

    X_0 = data_x  # Relevant anomalies
    X_1 = obtain_normal_samples()  # Nominal
    y_0 = np.ones(X_0.shape[0])
    y_1 = -1 * np.ones(X_1.shape[0])
    y = np.hstack([y_0, y_1])
    X = np.vstack([X_0, X_1])
    num_coeff = len(domainInteraction_index)
    classifier_obj = get_trained_classifier(X, y, num_domains, emb_dim)
    W = classifier_obj.W.cpu().data.numpy()
    emb_dim = W.shape[-1]

    # classifier_obj.predict_score_op(X_0)
    # Create a reference dataframe  :: data_reference_df
    data_reference_df = pd.DataFrame(
        data=np.vstack([data_id, data_label]).transpose(),
        columns=['PanjivaRecordID', 'label']
    )

    data_reference_df['baseID'] = data_reference_df['PanjivaRecordID'].apply(lambda x: str(x)[:-3])
    data_reference_df['expl_1'] = -1
    data_reference_df['expl_2'] = -1
    data_reference_df['original_score'] = 1

    for i, row in data_reference_df.iterrows():
        _id = int(row['PanjivaRecordID'])
        if _id in explanations.keys():
            entry = explanations[_id]
            domain_1 = entry[0][0]
            domain_2 = entry[0][1]
            data_reference_df.loc[i, 'expl_1'] = domainInteraction_index['_'.join(sorted([domain_1, domain_2]))]
            domain_1 = entry[1][0]
            domain_2 = entry[1][1]
            data_reference_df.loc[i, 'expl_2'] = domainInteraction_index['_'.join(sorted([domain_1, domain_2]))]
        _x = data_ID_to_matrix[_id]
        data_reference_df.loc[i, 'original_score'] = classifier_obj.predict_score_op(np.array([_x]))[0]

    data_reference_df['cur_score'] = data_reference_df['original_score'].values

    # Collect results over a randomized ordering of the records;
    # sample(frac=1) already yields a uniform random permutation.
    cur_df = data_reference_df.copy()
    cur_df = cur_df.sample(frac=1).reset_index(drop=True)
    check_next_values = [10, 20, 30, 40, 50]
    next_K_precision_wI, recall_wI = execute_with_input(
        clf_obj=copy.deepcopy(classifier_obj),
        working_df=cur_df,
        ref_data_df=main_data_df,
        domainInteraction_index=domainInteraction_index,
        num_coeff=num_coeff,
        emb_dim=emb_dim,
        data_ID_to_matrix=data_ID_to_matrix,
        check_next_values=check_next_values,
        batch_size=feedback_batch_size
    )

    next_K_precision_nI, recall_nI = execute_without_input(
        working_df=cur_df,
        batch_size=feedback_batch_size
    )

    def aux_create_result_df(
            next_K_precision, recall, check_next_values
    ):

        next_K_precision = np.array(next_K_precision)
        recall = np.array(recall)
        idx = np.arange(max(recall.shape[0], next_K_precision.shape[0]))

        # Zero-pad the precision rows so they align row-wise with idx/recall
        # (assumes recall is the longer series; only precision is padded)
        if next_K_precision.shape[0] < idx.shape[0]:
            pad = np.zeros([idx.shape[0] - next_K_precision.shape[0], next_K_precision.shape[1]])
            next_K_precision = np.concatenate([next_K_precision, pad], axis=0)

        columns = ['idx', 'recall'] + ['Prec@next_' + str(_) for _ in check_next_values]
        _data = np.concatenate([
            idx.reshape([-1, 1]),
            recall.reshape([-1, 1]),
            next_K_precision,
        ], axis=-1)
        result_df = pd.DataFrame(
            _data, columns=columns
        )
        return result_df

    results_with_input = aux_create_result_df(next_K_precision_wI, recall_wI, check_next_values)
    results_no_input = aux_create_result_df(next_K_precision_nI, recall_nI, check_next_values)
    return results_with_input, results_no_input
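
For reference, num_coeff = len(domainInteraction_index) is simply the number of unordered domain pairs, C(num_domains, 2). A standalone check with made-up domain names (only HSCode appears in the examples above):

from itertools import combinations

# Hypothetical domain -> cardinality map; only 'HSCode' comes from the examples.
domain_dims = {'HSCode': 100, 'Carrier': 50, 'PortOfLading': 30}
pairs = ['_'.join(p) for p in combinations(domain_dims, 2)]
index = {name: k for k, name in enumerate(pairs)}
assert len(index) == 3  # C(3, 2) = 3
print(index)  # {'HSCode_Carrier': 0, 'HSCode_PortOfLading': 1, 'Carrier_PortOfLading': 2}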