def get_inputs(inp_dir, dataset, embed, embed_mode, mode, layer):
    """Read data from pkl file and prepare for training."""
    with open(
        inp_dir + dataset + "-" + embed + "-" + embed_mode + "-" + mode + ".pkl", "rb"
    ) as file:
        data = pickle.load(file)
        orders, data_x, data_y = list(zip(*data))

    # alphaW selects which BERT layer embedding to use: an equal-weight average
    # over all layers, or a one-hot vector picking out a single layer
    if layer == "all":
        alphaW = np.full([n_hl], 1 / n_hl)
    else:
        alphaW = np.zeros([n_hl])
        alphaW[int(layer) - 1] = 1

    # flatten the minibatch tuples and extract the required BERT layer output using alphaW
    inputs = []
    targets = []
    author_ids = []
    n_batches = len(data_y)
    print(len(orders))

    for ii in range(n_batches):
        inputs.extend(np.einsum("k,kij->ij", alphaW, data_x[ii]))
        targets.extend(data_y[ii])
        author_ids.extend(orders[ii])

    print("inputs shape: ", np.array(inputs).shape)
    print("author_ids shape: ", np.array(author_ids).shape)

    # index embeddings and targets by author id so other feature sets can be joined on it
    inputs = pd.DataFrame(np.array(inputs))
    inputs["order"] = author_ids
    inputs = inputs.set_index(["order"])
    full_targets = pd.DataFrame(np.array(targets))
    full_targets["order"] = author_ids
    full_targets = full_targets.set_index(["order"])

    if dataset == "essays":
        dump_data = dataset_processors.load_essays_df("../data/essays/essays.csv")
        trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]
    elif dataset == "kaggle":
        dump_data = dataset_processors.load_Kaggle_df("../data/kaggle/kaggle.csv")
        trait_labels = ["E", "N", "F", "J"]

    _, _, _, other_features_df = feature_utils.get_psycholinguist_data(
        dump_data, dataset, feature_flags
    )
    inputs, full_targets = merge_features(inputs, other_features_df, trait_labels)
    return inputs, full_targets, trait_labels
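# --- Usage sketch (not part of the original module) ---
# Illustrative call only: the argument values below are hypothetical, and the call
# assumes the module-level globals n_hl (number of BERT hidden layers) and
# feature_flags have already been set, as in the driver script further down.
# inputs, full_targets, trait_labels = get_inputs(
#     "pkl_data/", "essays", "bert-base", "cls", "512_head", "11"
# )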
def get_inputs(dataset):
    """Read data from metafeature files and prepare for training."""
    if dataset == "essays":
        dump_data = dataset_processors.load_essays_df("../data/essays/essays.csv")
        trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]
    elif dataset == "kaggle":
        dump_data = dataset_processors.load_Kaggle_df("../data/kaggle/kaggle.csv")
        trait_labels = ["E", "N", "F", "J"]

    print("dataset loaded! Getting psycholinguistic features...")
    inputs, full_targets, _, _ = feature_utils.get_psycholinguist_data(
        dump_data, dataset, feature_flags
    )
    inputs = np.array(inputs)
    full_targets = np.array(full_targets)
    return inputs, full_targets, trait_labels
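# --- Usage sketch (not part of the original module) ---
# Illustrative only: assumes the module-level feature_flags list has been set,
# as in the driver script below.
# inputs, full_targets, trait_labels = get_inputs("essays")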
print(network)

# toggle which psycholinguistic feature groups are used
nrc, nrc_vad, readability, mairesse = [True, True, True, True]
feature_flags = [nrc, nrc_vad, readability, mairesse]

np.random.seed(jobid)
tf.random.set_seed(jobid)

start = time.time()

if __name__ == "__main__":
    if dataset == "essays":
        dump_data = dataset_processors.load_essays_df("data/essays/essays.csv")
        trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]
    elif dataset == "kaggle":
        dump_data = dataset_processors.load_Kaggle_df("data/kaggle/kaggle.csv")
        trait_labels = ["E", "N", "F", "J"]

    print("dataset loaded! Getting psycholinguistic features...")
    inputs, full_targets, feature_names, _ = feature_utils.get_psycholinguist_data(
        dump_data, dataset, feature_flags
    )
    inputs = np.array(inputs)
    full_targets = np.array(full_targets)

    print(inputs.shape)
    print(full_targets.shape)
    print(feature_names)

    print("starting k-fold cross validation...")
    n_splits = 10
    fold_acc = {}
    expdata = {}
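    # --- Sketch (not from the original script): one way the per-trait stratified
    # k-fold loop might continue from here. StratifiedKFold and LogisticRegression
    # are assumptions for illustration, not necessarily the model used upstream.
    #
    # from sklearn.model_selection import StratifiedKFold
    # from sklearn.linear_model import LogisticRegression
    #
    # for trait_idx, trait in enumerate(trait_labels):
    #     targets = full_targets[:, trait_idx]
    #     skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=jobid)
    #     scores = []
    #     for train_idx, test_idx in skf.split(inputs, targets):
    #         clf = LogisticRegression(max_iter=1000)
    #         clf.fit(inputs[train_idx], targets[train_idx])
    #         scores.append(clf.score(inputs[test_idx], targets[test_idx]))
    #     fold_acc[trait] = scores
    #     print(trait, ": mean fold accuracy =", np.mean(scores))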