Example #1
def get_inputs(inp_dir, dataset, embed, embed_mode, mode, layer):
    """ Read data from pkl file and prepare for training. """
    file = open(
        inp_dir + dataset + '-' + embed + '-' + embed_mode + '-' + mode +
        '.pkl', 'rb')
    data = pickle.load(file)
    orders, data_x, data_y = list(zip(*data))
    file.close()

    # alphaW weights the BERT hidden layers: uniform weights average all
    # n_hl layers, while a one-hot vector selects a single layer
    if layer == 'all':
        alphaW = np.full([n_hl], 1 / n_hl)

    else:
        alphaW = np.zeros([n_hl])
        alphaW[int(layer) - 1] = 1

    # flatten the tuples of mini-batches and collapse the BERT layer dimension with alphaW
    inputs = []
    targets = []
    author_ids = []

    n_batches = len(data_y)
    print('n_batches: ', len(orders))

    for ii in range(n_batches):
        inputs.extend(np.einsum('k,kij->ij', alphaW, data_x[ii]))
        targets.extend(data_y[ii])
        author_ids.extend(orders[ii])

    print('inputs shape: ', np.array(inputs).shape)
    print('author_ids shape: ', np.array(author_ids).shape)

    inputs = pd.DataFrame(np.array(inputs))
    inputs['order'] = author_ids
    inputs = inputs.set_index(['order'])
    full_targets = pd.DataFrame(np.array(targets))
    full_targets['order'] = author_ids
    full_targets = full_targets.set_index(['order'])

    if dataset == 'essays':
        dump_data = dataset_processors.load_essays_df(
            '../data/essays/essays.csv')
        trait_labels = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']

    elif dataset == 'kaggle':
        dump_data = dataset_processors.load_Kaggle_df(
            '../data/kaggle/kaggle.csv')
        trait_labels = ['E', 'N', 'F', 'J']

    _, _, _, other_features_df = feature_utils.get_psycholinguist_data(
        dump_data, dataset, feature_flags)
    inputs, full_targets = merge_features(inputs, other_features_df,
                                          trait_labels)

    return inputs, full_targets, trait_labels
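The einsum above is what collapses the stacked BERT layers into a single embedding per example. A minimal sketch of that weighting, with hypothetical shapes (n_hl = 12 layers, 32 examples, 768-dimensional embeddings):

import numpy as np

n_hl = 12                                  # number of BERT hidden layers (assumed)
batch = np.random.rand(n_hl, 32, 768)      # one mini-batch: (layers, examples, hidden_dim)

# layer == 'all': average every layer with equal weight
alphaW = np.full([n_hl], 1 / n_hl)
averaged = np.einsum('k,kij->ij', alphaW, batch)    # -> (32, 768)

# layer == '11': a one-hot weight vector picks out a single layer
alphaW = np.zeros([n_hl])
alphaW[11 - 1] = 1
single = np.einsum('k,kij->ij', alphaW, batch)      # identical to batch[10]

print(averaged.shape, np.allclose(single, batch[10]))   # (32, 768) True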
Example #2
def get_inputs(inp_dir, dataset, embed, embed_mode, mode, layer):
    """Read data from pkl file and prepare for training."""
    file = open(
        inp_dir + dataset + "-" + embed + "-" + embed_mode + "-" + mode + ".pkl", "rb"
    )
    data = pickle.load(file)
    orders, data_x, data_y = list(zip(*data))
    file.close()

    # alphaW weights the BERT hidden layers: uniform weights average all
    # n_hl layers, while a one-hot vector selects a single layer
    if layer == "all":
        alphaW = np.full([n_hl], 1 / n_hl)

    else:
        alphaW = np.zeros([n_hl])
        alphaW[int(layer) - 1] = 1

    # flatten the tuples of mini-batches and collapse the BERT layer dimension with alphaW
    inputs = []
    targets = []
    author_ids = []

    n_batches = len(data_y)
    print("n_batches: ", len(orders))

    for ii in range(n_batches):
        inputs.extend(np.einsum("k,kij->ij", alphaW, data_x[ii]))
        targets.extend(data_y[ii])
        author_ids.extend(orders[ii])

    print("inputs shape: ", np.array(inputs).shape)
    print("author_ids shape: ", np.array(author_ids).shape)

    inputs = pd.DataFrame(np.array(inputs))
    inputs["order"] = author_ids
    inputs = inputs.set_index(["order"])
    full_targets = pd.DataFrame(np.array(targets))
    full_targets["order"] = author_ids
    full_targets = full_targets.set_index(["order"])

    if dataset == "essays":
        dump_data = dataset_processors.load_essays_df("../data/essays/essays.csv")
        trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]

    elif dataset == "kaggle":
        dump_data = dataset_processors.load_Kaggle_df("../data/kaggle/kaggle.csv")
        trait_labels = ["E", "N", "F", "J"]

    _, _, _, other_features_df = feature_utils.get_psycholinguist_data(
        dump_data, dataset, feature_flags
    )
    inputs, full_targets = merge_features(inputs, other_features_df, trait_labels)

    return inputs, full_targets, trait_labels
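Both DataFrames above are indexed by the author id stored in 'order', so merge_features (defined elsewhere in the repository) can align the BERT features with the psycholinguistic features regardless of row order. A toy illustration of that index-based alignment, with hypothetical ids and column names:

import numpy as np
import pandas as pd

ids = ['auth_1', 'auth_2', 'auth_3']       # hypothetical author ids
bert_df = pd.DataFrame(np.random.rand(3, 4), index=pd.Index(ids, name='order'))
psy_df = pd.DataFrame({'nrc_pos': [0.1, 0.4, 0.2]}, index=pd.Index(ids, name='order'))

joined = bert_df.join(psy_df)    # rows line up on the shared 'order' index
print(joined.shape)              # (3, 5)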
Example #3
def get_inputs(dataset):
    """Read data from metafeature files and prepare for training."""
    if dataset == "essays":
        dump_data = dataset_processors.load_essays_df(
            "../data/essays/essays.csv")
        trait_labels = ["EXT", "NEU", "AGR", "CON", "OPN"]
    elif dataset == "kaggle":
        dump_data = dataset_processors.load_Kaggle_df(
            "../data/kaggle/kaggle.csv")
        trait_labels = ["E", "N", "F", "J"]
    print("dataset loaded! Getting psycholinguistic features...")
    inputs, full_targets, _, _ = feature_utils.get_psycholinguist_data(
        dump_data, dataset, feature_flags)
    inputs = np.array(inputs)
    full_targets = np.array(full_targets)

    return inputs, full_targets, trait_labels
Example #4
def get_inputs(dataset):
    """ Read data from metafeature files and prepare for training. """
    if dataset == 'essays':
        dump_data = dataset_processors.load_essays_df(
            '../data/essays/essays.csv')
        trait_labels = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']
    elif dataset == 'kaggle':
        dump_data = dataset_processors.load_Kaggle_df(
            '../data/kaggle/kaggle.csv')
        trait_labels = ['E', 'N', 'F', 'J']
    print('dataset loaded! Getting psycholinguistic features...')
    inputs, full_targets, _, _ = feature_utils.get_psycholinguist_data(
        dump_data, dataset, feature_flags)
    inputs = np.array(inputs)
    full_targets = np.array(full_targets)

    return inputs, full_targets, trait_labels
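A minimal usage sketch, assuming the module-level feature_flags is defined as in Example #5, the CSV exists at the hard-coded relative path, and full_targets holds one column per trait label:

from sklearn.model_selection import train_test_split

inputs, full_targets, trait_labels = get_inputs('essays')

# hold out 20% of authors and train/evaluate one trait at a time (here the first, 'EXT')
X_train, X_test, y_train, y_test = train_test_split(
    inputs, full_targets[:, 0], test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape, trait_labels[0])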
Example #5
print(network)

# toggle the four psycholinguistic feature groups: NRC, NRC-VAD,
# readability metrics, and Mairesse features
nrc, nrc_vad, readability, mairesse = [True, True, True, True]
feature_flags = [nrc, nrc_vad, readability, mairesse]

np.random.seed(jobid)
tf.random.set_seed(jobid)

start = time.time()

if __name__ == "__main__":
    if dataset == 'essays':
        dump_data = dataset_processors.load_essays_df('data/essays/essays.csv')
        trait_labels = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']
    elif dataset == 'kaggle':
        dump_data = dataset_processors.load_Kaggle_df('data/kaggle/kaggle.csv')
        trait_labels = ['E', 'N', 'F', 'J']
    print('dataset loaded! Getting psycholinguistic features...')
    inputs, full_targets, feature_names, _ = feature_utils.get_psycholinguist_data(
        dump_data, dataset, feature_flags)
    inputs = np.array(inputs)
    full_targets = np.array(full_targets)

    print(inputs.shape)
    print(full_targets.shape)
    print(feature_names)
    print('starting k-fold cross validation...')

    n_splits = 10
    fold_acc = {}
    expdata = {}
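The snippet ends where the cross-validation loop would start. A sketch of how a 10-fold loop over each trait might continue, using a placeholder scikit-learn classifier rather than the script's own network, and assuming full_targets has one discrete label column per trait (jobid and the other names come from earlier in the script):

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

for t, trait in enumerate(trait_labels):
    y = full_targets[:, t]
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=jobid)
    accs = []
    for train_idx, test_idx in skf.split(inputs, y):
        clf = LogisticRegression(max_iter=1000)   # placeholder model, not the repo's network
        clf.fit(inputs[train_idx], y[train_idx])
        accs.append(accuracy_score(y[test_idx], clf.predict(inputs[test_idx])))
    fold_acc[trait] = accs
    print(trait, np.mean(accs))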