Esempio n. 1
0
def max_balancer(input_csv_path, output_csv_path='./output.csv'):
    """Downsample the negative class so the output CSV is label-balanced.

    Reads rows from ``input_csv_path`` (label expected in column 2 as the
    string '1' or '0'), randomly samples the negatives down to the number of
    positives, and writes the balanced subset to ``output_csv_path``.

    Raises:
        ValueError: if positives outnumber negatives (nothing to downsample).
    """
    dataset = csv_handler.csv_readlines(input_csv_path)

    pos_dataset = transform.filter_func(dataset, lambda row: row[2] == '1')
    neg_dataset = transform.filter_func(dataset, lambda row: row[2] == '0')

    # Explicit raise instead of the original `assert`: asserts are stripped
    # under `python -O`, silently disabling this precondition check.
    if len(pos_dataset) > len(neg_dataset):
        raise ValueError('expected the positive class to be the minority class')

    sampler = Sampler()
    neg_dataset = sampler.sample_rows(neg_dataset, len(pos_dataset))

    pos_ids = transform.map_func(pos_dataset, lambda row: row[0])
    neg_ids = transform.map_func(neg_dataset, lambda row: row[0])

    # Re-filter the full dataset by id (rather than concatenating the two
    # subsets) so the output preserves the original row order.
    select_id_set = set(pos_ids + neg_ids)
    final = transform.filter_func(dataset, lambda row: row[0] in select_id_set)

    csv_handler.csv_writelines(output_csv_path, final)
Esempio n. 2
0
    def csv_split(self, percentage, first_output_path, second_output_path):
        """Randomly split ``self.dataset`` between two CSV files.

        A ``percentage`` fraction (exclusive 0..1) of the rows is sampled
        into ``first_output_path``; every row not sampled is written, in
        original order, to ``second_output_path``.
        """
        assert (percentage > 0)
        assert (percentage < 1)
        # NOTE(review): this size is a float; presumably __sampled_idx
        # truncates or rounds it — confirm against its implementation.
        sample_size = percentage * len(self.dataset)
        sampled_indices = self.__sampled_idx(sample_size)

        # First output: the sampled rows.
        csv_writelines(
            first_output_path,
            transformer.map_func(sampled_indices, lambda i: self.dataset[i]))

        # Second output: the complement of the sample, kept in dataset order.
        sampled_set = set(sampled_indices)
        remaining_indices = transformer.filter_func(
            range(len(self.dataset)), lambda i: i not in sampled_set)
        csv_writelines(
            second_output_path,
            transformer.map_func(remaining_indices, lambda i: self.dataset[i]))
Esempio n. 3
0
def select(split, dataset):
    """Return (id, text, label) triples for rows whose split tag (column 3)
    equals ``split``."""
    matching = transformer.filter_func(dataset, lambda row: row[3] == split)
    return transformer.map_func(matching,
                                lambda row: (row[0], row[1], row[2]))
Esempio n. 4
0
train_id_path = data_dir + "train_ids.txt"
test_id_path = data_dir + "test_ids.txt"
anno_dir = data_dir + '/iclr_anno_final/'
# NOTE(review): the "./" infix only resolves correctly if data_dir ends in a
# path separator — confirm; kept byte-identical to preserve the exact paths.
train_output_path = data_dir + "./trainset.csv"
test_output_path = data_dir + "./devset.csv"

import csv_handler as csv_handler

# Use context managers so the id files are closed promptly; the original
# `open(...).read()` calls leaked both file handles.
with open(train_id_path, "r") as train_file:
    train_id_dataset = train_file.read().splitlines()

with open(test_id_path, "r") as test_file:
    test_id_dataset = test_file.read().splitlines()

from os import listdir
from os.path import isfile, join

# Keep only the per-paper rating annotation files.
files = listdir(anno_dir)
files = transformer.filter_func(files, lambda name: 'rating' in name)


def get_id(file_name):
    """Return the id prefix of an annotation file name.

    Everything from the first occurrence of "rating" onward is dropped,
    along with the single separator character just before it.
    """
    return file_name[:file_name.index("rating") - 1]


# Check completeness: the annotation files must cover exactly the union of
# the train and test id lists.
anno_ids = transformer.map_func(files, get_id)  # pass get_id directly; the
# original wrapped it in a redundant `lambda file_name: get_id(file_name)`.
anno_ids.sort()

train_test_ids = train_id_dataset + test_id_dataset
train_test_ids.sort()
assert (anno_ids == train_test_ids)
Esempio n. 5
0
# Column lookup for the review CSV header.
def get_triplet_idx(header):
    """Locate the (review_id, text, funny) column indices in a CSV header row."""
    idx_id, idx_text, idx_count = (
        header.index(column) for column in ('review_id', 'text', 'funny'))
    return (idx_id, idx_text, idx_count)


def selector(row, idx_id, idx_text, idx_count):
    """Project a raw CSV row to a (review_id, text, is_funny) triple.

    is_funny is 1 for at least five funny votes, 0 for exactly zero votes,
    and None for the ambiguous 1-4 range (callers filter the Nones out).
    """
    funny_count = int(row[idx_count])
    if funny_count >= 5:
        label = 1
    elif funny_count == 0:
        label = 0
    else:
        label = None
    return (row[idx_id], row[idx_text], label)


# Resolve the column positions once from the header row.
(idx_id, idx_text, idx_count) = get_triplet_idx(dataset[0])

selected_datasets = transformer.map_func(
    dataset[1:], lambda line: selector(line, idx_id, idx_text, idx_count))

# Drop the ambiguous rows (selector labels 1-4 funny votes as None).
# `is not None` is the idiomatic identity check; the original used `!= None`.
final_datasets = transformer.filter_func(selected_datasets,
                                         lambda row: row[2] is not None)

csv_handler.csv_writelines(review_output_path, final_datasets)
Esempio n. 6
0
def distill(input_path, label_map):
    """Read a tab-separated file and return [token, mapped_label] pairs.

    Rows with fewer than five columns are skipped.  The label key is the
    first character of the stripped fifth column, looked up in ``label_map``.
    """
    rows = csv_handler.csv_readlines(input_path, delimit='\t')
    valid_rows = transform.filter_func(rows, lambda row: len(row) >= 5)
    return transform.map_func(
        valid_rows,
        lambda row: [row[0].strip(), label_map[row[4].strip()[0]]])