Example #1
    def predict(self, all_texts, batch_size=32, text_col=1):

        # when rows are tuples/lists, keep only the text column
        if isinstance(all_texts[0], (tuple, list)):
            all_texts = transform.map_func(all_texts,
                                           lambda row: row[text_col])

        output = []
        i = 0

        # process the input in slices of batch_size
        while i < len(all_texts):
            print("start processing {} / {}".format(i, len(all_texts)))
            batch_texts = all_texts[i:(i + batch_size)]
            examples = self.__get_examples(batch_texts)
            pred = self.__predict_batch(examples)
            pred_and_class = self.__get_class_from_pred(pred)

            assert (len(pred_and_class) == len(batch_texts))
            # attach the original text to each prediction row
            transform.map_func(
                range(len(pred_and_class)),
                lambda idx: pred_and_class[idx].append(batch_texts[idx]))

            output += pred_and_class

            i += batch_size

        return output
Example #2
    def __get_class_from_pred(self, pred):
        result = pred[1]
        result = result.detach().cpu().numpy()

        label = numpy.argmax(result, axis=1)  # predicted class = index of the highest score

        result = result.tolist()
        label = label.tolist()

        # append the argmax class to the end of each score row
        transform.map_func(range(len(result)),
                           lambda idx: result[idx].append(label[idx]))

        return result
Example #3
def bert_estimate(input_path, text_col, output_path, model_dir, gpu,
                  with_header):

    # assign GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu

    # load model
    if model_dir == '':
        model_dir = os.path.join(os.environ['FACTMINE_HOME'],
                                 'cache/company_model')

    print("Loading model from %s ...\n" % (model_dir))

    model = BertModel(model_dir)

    print("Calculating negative score, positive score, and argmax class...\n")

    dataset = csv_handler.csv_readlines(input_path)
    header = None
    if with_header == True:
        header = dataset[0]
        dataset = dataset[1:]

    # convert the 1-based column index to 0-based
    text_col = text_col - 1
    texts = transform.map_func(dataset, lambda row: row[text_col])

    preds = model.predict(texts)

    assert (len(dataset) == len(preds))

    # apply sigmoid
    preds = transform.map_func(
        preds, lambda quad:
        [util.sigmoid(quad[0]),
         util.sigmoid(quad[1]), quad[2], quad[3]])

    for i in range(len(dataset)):
        dataset[i].append(preds[i][0])
        dataset[i].append(preds[i][1])
        dataset[i].append(preds[i][2])

    if with_header == True:
        header.append('score_0')
        header.append('score_1')
        header.append('arg_class')
        dataset = [header] + dataset

    csv_handler.csv_writelines(output_path, dataset)

    if output_path != "":
        print("Finished. Results are ready at %s" % (output_path))
Example #4
def extract(data, typename):
    dataset = csv_handler.csv_readlines("./dataset/" + data + "set.csv")
    dataset = transformer.indexleft_func(dataset)
    dataset = transformer.map_func(dataset, lambda row:
                                   (row[0], row[1][1], row[1][2]))
    output_path = "./" + data + ".csv"

    def e_func(triplet):
        label = 0
        if triplet[2] == typename:
            label = 1
        return (triplet[0], triplet[1], label)

    final = transformer.map_func(dataset, e_func)
    csv_handler.csv_writelines(output_path, final)
Example #5
    def project_batch(self, texts):

        batch_tokens = transform.map_func(texts, lambda text: self.__tokenize(text, pad_to_max_length=True))

        batch_tensor = torch.tensor(batch_tokens).to(self.device)

        self.model.eval()
        # inference only, so no gradient tracking is needed
        with torch.no_grad():
            output = self.model(batch_tensor)

        token_embeddings = output[0].detach().cpu().numpy()

        # keep the embedding of the first token ([CLS]) for each sequence
        result = transform.map_func(token_embeddings, lambda row: row[0])

        return result
Example #6
def finetune(input_path, output_model_dir, text_col, label_col, model_dir, gpu, with_header):
    '''Train a new model or fine-tune an existing one on labeled data, and write the fine-tuned model to output_model_dir'''

    # assign GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu

    dataset = csv_handler.csv_readlines(input_path)
    header = None
    if with_header == True:
        header = dataset[0]
        dataset = dataset[1:]

    print("Loading source model from %s ...\n" % (model_dir))
    model = BertModel(model_dir)

    text_col = text_col - 1
    label_col = label_col - 1

    labels = transform.map_func(
        range(len(dataset)),
        lambda i: [i, dataset[i][text_col], dataset[i][label_col]])

    print("Fine-tuning with input labels")
    model.train(labels)

    model.checkpoint(output_model_dir)

    print("Finished. Fine-tuned model is ready at " + output_model_dir)
Example #7
def log_to_csv_with_auc_accuracy(y_true, y_pred, y_score, csv_log_file_path, identity_info="dataset"):
    labels = [0, 1]
    result = precision_recall_fscore_support(y_true, y_pred)

    row = []
    row.append(identity_info)

    # neg
    row.append('label ' + str(labels[0]) + ":")
    row.append(result[0][0])
    row.append(result[1][0])
    row.append(result[2][0])
    row.append(result[3][0])

    row.append(' ')
    
    # pos
    row.append('label ' + str(labels[1]) + ":")
    row.append(result[0][1])
    row.append(result[1][1])
    row.append(result[2][1])
    row.append(result[3][1])

    row.append(' ')

    # auc and accuracy
    y_pos_score = transform.map_func(y_score, lambda p : p[1])
    auc = metrics.roc_auc_score(y_true, y_pos_score)
    row.append(auc)

    accuracy = metrics.accuracy_score(y_true, y_pred)
    row.append(accuracy)

    csv_handler.append_row(csv_log_file_path, row)
Example #8
    def sample_rows(self, dataset, num_samples):
        assert (num_samples < len(dataset))
        idx_set = self.distinct_ints(num_samples, 0, len(dataset) - 1)
        idx_set = list(idx_set)
        idx_set.sort()
        result = transformer.map_func(idx_set, lambda idx: dataset[idx])
        return result
Example #9
def max_balancer(input_csv_path, output_csv_path='./output.csv'):
    dataset = csv_handler.csv_readlines(input_csv_path)

    pos_dataset = transform.filter_func(dataset, lambda row: row[2] == '1')
    neg_dataset = transform.filter_func(dataset, lambda row: row[2] == '0')

    assert (len(pos_dataset) <= len(neg_dataset))
    sampler = Sampler()
    neg_dataset = sampler.sample_rows(neg_dataset, len(pos_dataset))

    pos_ids = transform.map_func(pos_dataset, lambda row: row[0])
    neg_ids = transform.map_func(neg_dataset, lambda row: row[0])

    select_id_set = set(pos_ids + neg_ids)
    final = transform.filter_func(dataset, lambda row: row[0] in select_id_set)

    csv_handler.csv_writelines(output_csv_path, final)
Example #10
def transform(input_path, output_path):
    dataset = csv_handler.csv_readlines(input_path)
    dataset = dataset[1:]

    dataset = transformer.map_func(
        range(len(dataset)),
        lambda i: (i, dataset[i][0], 1 if dataset[i][1] == "True" else 0))
    csv_handler.csv_writelines(output_path, dataset)
Example #11
    def csv_split(self, percentage, first_output_path, second_output_path):
        assert (percentage > 0)
        assert (percentage < 1)
        # round the first-split size down to a whole number of rows
        first_size = int(percentage * len(self.dataset))
        first_idx = self.__sampled_idx(first_size)

        # first output
        first_dataset = transformer.map_func(first_idx,
                                             lambda idx: self.dataset[idx])
        csv_writelines(first_output_path, first_dataset)

        # second output
        first_idx_set = set(first_idx)
        second_idx = transformer.filter_func(
            range(len(self.dataset)), lambda idx: idx not in first_idx_set)
        second_dataset = transformer.map_func(second_idx,
                                              lambda idx: self.dataset[idx])
        csv_writelines(second_output_path, second_dataset)
Example #12
def extractor(anno_dir, id_to_file, paper_id):
    file_path = anno_dir + id_to_file[paper_id]
    label_sent_dataset = csv_handler.csv_readlines(file_path, delimit='\t')

    indexed_result = transformer.indexleft_func(label_sent_dataset)
    final = transformer.map_func(
        indexed_result, lambda p:
        (paper_id + "_" + str(p[0]), p[1][1], p[1][0]))
    return final
Example #13
def evaluate(input_path, col_true, col_pred, metric, output_path, with_header):
    '''Evaluate the quality of predictions with a metric (f1 by default), and output the metric scores'''

    result = []
    dataset = csv_handler.csv_readlines(input_path)
    if with_header == True:
        dataset = dataset[1:]

    col_true = int(col_true) - 1
    col_pred = int(col_pred) - 1
    y_true = transform.map_func(dataset, lambda row : int(row[col_true]))
    y_pred = transform.map_func(dataset, lambda row : int(row[col_pred]))

    def check_validity(class_array):
        for cls in class_array:
            assert(cls == 0 or cls == 1)
    check_validity(y_true)
    check_validity(y_pred)

    support_set = {'f1', 'accuracy', 'cohen', 'quad'}
    if metric not in support_set:
        sys.exit('please specify a valid metric in terms of f1, accuracy, cohen, or quad (i.e. precision_recall_fscore_support)')
    elif metric == 'f1':
        result.append(['f1'])
        result.append([f1_score(y_true, y_pred)])
    elif metric == 'accuracy':
        result.append(['accuracy'])
        result.append([accuracy_score(y_true, y_pred)])
    elif metric == 'cohen':
        result.append(['cohen'])
        result.append([cohen_kappa_score(y_true, y_pred)])
    elif metric == 'quad':
        (precision, recall, fscore, support) = precision_recall_fscore_support(y_true, y_pred)
        result.append(['class', 'precision', 'recall', 'fscore', 'support'])
        result.append([0, precision[0], recall[0], fscore[0], support[0]])
        result.append([1, precision[1], recall[1], fscore[1], support[1]])

    csv_handler.csv_writelines(output_path, result)
Example #14
def extract(data, typename):
    dataset = csv_handler.csv_readlines("./dataset/" + data + "_raw.csv")
    output_path = "./" + data + ".csv"

    def func_1(triplet):
        label = 0
        if triplet[2] == typename:
            label = 1
        return (triplet[0], triplet[1], label)

    e_func = func_1

    def func_2(triplet):
        label = 1
        if triplet[2] == typename:
            label = 0
        return (triplet[0], triplet[1], label)

    if typename == 'NoArgument':
        e_func = func_2

    final = transformer.map_func(dataset, e_func)
    csv_handler.csv_writelines(output_path, final)
Example #15
    def csv_shuf(self, num_samples, output_path):
        assert (len(self.dataset) >= num_samples)
        sampled_idx = self.__sampled_idx(num_samples)
        samples = transformer.map_func(sampled_idx,
                                       lambda idx: self.dataset[idx])
        csv_writelines(output_path, samples)
Example #16
def sep(dataset):
    sents = transformer.map_func(dataset, lambda triplet: triplet[1])
    labels = transformer.map_func(dataset, lambda triplet: int(triplet[2]))
    return (sents, labels)
Example #17
train_duration = train_finish_time - start_time
print("train time is " + str(train_duration))

print("predicting...")

predicted = text_clf.predict(X_dev)
predicted_proba = text_clf.predict_proba(X_dev)

assert (len(predicted_proba) == len(X_dev))
assert (len(X_dev) == len(y_dev))

print("logging...")

(precision, recall, fscore,
 support) = metrics.precision_recall_fscore_support(y_dev, predicted)

row = []
row.append(sys.argv[1])
row.append(precision[1])
row.append(recall[1])
row.append(fscore[1])

pos_predicted = transform.map_func(predicted_proba, lambda p: p[1])
auc = metrics.roc_auc_score(y_dev, pos_predicted)
row.append(auc)

accuracy = metrics.accuracy_score(y_dev, predicted)
row.append(accuracy)

csv_handler.append_row(log_file_path, row)
Example #18
sents = []
for child in root:
    tid = child.attrib['id']
    sentence = ""

    num_words = len(child)
    for i in range(num_words):
        sentence += child[i].text
        if i < num_words - 2:
            sentence += " "
    sents.append((tid, sentence))

import csv_handler
golds = csv_handler.csv_readlines(gold_path, delimit='\t')

import transform as transformer
assert(len(sents) == len(golds))
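# the sentence ids and the gold label ids must line up row by row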
for i in range(len(sents)):
    assert(sents[i][0] == golds[i][0])
final = transformer.map_func(
    range(len(sents)),
    lambda i: (sents[i][0], sents[i][1], golds[i][1]))

class CSV_Split(csv_handler.CSV_Handler):
    def __init__(self, dataset, seed = 0):
        self.seed = seed
        self.dataset = dataset

splitter = CSV_Split(final)
splitter.csv_split(0.2, "dev.csv", "train.csv")
Example #19
def select(split, dataset):
    final = transformer.filter_func(dataset, lambda row: row[3] == split)
    final = transformer.map_func(final, lambda row: (row[0], row[1], row[2]))
    return final
Example #20
        examples = []
        for (i, txt) in enumerate(batch):
            guid = "%s" % (i)
            text_a = txt 
            label = self.dummy_label
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
        

if __name__ == "__main__":

    data_path = sys.argv[1]
    model_dir = sys.argv[2]
    output_path = sys.argv[3]

    # load test dataset
    raw_dataset = csv_handler.csv_readlines(data_path)
    ids = transform.map_func(raw_dataset, lambda row : row[0])
    texts = transform.map_func(raw_dataset, lambda row : row[1])

    # load model
    model = BertModel(model_dir)

    pred = model.predict(texts, 100)

    assert(len(ids) == len(pred))
    output = transform.map_func(range(len(ids)), lambda idx : [ids[idx]] + pred[idx])
    csv_handler.csv_writelines(output_path, output)
    
Example #21
    def get_weights(self, targets):
        return transform.map_func(targets, lambda lb: self.weight_map[lb])
Example #22
# get idx for review_id, text, and funny_count
def get_triplet_idx(header):
    idx_id = header.index('review_id')
    idx_text = header.index('text')
    idx_count = header.index('funny')
    return (idx_id, idx_text, idx_count)


def selector(row, idx_id, idx_text, idx_count):
    r_id = row[idx_id]
    text = row[idx_text]
    funny_count = int(row[idx_count])
    is_funny = None
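    # 5 or more funny votes => funny (1), zero votes => not funny (0); counts in between stay None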
    if (funny_count >= 5):
        is_funny = 1
    elif (funny_count == 0):
        is_funny = 0
    return (r_id, text, is_funny)


(idx_id, idx_text, idx_count) = get_triplet_idx(dataset[0])

selected_datasets = transformer.map_func(
    dataset[1:], lambda line: selector(line, idx_id, idx_text, idx_count))

final_datasets = transformer.filter_func(selected_datasets,
                                         lambda row: row[2] is not None)

csv_handler.csv_writelines(review_output_path, final_datasets)
Example #23
    y_pred_col = int(sys.argv[3])
    num_threds = int(sys.argv[4])
    csv_output_path = sys.argv[5]

    print_header = '1'
    if len(sys.argv) > 6:
        print_header = sys.argv[6]
    assert (print_header == '1' or print_header == '0')

    thred_method = "min_max_even"
    if os.path.exists(csv_output_path):
        os.remove(csv_output_path)

    csv_dataset = csv_handler.csv_readlines(csv_input_path)

    y_true = transform.map_func(csv_dataset, lambda row: int(row[y_true_col]))
    y_pred_score = transform.map_func(csv_dataset,
                                      lambda row: float(row[y_pred_col]))

    #y_pred_score = transform.map_func(y_pred_score, lambda score : 1 / (1 + math.exp(-score)))

    thred_col = []

    if thred_method == "min_max_even":
        thred_col = get_threds_by_min_max_even(y_pred_score, num_threds)

    else:  # sorted score even slot
        thred_col = get_threds_by_sorted_score_equal_length(
            y_pred_score, num_threds)

    if print_header == '1':
Example #24
    def __init__(self, csv_file_path):
        dataset = csv_handler.csv_readlines(csv_file_path)
        labels = transform.map_func(dataset, lambda t: t[2])
        self.weight_map = weight_class(labels)
Example #25
                                        delimit='\t',
                                        quoter=csv.QUOTE_NONE)
    records = records[1:]

    def row_functor(i, records):
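        # rebuild each record as (row_id, sentence, label, split)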
        assert (i < len(records))
        row = records[i]

        rid = row[0] + "_" + str(i)
        sent = row[4]
        label = row[5]
        split = row[6]

        return (rid, sent, label, split)

    records = transformer.map_func(range(len(records)),
                                   lambda i: row_functor(i, records))

    print(len(records))
    dataset = dataset + records[1:]
print(len(dataset))
print(dataset[0])


def select(split, dataset):
    final = transformer.filter_func(dataset, lambda row: row[3] == split)
    final = transformer.map_func(final, lambda row: (row[0], row[1], row[2]))
    return final


train_set = select("train", dataset)
print(len(train_set))
Example #26
    def evaluate(self, dev_dataset):
        preds = self.predict(dev_dataset)
        ground = transform.map_func(dev_dataset, lambda row: int(row[2]))
        (precision, recall, fscore,
         support) = precision_recall_fscore_support(ground, preds)
        return fscore
Example #27
test_id_dataset = open(test_id_path, "r").read().splitlines()

from os import listdir
from os.path import isfile, join

files = listdir(anno_dir)
files = transformer.filter_func(files, lambda name: 'rating' in name)


def get_id(file_name):
    idx = file_name.index("rating")
    return file_name[:idx - 1]


# check completeness
anno_ids = transformer.map_func(files, get_id)
anno_ids.sort()

train_test_ids = train_id_dataset + test_id_dataset
train_test_ids.sort()
assert (anno_ids == train_test_ids)

id_to_file = transformer.map_func(files, lambda file: (get_id(file), file))
id_to_file = dict(id_to_file)


def extractor(anno_dir, id_to_file, paper_id):
    file_path = anno_dir + id_to_file[paper_id]
    label_sent_dataset = csv_handler.csv_readlines(file_path, delimit='\t')

    indexed_result = transformer.indexleft_func(label_sent_dataset)
Example #28
    def train(self,
              labeled_dataset,
              train_batch_size=32,
              num_epoch=5,
              adam_lr=2e-5,
              adam_epsilon=1e-8,
              scheduler_warmup_steps=0,
              text_col=1,
              label_col=2):

        # prepare training data
        texts = transform.map_func(labeled_dataset, lambda tri: tri[text_col])

        labels = transform.map_func(labeled_dataset,
                                    lambda tri: tri[label_col])

        train_examples = self.__get_examples(texts, labels)

        train_dataset = self.__get_inputs(train_examples, ["0", "1"])

        train_dataset = TensorDataset(train_dataset['input_ids'],
                                      train_dataset['attention_mask'],
                                      train_dataset['token_type_ids'],
                                      train_dataset['labels'])

        train_sampler = RandomSampler(train_dataset)

        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=train_batch_size)

        # prepare optimizer
        optimizer = AdamW(self.model.parameters(),
                          lr=adam_lr,
                          eps=adam_epsilon)

        # prepare scheduler
        t_total = len(train_dataloader) * num_epoch

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=scheduler_warmup_steps,
            num_training_steps=t_total)

        # start training
        self.model.zero_grad()

        for _ in trange(0, num_epoch, desc="Training Epoch"):
            num_step_per_epoch = len(train_dataloader)
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3]
                }
                outputs = self.model(**inputs)
                loss = outputs[0]
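                # print the training loss four times per epoch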
                if step in {
                        int(num_step_per_epoch / 4),
                        int(num_step_per_epoch * 2 / 4),
                        int(num_step_per_epoch * 3 / 4), num_step_per_epoch - 1
                }:
                    print("\n training loss is " + str(loss.item()))
                loss.backward()
                optimizer.step()
                scheduler.step()
                self.model.zero_grad()

        # return trained model
        return self.model
Example #29
import sys
sys.path.insert(0, "../../pyfunctor")

import transform
import csv_handler

if __name__ == "__main__":
    result = []

    first_file = sys.argv[1]
    first_idx = sys.argv[2].split(',')
    first_idx = transform.map_func(first_idx, lambda idx: int(idx))

    second_file = sys.argv[3]
    second_idx = sys.argv[4].split(',')
    second_idx = transform.map_func(second_idx, lambda idx: int(idx))

    output_csv_file = sys.argv[5]

    first_dataset = csv_handler.csv_readlines(first_file)
    result = transform.map_func(first_dataset,
                                lambda row: [row[idx] for idx in first_idx])

    second_dataset = csv_handler.csv_readlines(second_file)
    second_result = transform.map_func(
        second_dataset, lambda row: [row[idx] for idx in second_idx])

    assert (len(first_dataset) == len(second_dataset))

    final = transform.map_func(zip(result, second_result),
                               lambda p: p[0] + p[1])
Example #30
root_directory = '../../'
sys.path.insert(0, root_directory + "pyfunctor")

import transform as transformer
import csv_handler as csv_handler

input_path = "./goodreads_reviews_spoiler.json"
output_path = "spoiler.csv"

with open(input_path, "r") as fin:
    dataset = fin.readlines()

json_dataset = transformer.map_func(dataset, lambda line: json.loads(line))


def format_func(line):
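    # rewrite the bare has_spoiler boolean as a quoted string so it survives
    # json.loads as "true"/"false"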
    num_true = line.count('"has_spoiler": true,')
    num_false = line.count('"has_spoiler": false,')
    if num_true == 0:
        assert (num_false == 1)
        line = line.replace('"has_spoiler": false,', '"has_spoiler": "false",')
    elif num_true == 1:
        assert (num_false == 0)
        line = line.replace('"has_spoiler": true,', '"has_spoiler": "true",')
    else:
        assert (False)

    line = json.loads(line)