Example #1
def finetune(input_path, output_model_dir, text_col, label_col, model_dir, gpu, with_header):
    '''Train a new model or finetune an existing model with labels, output fine-tuned model'''

    # assign GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu

    dataset = csv_handler.csv_readlines(input_path)
    header = None
    if with_header:
        header = dataset[0]
        dataset = dataset[1:]

    print("Loading source model from %s ...\n" % (model_dir))
    model = BertModel(model_dir)

    text_col = text_col - 1
    label_col = label_col - 1

    labels = transform.map_func(range(len(dataset)), lambda i : [i, dataset[i][text_col], dataset[i][label_col]])

    print("Fine-tuning with input labels")
    model.train(labels)

    model.checkpoint(output_model_dir)

    print("Finished. Fine-tuned model is ready at " + output_model_dir)
Example #2
def transform(input_path, output_path):
    dataset = csv_handler.csv_readlines(input_path)
    dataset = dataset[1:]

    dataset = transformer.map_func(
        range(len(dataset)),
        lambda i: (i, dataset[i][0], 1 if dataset[i][1] == "True" else 0))
    csv_handler.csv_writelines(output_path, dataset)
Example #3
def extractor(anno_dir, id_to_file, paper_id):
    file_path = anno_dir + id_to_file[paper_id]
    label_sent_dataset = csv_handler.csv_readlines(file_path, delimit='\t')

    indexed_result = transformer.indexleft_func(label_sent_dataset)
    final = transformer.map_func(
        indexed_result, lambda p:
        (paper_id + "_" + str(p[0]), p[1][1], p[1][0]))
    return final
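
# Usage sketch (hypothetical ids/paths): id_to_file maps a paper id to its
# tab-separated annotation file inside anno_dir, and each returned row is
# (paper_id + "_" + line_index, sentence, label).
rows = extractor('./annotations/', {'P001': 'P001.tsv'}, 'P001')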
Example #4
def bert_estimate(input_path, text_col, output_path, model_dir, gpu,
                  with_header):

    # assign GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu

    # load model
    if model_dir == '':
        model_dir = os.path.join(os.environ['FACTMINE_HOME'],
                                 'cache/company_model')

    print("Loading model from %s ...\n" % (model_dir))

    model = BertModel(model_dir)

    print("Calculating negative score, positive score, and argmax class...\n")

    dataset = csv_handler.csv_readlines(input_path)
    header = None
    if with_header:
        header = dataset[0]
        dataset = dataset[1:]

    text_col = text_col - 1
    texts = transform.map_func(dataset, lambda row: row[text_col])

    preds = model.predict(texts)

    assert (len(dataset) == len(preds))

    # apply sigmoid
    preds = transform.map_func(
        preds, lambda quad:
        [util.sigmoid(quad[0]),
         util.sigmoid(quad[1]), quad[2], quad[3]])

    for i in range(len(dataset)):
        dataset[i].append(preds[i][0])
        dataset[i].append(preds[i][1])
        dataset[i].append(preds[i][2])

    if with_header:
        header.append('score_0')
        header.append('score_1')
        header.append('arg_class')
        dataset = [header] + dataset

    csv_handler.csv_writelines(output_path, dataset)

    if output_path != "":
        print("Finished, Results are ready at %s " % (output_path))
Example #5
def extract(data, typename):
    dataset = csv_handler.csv_readlines("./dataset/" + data + "set.csv")
    dataset = transformer.indexleft_func(dataset)
    dataset = transformer.map_func(dataset, lambda row:
                                   (row[0], row[1][1], row[1][2]))
    output_path = "./" + data + ".csv"

    def e_func(triplet):
        label = 0
        if triplet[2] == typename:
            label = 1
        return (triplet[0], triplet[1], label)

    final = transformer.map_func(dataset, e_func)
    csv_handler.csv_writelines(output_path, final)
Example #6
def max_balancer(input_csv_path, output_csv_path='./output.csv'):
    dataset = csv_handler.csv_readlines(input_csv_path)

    pos_dataset = transform.filter_func(dataset, lambda row: row[2] == '1')
    neg_dataset = transform.filter_func(dataset, lambda row: row[2] == '0')

    assert (len(pos_dataset) <= len(neg_dataset))
    sampler = Sampler()
    neg_dataset = sampler.sample_rows(neg_dataset, len(pos_dataset))

    pos_ids = transform.map_func(pos_dataset, lambda row: row[0])
    neg_ids = transform.map_func(neg_dataset, lambda row: row[0])

    select_id_set = set(pos_ids + neg_ids)
    final = transform.filter_func(dataset, lambda row: row[0] in select_id_set)

    csv_handler.csv_writelines(output_csv_path, final)
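
# Usage sketch (hypothetical paths): downsamples the '0' class to the size of
# the '1' class, so the positive class must be the minority (see the assert).
max_balancer('train_unbalanced.csv', './train_balanced.csv')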
Example #7
def extract(data, typename):
    dataset = csv_handler.csv_readlines("./dataset/" + data + "_raw.csv")
    output_path = "./" + data + ".csv"

    def func_1(triplet):
        label = 0
        if triplet[2] == typename:
            label = 1
        return (triplet[0], triplet[1], label)

    e_func = func_1

    def func_2(triplet):
        label = 1
        if triplet[2] == typename:
            label = 0
        return (triplet[0], triplet[1], label)

    if typename == 'NoArgument':
        e_func = func_2

    final = transformer.map_func(dataset, e_func)
    csv_handler.csv_writelines(output_path, final)
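
# Usage sketch (hypothetical split name): reads ./dataset/train_raw.csv and
# writes ./train.csv; with typename 'NoArgument' the labeling is inverted so
# that argumentative sentences get label 1.
extract('train', 'NoArgument')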
Example #8
def evaluate(input_path, col_true, col_pred, metric, output_path, with_header):
    '''evaluate the quality of predictions with a metric (f1 by default), and output the metric scores'''

    result = []
    dataset = csv_handler.csv_readlines(input_path)
    if with_header:
        dataset = dataset[1:]

    col_true = int(col_true) - 1
    col_pred = int(col_pred) - 1
    y_true = transform.map_func(dataset, lambda row : int(row[col_true]))
    y_pred = transform.map_func(dataset, lambda row : int(row[col_pred]))

    def check_validity(class_array):
        for cls in class_array:
            assert(cls == 0 or cls == 1)
    check_validity(y_true)
    check_validity(y_pred)

    support_set = {'f1', 'accuracy', 'cohen', 'quad'}
    if metric not in support_set:
        sys.exit('please specify a valid metric: f1, accuracy, cohen, or quad (i.e. precision_recall_fscore_support)')
    elif metric == 'f1':
        result.append(['f1'])
        result.append([f1_score(y_true, y_pred)])
    elif metric == 'accuracy':
        result.append(['accuracy'])
        result.append([accuracy_score(y_true, y_pred)])
    elif metric == 'cohen':
        result.append(['cohen'])
        result.append([cohen_kappa_score(y_true, y_pred)])
    elif metric == 'quad':
        (precision, recall, fscore, support) = precision_recall_fscore_support(y_true, y_pred)
        result.append(['class', 'precision', 'recall', 'fscore', 'support'])
        result.append([0, precision[0], recall[0], fscore[0], support[0]])
        result.append([1, precision[1], recall[1], fscore[1], support[1]])

    csv_handler.csv_writelines(output_path, result)
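
# Usage sketch (hypothetical paths): col_true and col_pred are 1-based, and
# both columns must hold binary 0/1 labels.
evaluate('preds.csv', 3, 4, 'f1', './scores.csv', True)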
Example #9
def write_preds(ori_preds, labels, dev_path, output_path="./tmp.csv"):
    eval_examples = csv_handler.csv_readlines(dev_path)
    assert(len(ori_preds) == len(labels))
    assert(len(ori_preds) == len(eval_examples))
    # append header
    header = ['id', 'sent', 'label']
    for i in range(len(ori_preds[0])):
        header.append('p' + str(i))
    header.append('pred')

    final = []
    final.append(header)

    # append data
    for i in range(len(labels)):
        row = []
        sent_id = eval_examples[i][0]
        sent = eval_examples[i][1]
        sent_label = int(eval_examples[i][2])
        assert(sent_label == labels[i])

        row.append(sent_id)
        row.append(sent)
        row.append(sent_label)

        for j in range(len(ori_preds[i])):
            row.append(ori_preds[i][j])

        pred_label = np.argmax(ori_preds[i])

        row.append(pred_label)
        final.append(row)

    csv_handler.csv_writelines(output_path, final)
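
# Shape sketch (hypothetical values): one score list per example, and the rows
# of dev_path must be (id, sent, label) with label equal to labels[i].
ori_preds = [[0.1, 0.9], [0.8, 0.2]]
labels = [1, 0]
write_preds(ori_preds, labels, 'dev.csv')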
Example #10
from sklearn.feature_extraction.text import TfidfTransformer
import os
file_path = os.path.dirname(os.path.abspath(__file__))
import sys
sys.path.insert(0, '../../pyfunctor')
import csv_handler as csv_handler
import transform as transformer
from sklearn import metrics
import time
import gensim
from pytorch_pretrained_bert import BertTokenizer

start_time = time.time()

train_set = csv_handler.csv_readlines(sys.argv[1])
dev_set = csv_handler.csv_readlines(sys.argv[2])

output_path = sys.argv[3]
if os.path.exists(output_path):
    os.remove(output_path)

seed = int(sys.argv[4])


def sep(dataset):
    sents = transformer.map_func(dataset, lambda triplet: triplet[1])
    labels = transformer.map_func(dataset, lambda triplet: int(triplet[2]))
    return (sents, labels)

Example #11
sys.path.insert(0, "../../pyfunctor")
import csv_handler as csv_handler
import transform as transformer
import csv


class CSV_Split(csv_handler.CSV_Handler):
    def __init__(self, dataset, seed=0):
        self.seed = seed
        self.dataset = dataset


input_dir = "./DEEPTip/dataset/"
# paragraph
sent_n = csv_handler.csv_readlines(input_dir + "sent_tip.neg",
                                   delimit='\t',
                                   quoter=csv.QUOTE_NONE)
sent_n = transformer.map_func(sent_n, lambda p: (p[0], p[1], '0'))

sent_p = csv_handler.csv_readlines(input_dir + "sent_tip.pos",
                                   delimit='\t',
                                   quoter=csv.QUOTE_NONE)
sent_p = transformer.map_func(sent_p, lambda p: (p[0], p[1], '1'))

sent = sent_n + sent_p
for row in sent:
    assert (len(row) == 3)
ids = set(transformer.map_func(sent, lambda p: p[0]))
assert (len(ids) == len(sent))

splitter = CSV_Split(sent)
Example #12
# `root` is presumably the xml.etree.ElementTree root of a document parsed
# earlier; this snippet starts mid-script.
sents = []
for child in root:
    tid = child.attrib['id']
    sentence = ""

    num_words = len(child)
    for i in range(num_words):
        sentence += child[i].text
        if i < num_words - 2:  # note: leaves no space before the final token
            sentence += " "
    sents.append((tid, sentence))

import csv_handler as csv_handler
golds = csv_handler.csv_readlines(gold_path, delimit='\t')

import transform as transformer
assert(len(sents) == len(golds))
for i in range(len(sents)):
    assert(sents[i][0] == golds[i][0])
final = transformer.map_func(range(len(sents)), lambda i : (sents[i][0], sents[i][1], golds[i][1]))

class CSV_Split(csv_handler.CSV_Handler):
    def __init__(self, dataset, seed=0):
        self.seed = seed
        self.dataset = dataset

splitter = CSV_Split(final)
splitter.csv_split(0.2, "dev.csv", "train.csv")
Example #13
import sys
import transform as transform
import csv_handler as csv_handler

if __name__ == "__main__":
    result = []

    first_file = sys.argv[1]
    first_idx = sys.argv[2].split(',')
    first_idx = transform.map_func(first_idx, lambda idx: int(idx))

    second_file = sys.argv[3]
    second_idx = sys.argv[4].split(',')
    second_idx = transform.map_func(second_idx, lambda idx: int(idx))

    output_csv_file = sys.argv[5]

    first_dataset = csv_handler.csv_readlines(first_file)
    result = transform.map_func(first_dataset,
                                lambda row: [row[idx] for idx in first_idx])

    second_dataset = csv_handler.csv_readlines(second_file)
    second_result = transform.map_func(
        second_dataset, lambda row: [row[idx] for idx in second_idx])

    assert (len(first_dataset) == len(second_dataset))

    final = transform.map_func(zip(result, second_result),
                               lambda p: p[0] + p[1])

    csv_handler.csv_writelines(output_csv_file, final)
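
# Invocation sketch (hypothetical script name and files): keep 0-based columns
# 0,1 of first.csv and column 2 of second.csv, concatenated row-wise:
#   python project_columns.py first.csv 0,1 second.csv 2 out.csv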
Example #14
import sys

root_directory = '../../'
sys.path.insert(0, root_directory + "pyfunctor")

import transform as transformer
import csv_handler as csv_handler
import json_handler as json_handler

review_input_path = "../FUNNY/yelp_academic_dataset_review.csv"
review_output_path = "./all.csv"

dataset = csv_handler.csv_readlines(review_input_path)


# get idx for review_id, text, and funny_count
def get_triplet_idx(header):
    idx_id = header.index('review_id')
    idx_text = header.index('text')
    idx_count = header.index('funny')
    return (idx_id, idx_text, idx_count)


def selector(row, idx_id, idx_text, idx_count):
    r_id = row[idx_id]
    text = row[idx_text]
    funny_count = int(row[idx_count])
    is_funny = None
    if funny_count >= 5:
        is_funny = 1
    elif funny_count == 0:
        # reconstructed: the original snippet is truncated after this branch
        is_funny = 0
    return (r_id, text, is_funny)
Example #15
import sys
sys.path.insert(0, '../../pyfunctor')
import transform as transformer
import csv_handler as csv_handler

input_file = sys.argv[1]
output_file = sys.argv[2]
dataset = csv_handler.csv_readlines(input_file)

dataset = transformer.indexleft_func(dataset)
final = transformer.map_func(dataset, lambda p: (p[0], p[1][2], int(p[1][0]) - 1))

csv_handler.csv_writelines(output_file, final)
Example #16
def distill(input_path):
    dataset = csv_handler.csv_readlines(input_path, delimit='\t')
    return dataset
Example #17
    def __init__(self, csv_file_path):
        dataset = csv_handler.csv_readlines(csv_file_path)
        labels = transform.map_func(dataset, lambda t: t[2])
        self.weight_map = weight_class(labels)
Example #18
import csv
import csv_handler as csv_handler
import transform as transformer

data_root = "./dataset"
data_dir = data_root + "/data/complete/"
topics = [
    'abortion', 'cloning', 'death_penalty', 'gun_control',
    'marijuana_legalization', 'minimum_wage', 'nuclear_energy',
    'school_uniforms'
]

dataset = []
for tp in topics:
    file_path = data_dir + tp + ".tsv"
    records = csv_handler.csv_readlines(file_path,
                                        delimit='\t',
                                        quoter=csv.QUOTE_NONE)
    records = records[1:]

    def row_functor(i, records):
        assert (i < len(records))
        row = records[i]

        rid = row[0] + "_" + str(i)
        sent = row[4]
        label = row[5]
        split = row[6]

        return (rid, sent, label, split)

    # the call below is reconstructed; the original snippet is truncated mid-call
    records = transformer.map_func(range(len(records)),
                                   lambda i: row_functor(i, records))
Example #19
    csv_input_path = sys.argv[1]
    y_true_col = int(sys.argv[2])
    y_pred_col = int(sys.argv[3])
    num_threds = int(sys.argv[4])
    csv_output_path = sys.argv[5]

    print_header = '1'
    if len(sys.argv) > 6:
        print_header = sys.argv[6]
    assert (print_header == '1' or print_header == '0')

    thred_method = "min_max_even"
    if os.path.exists(csv_output_path):
        os.remove(csv_output_path)

    csv_dataset = csv_handler.csv_readlines(csv_input_path)

    y_true = transform.map_func(csv_dataset, lambda row: int(row[y_true_col]))
    y_pred_score = transform.map_func(csv_dataset,
                                      lambda row: float(row[y_pred_col]))

    #y_pred_score = transform.map_func(y_pred_score, lambda score : 1 / (1 + math.exp(-score)))

    thred_col = []

    if thred_method == "min_max_even":
        thred_col = get_threds_by_min_max_even(y_pred_score, num_threds)

    else:  # sorted score even slot
        thred_col = get_threds_by_sorted_score_equal_length(
            y_pred_score, num_threds)
Example #20
        examples = []
        for (i, txt) in enumerate(batch):
            guid = "%s" % (i)
            text_a = txt 
            label = self.dummy_label
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
        

if __name__ == "__main__":

    data_path = sys.argv[1]
    model_dir = sys.argv[2]
    output_path = sys.argv[3]

    # load test dataset
    raw_dataset = csv_handler.csv_readlines(data_path)
    ids = transform.map_func(raw_dataset, lambda row : row[0])
    texts = transform.map_func(raw_dataset, lambda row : row[1])

    # load model
    model = BertModel(model_dir)

    pred = model.predict(texts, 100)

    assert(len(ids) == len(pred))
    output = transform.map_func(range(len(ids)), lambda idx : [ids[idx]] + pred[idx])
    csv_handler.csv_writelines(output_path, output)
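
# Invocation sketch (hypothetical script name and paths): rows of the input
# CSV are (id, text); each output row is the id followed by the raw scores:
#   python bert_predict.py test.csv ./model_dir preds.csv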
    
Example #21
def distill(input_path, label_map):
    dataset = csv_handler.csv_readlines(input_path, delimit='\t')
    dataset = transform.filter_func(dataset, lambda row: len(row) >= 5)
    token_label = transform.map_func(
        dataset, lambda row: [row[0].strip(), label_map[row[4].strip()[0]]])
    return token_label