Esempio n. 1
0
def transform(input_path, output_path):
    """Read a CSV, drop its first (header) row, and write (index, id, binary_label) rows."""
    rows = csv_handler.csv_readlines(input_path)
    rows = rows[1:]  # skip the header row

    def to_labeled(i):
        # column 1 holds a "True"/"False" string; map it to 1/0
        label = 1 if rows[i][1] == "True" else 0
        return (i, rows[i][0], label)

    rows = transformer.map_func(range(len(rows)), to_labeled)
    csv_handler.csv_writelines(output_path, rows)
Esempio n. 2
0
def bert_estimate(input_path, text_col, output_path, model_dir, gpu,
                  with_header):
    """Score each row's text with a BERT model and append score_0, score_1,
    and the argmax class as new columns of the output CSV."""

    # pin the computation to the requested GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu

    # fall back to the cached company model when no model dir is given
    if model_dir == '':
        model_dir = os.path.join(os.environ['FACTMINE_HOME'],
                                 'cache/company_model')

    print("Loading model from %s ...\n" % (model_dir))

    model = BertModel(model_dir)

    print("Calculating negative score, positive score, and argmax class...\n")

    dataset = csv_handler.csv_readlines(input_path)
    header = None
    if with_header == True:
        header = dataset[0]
        dataset = dataset[1:]

    # user-facing column index is 1-based
    text_col = text_col - 1
    texts = transform.map_func(dataset, lambda row: row[text_col])

    preds = model.predict(texts)

    assert (len(dataset) == len(preds))

    # squash the two raw scores into (0, 1); leave the class columns untouched
    def apply_sigmoid(quad):
        return [util.sigmoid(quad[0]),
                util.sigmoid(quad[1]), quad[2], quad[3]]

    preds = transform.map_func(preds, apply_sigmoid)

    # attach score_0, score_1, arg_class to every data row
    for row, pred in zip(dataset, preds):
        row.append(pred[0])
        row.append(pred[1])
        row.append(pred[2])

    if with_header == True:
        header.append('score_0')
        header.append('score_1')
        header.append('arg_class')
        dataset = [header] + dataset

    csv_handler.csv_writelines(output_path, dataset)

    if output_path != "":
        print("Finished, Results are ready at %s " % (output_path))
Esempio n. 3
0
def extract(data, typename):
    """Turn ./dataset/<data>set.csv into (id, text, binary_label) rows in ./<data>.csv."""
    rows = csv_handler.csv_readlines("./dataset/" + data + "set.csv")
    rows = transformer.indexleft_func(rows)
    rows = transformer.map_func(rows, lambda row:
                                (row[0], row[1][1], row[1][2]))
    output_path = "./" + data + ".csv"

    def binarize(triplet):
        # 1 when the row's type matches the requested typename, else 0
        return (triplet[0], triplet[1], 1 if triplet[2] == typename else 0)

    final = transformer.map_func(rows, binarize)
    csv_handler.csv_writelines(output_path, final)
Esempio n. 4
0
def max_balancer(input_csv_path, output_csv_path='./output.csv'):
    """Downsample the negative class so the output has as many negatives as positives."""
    rows = csv_handler.csv_readlines(input_csv_path)

    positives = transform.filter_func(rows, lambda row: row[2] == '1')
    negatives = transform.filter_func(rows, lambda row: row[2] == '0')

    # negatives are assumed to be the majority class
    assert (len(positives) <= len(negatives))
    sampler = Sampler()
    negatives = sampler.sample_rows(negatives, len(positives))

    # keep only the rows whose id survived the sampling
    keep_ids = set(
        transform.map_func(positives, lambda row: row[0]) +
        transform.map_func(negatives, lambda row: row[0]))

    balanced = transform.filter_func(rows, lambda row: row[0] in keep_ids)

    csv_handler.csv_writelines(output_csv_path, balanced)
Esempio n. 5
0
def extract(data, typename):
    """Binarize ./dataset/<data>_raw.csv against typename into ./<data>.csv.

    Rows whose type matches typename get label 1, except when typename is
    'NoArgument', where the labeling is inverted (matches get 0, the rest 1).
    """
    rows = csv_handler.csv_readlines("./dataset/" + data + "_raw.csv")
    output_path = "./" + data + ".csv"

    inverted = (typename == 'NoArgument')

    def relabel(triplet):
        matched = (triplet[2] == typename)
        if inverted:
            label = 0 if matched else 1
        else:
            label = 1 if matched else 0
        return (triplet[0], triplet[1], label)

    final = transformer.map_func(rows, relabel)
    csv_handler.csv_writelines(output_path, final)
Esempio n. 6
0
def evaluate(input_path, col_true, col_pred, metric, output_path, with_header):
    '''Evaluate the quality of predictions with a metric (f1 by default), and
    write the metric scores to output_path as CSV rows.

    col_true / col_pred are 1-based column indexes of the ground-truth and
    predicted labels; only binary (0/1) labels are supported.
    '''

    result = []
    dataset = csv_handler.csv_readlines(input_path)
    if with_header == True:
        dataset = dataset[1:]

    # user-facing column indexes are 1-based
    col_true = int(col_true) - 1
    col_pred = int(col_pred) - 1
    y_true = transform.map_func(dataset, lambda row : int(row[col_true]))
    y_pred = transform.map_func(dataset, lambda row : int(row[col_pred]))

    # only binary labels are supported
    def check_validity(class_array):
        for cls in class_array:
            assert(cls == 0 or cls == 1)
    check_validity(y_true)
    check_validity(y_pred)

    support_set = {'f1', 'accuracy', 'cohen', 'quad'}
    if metric not in support_set:
        sys.exit('please specify a valid metric in terms of f1, accuracy, cohen, or quad (i.e. precision_recall_fscore_support)')
    elif metric == 'f1':
        result.append(['f1'])
        result.append([f1_score(y_true, y_pred)])
    elif metric == 'accuracy':
        result.append(['accuracy'])
        result.append([accuracy_score(y_true, y_pred)])
    elif metric == 'cohen':
        # header row added for consistency with the other metrics,
        # which all emit a metric-name row before the score row
        result.append(['cohen'])
        result.append([cohen_kappa_score(y_true, y_pred)])
    elif metric == 'quad':
        # per-class precision/recall/fscore/support table
        (precision, recall, fscore, support) = precision_recall_fscore_support(y_true, y_pred)
        result.append(['class', 'precision', 'recall', 'fscore', 'support'])
        result.append([0, precision[0], recall[0], fscore[0], support[0]])
        result.append([1, precision[1], recall[1], fscore[1], support[1]])

    csv_handler.csv_writelines(output_path, result)
Esempio n. 7
0
def write_preds(ori_preds, labels, dev_path , output_path = "./tmp.csv"):
    """Join dev-set examples with model scores and write them as one CSV.

    Output columns: id, sent, label, p0..pK (one per class score), pred
    (argmax over the scores). All three inputs must be row-aligned.
    """
    eval_examples = csv_handler.csv_readlines(dev_path)
    assert (len(ori_preds) == len(labels))
    assert (len(ori_preds) == len(eval_examples))

    # header: fixed columns, then one probability column per class
    header = ['id', 'sent', 'label']
    header += ['p' + str(k) for k in range(len(ori_preds[0]))]
    header.append('pred')

    final = [header]

    # one output row per dev example
    for example, scores, label in zip(eval_examples, ori_preds, labels):
        sent_label = int(example[2])
        # the label stored in the dev file must agree with the one passed in
        assert (sent_label == label)

        row = [example[0], example[1], sent_label]
        row += list(scores)
        row.append(np.argmax(scores))
        final.append(row)

    csv_handler.csv_writelines(output_path, final)
Esempio n. 8
0
        examples = []
        for (i, txt) in enumerate(batch):
            guid = "%s" % (i)
            text_a = txt 
            label = self.dummy_label
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
        

if __name__ == "__main__":
    # usage: <script> data_path model_dir output_path
    data_path = sys.argv[1]
    model_dir = sys.argv[2]
    output_path = sys.argv[3]

    # load test dataset: column 0 is the id, column 1 the text
    raw_dataset = csv_handler.csv_readlines(data_path)
    ids = transform.map_func(raw_dataset, lambda row: row[0])
    texts = transform.map_func(raw_dataset, lambda row: row[1])

    # load model and score every text
    model = BertModel(model_dir)

    # NOTE(review): second predict arg is presumably a batch size — confirm
    # against BertModel
    pred = model.predict(texts, 100)

    assert (len(ids) == len(pred))

    # prepend each row's id to its prediction scores
    output = transform.map_func(range(len(ids)),
                                lambda idx: [ids[idx]] + pred[idx])
    csv_handler.csv_writelines(output_path, output)
    
Esempio n. 9
0
        return (rid, sent, label, split)

    records = transformer.map_func(range(len(records)),
                                   lambda i: row_functor(i, records))

    print(len(records))
    dataset = dataset + records[1:]
# sanity-check the merged dataset: total row count and a sample row
print(len(dataset))
print(dataset[0])


def select(split, dataset):
    """Return (id, sent, label) triples for rows tagged with the given split."""
    matching = transformer.filter_func(dataset, lambda row: row[3] == split)
    return transformer.map_func(matching, lambda row: (row[0], row[1], row[2]))


# split the combined dataset by the split tag stored in column 3
train_set = select("train", dataset)
print(len(train_set))

# NOTE(review): the "test" split is written out as the dev set — confirm intended
dev_set = select("test", dataset)
print(len(dev_set))

val_set = select("val", dataset)
print(len(val_set))

csv_handler.csv_writelines(data_root + "/train_raw.csv", train_set)
csv_handler.csv_writelines(data_root + "/dev_raw.csv", dev_set)
csv_handler.csv_writelines(data_root + "/val_raw.csv", val_set)
Esempio n. 10
0
    #('clf', LogisticRegression(class_weight='balanced', random_state=seed, solver='liblinear')),
    ('clf', LogisticRegression(random_state=seed, solver='liblinear')),
])

# fit the pipeline on the training split and report wall-clock train time
text_clf.fit(X_train, y_train)
train_finish_time = time.time()
train_duration = train_finish_time - start_time
print("train time is " + str(train_finish_time - start_time))

print("predicting...")

# hard predictions plus per-class probabilities for the dev split
predicted = text_clf.predict(X_dev)
predicted_proba = text_clf.predict_proba(X_dev)

assert (len(predicted_proba) == len(X_dev))
assert (len(X_dev) == len(y_dev))

print("logging...")
# NOTE(review): the header is appended first, then the data rows are written
# with csv_writelines — confirm csv_writelines appends rather than truncating
# the file, otherwise the header row is lost.
csv_handler.append_row(output_path,
                       ['score_0', 'score_1', 'predict', 'text', 'ground'])
result = []
for i in range(len(predicted_proba)):
    # columns: P(class 0), P(class 1), predicted label, input text, gold label
    score_0 = predicted_proba[i][0]
    score_1 = predicted_proba[i][1]
    predict = predicted[i]
    text = X_dev[i]
    ground = y_dev[i]
    result.append([score_0, score_1, predict, text, ground])

csv_handler.csv_writelines(output_path, result)
Esempio n. 11
0
# after sorting, the train/test ids must match the annotated ids exactly
train_test_ids.sort()
assert (anno_ids == train_test_ids)

# build a paper-id -> annotation-file-name lookup
id_to_file = transformer.map_func(files, lambda file: (get_id(file), file))
id_to_file = dict(id_to_file)


def extractor(anno_dir, id_to_file, paper_id):
    """Read one paper's tab-separated annotation file and return
    (sentence_id, sentence, label) triples, where sentence_id is
    "<paper_id>_<row_index>" (columns presumably label then sentence —
    confirm against the annotation files)."""
    file_path = anno_dir + id_to_file[paper_id]
    rows = csv_handler.csv_readlines(file_path, delimit='\t')

    def to_triple(p):
        # p is (row_index, row); swap the row's two columns around the id
        sent_id = paper_id + "_" + str(p[0])
        return (sent_id, p[1][1], p[1][0])

    indexed_rows = transformer.indexleft_func(rows)
    return transformer.map_func(indexed_rows, to_triple)


# smoke-check the extractor on one known paper id
tmp = extractor(anno_dir, id_to_file, 'r18RxrXlG')
transformer.print_rows(tmp, 3)

# expand every train paper id into its labeled sentences and persist
train_dataset = transformer.flatmap_func(
    train_id_dataset,
    lambda paper_id: extractor(anno_dir, id_to_file, paper_id))
csv_handler.csv_writelines(train_output_path, train_dataset)

# same for the test split
test_dataset = transformer.flatmap_func(
    test_id_dataset,
    lambda paper_id: extractor(anno_dir, id_to_file, paper_id))
csv_handler.csv_writelines(test_output_path, test_dataset)
Esempio n. 12
0
import transform as transform
import csv_handler as csv_handler

if __name__ == "__main__":
    # Join selected columns of two row-aligned CSV files into one output CSV.
    # argv: first_csv, first_col_idxs, second_csv, second_col_idxs, output_csv
    # (column index lists are comma-separated, 0-based)
    result = []

    first_file = sys.argv[1]
    first_idx = transform.map_func(sys.argv[2].split(','),
                                   lambda idx: int(idx))

    second_file = sys.argv[3]
    second_idx = transform.map_func(sys.argv[4].split(','),
                                    lambda idx: int(idx))

    output_csv_file = sys.argv[5]

    # project a row onto the requested column indexes
    def project(row, idxs):
        return [row[i] for i in idxs]

    first_dataset = csv_handler.csv_readlines(first_file)
    result = transform.map_func(first_dataset,
                                lambda row: project(row, first_idx))

    second_dataset = csv_handler.csv_readlines(second_file)
    second_result = transform.map_func(second_dataset,
                                       lambda row: project(row, second_idx))

    # the two files must be row-aligned
    assert (len(first_dataset) == len(second_dataset))

    # concatenate the projected column lists row by row
    final = transform.map_func(zip(result, second_result),
                               lambda pair: pair[0] + pair[1])

    csv_handler.csv_writelines(output_csv_file, final)
Esempio n. 13
0
def get_triplet_idx(header):
    """Return (id_idx, text_idx, count_idx) for the review_id, text, and
    funny columns of a header row."""
    return tuple(header.index(col) for col in ('review_id', 'text', 'funny'))


def selector(row, idx_id, idx_text, idx_count):
    """Map a raw review row to (review_id, text, is_funny).

    is_funny is 1 for five or more funny votes, 0 for none, and None for
    the ambiguous 1-4 range (callers are expected to filter those out).
    """
    funny_count = int(row[idx_count])
    if funny_count >= 5:
        is_funny = 1
    elif funny_count == 0:
        is_funny = 0
    else:
        is_funny = None
    return (row[idx_id], row[idx_text], is_funny)


# resolve the review_id/text/funny column positions from the header row
(idx_id, idx_text, idx_count) = get_triplet_idx(dataset[0])

# map every data row to (review_id, text, is_funny)
selected_datasets = transformer.map_func(
    dataset[1:], lambda line: selector(line, idx_id, idx_text, idx_count))

# drop ambiguous rows (selector yields None for 1-4 funny votes);
# `is not None` is the correct identity test and avoids relying on __eq__
final_datasets = transformer.filter_func(selected_datasets,
                                         lambda row: row[2] is not None)

csv_handler.csv_writelines(review_output_path, final_datasets)
Esempio n. 14
0
    elif num_true == 1:
        assert (num_false == 0)
        line = line.replace('"has_spoiler": true,', '"has_spoiler": "true",')
    else:
        assert (False)

    line = json.loads(line)
    return line


# parse each raw line into a JSON object (format_func presumably normalizes
# the line before json.loads — see its definition above)
json_dataset = transformer.map_func(dataset, lambda line: format_func(line))

# keep only the review id and its per-sentence list
id_sents = transformer.map_func(
    json_dataset, lambda jsn: (jsn['review_id'], jsn['review_sentences']))


def flat_sents_func(review_id, sents):
    """Flatten one review's sentences into (sent_id, text, label) triples.

    Each sentence is stored as (label, text); sent_id is
    "<review_id>###<position>".
    """
    return [(review_id + "###" + str(i), sent[1], sent[0])
            for i, sent in enumerate(sents)]


# one output row per sentence across all reviews
final_sents = transformer.flatmap_func(id_sents,
                                       lambda p: flat_sents_func(p[0], p[1]))

csv_handler.csv_writelines(output_path, final_sents)