def run(model, csvs, threshold, evaluation):
    """Predict emotions for every row of a CSV file and optionally evaluate.

    The predictions are written to ``model_output.csv`` in the same directory
    as the input file, one output row per input row.

    Args:
        model: Not read by this function; the model directory comes from the
            module-level ``args.model_dir``. Kept so existing callers work.
        csvs: Path of the input CSV. It must provide an ``id`` column, a
            ``text`` column and one 0/1 column per emotion label.
        threshold: An emotion is kept for a text when its predicted score is
            strictly greater than this value.
        evaluation: When truthy, print classification reports comparing the
            thresholded predictions with the labels found in the CSV.
    """
    labels = [
        "anger", "anticipation", "disgust", "fear", "joy", "love", "optimism",
        "pessimism", "sadness", "surprise", "trust", "neutral"
    ]
    # NOTE(review): the model directory is taken from the global ``args`` and
    # the label directory is a machine-specific absolute path.
    predictor = BertClassificationPredictor(
        model_path=args.model_dir,
        label_path="D:\\UTD\\Assignment\\NLP\\project\\",  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)

    # Keep ids, texts and targets as parallel lists.  Keying by text (as the
    # previous implementation did) silently dropped duplicated texts, which
    # shifted every following id onto the wrong prediction.
    ids = []
    texts = []
    targets = []
    data = pd.read_csv(csvs)
    for _, row in data.iterrows():
        ids.append(row['id'])
        texts.append(row['text'])
        targets.append([label for label in labels if row[label] == 1])

    multiple_predictions = predictor.predict_batch(texts)

    out_path = os.path.join(os.path.dirname(csvs), "model_output.csv")
    outputs = []
    # The context manager guarantees the file is flushed and closed.
    with open(out_path, "w", encoding="utf-8", newline="") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(["id", "text", "emotions", "target"])
        for row_id, text, target, out in zip(ids, texts, targets,
                                             multiple_predictions):
            # Each prediction entry is indexed as (label, score).
            kept = [emotion[0] for emotion in out if emotion[1] > threshold]
            csv_writer.writerow([row_id, text, kept, target])
            outputs.append(kept)

    print("****************\n")
    print("Predictions saved in a file: ", out_path)

    if evaluation:
        print("\n\n Running Model Evaluation\n")
        # One binarizer with a fixed class order keeps the truth and
        # prediction matrices column-aligned and makes ``target_names=labels``
        # name the right columns.
        binarizer = MultiLabelBinarizer(classes=labels)
        y_true_encoded = binarizer.fit_transform(targets)
        y_pred_encoded = binarizer.transform(outputs)
        pprint(classification_report(y_true_encoded, y_pred_encoded))
        pprint(
            classification_report(y_true_encoded,
                                  y_pred_encoded,
                                  target_names=labels))
def predict_proba(self, x):
    """Score the texts in ``x`` and return every score as one flat Series.

    The texts are written to ``test.csv`` under the experiment's data path,
    read back, scored with a predictor loaded from the
    ``output-<random_state>/model_out`` directory, and the raw predictions
    are dumped to the configured results file.

    Args:
        x: Texts to score; stored in a ``comment_text`` column.

    Returns:
        pandas.Series holding the row-major flattening of the per-text score
        table (one column per label name returned by the predictor).
    """
    print('\tpredicting probabilities...')
    params = self.experiment_parameters

    # Round-trip the texts through test.csv, exactly as stored on disk.
    test_csv = params.DATA_PATH/'test.csv'
    pd.DataFrame({'comment_text': x}).to_csv(test_csv)

    # Build the predictor from the run-specific output directory.
    output_dir = 'output-%d/model_out'%params.random_state
    model_dir = (params.MODEL_PATH/output_dir).absolute().as_posix()
    predictor = BertClassificationPredictor(
        model_path=model_dir,
        label_path=params.LABEL_PATH,
        multi_label=True,
        model_type=params.MODEL_TYPE,
        do_lower_case=True)

    # Predict on what was actually written to disk.
    texts = list(pd.read_csv(test_csv)['comment_text'].values)
    output = predictor.predict_batch(texts)

    # Dump the raw predictions.
    pd.DataFrame(output).to_csv(params.PRED_PATH/params.RESULTS_FILENAME)

    # One row per text, one column per label.
    score_rows = [dict((item[0], item[1]) for item in pred) for pred in output]
    preds = pd.DataFrame(score_rows)
    print(preds.head(5))
    return pd.Series(preds.values.reshape(-1))
def __init__(self, args):
    """Load the topic classifier, entity extractor and generation models.

    Args:
        args: Parsed options. The attributes read here are
            ``gen_model_type``, ``gen_model_path``, ``conv_line_path``,
            ``length``, ``temperature``, ``top_k``, ``top_p``,
            ``stop_token``, ``repetition_penalty``, ``topic_cls_path`` and
            ``label_dir``.
    """
    # Generation settings; the model type is compared case-insensitively.
    self.gen_model_type = args.gen_model_type.lower()
    self.gen_model_path = args.gen_model_path
    self.conv_line_path = args.conv_line_path
    self.gen_length = args.length
    self.temperature = args.temperature
    self.top_k = args.top_k
    self.top_p = args.top_p
    self.stop_token = args.stop_token
    self.repetition_penalty = args.repetition_penalty
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    # Maps the string ids '1'..'9' to topic names.
    self.lookup = {
        '1': 'Fashion',
        '2': 'Politics',
        '3': 'Books',
        '4': 'Sports',
        '5': 'General Entertainment',
        '6': 'Music',
        '7': 'Science & Technology',
        '8': 'Movie',
        '9': 'General'
    }

    self.topic_cls = BertClassificationPredictor(
        model_path=args.topic_cls_path,
        label_path=args.label_dir,  # directory for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=True)

    self.entity_ext_model = AutoModelForTokenClassification.from_pretrained(
        "dbmdz/bert-large-cased-finetuned-conll03-english")
    self.entity_ext_model.to(self.device)
    self.entity_ext_tokenizer = AutoTokenizer.from_pretrained(
        "bert-base-cased")

    # Move the generators to ``self.device`` instead of calling ``.cuda()``
    # unconditionally, so construction also works on CPU-only machines.
    if self.gen_model_type == 'dialogpt':
        self.gen_tokenizer = AutoTokenizer.from_pretrained(
            self.gen_model_path)
        self.gen_model = AutoModelWithLMHead.from_pretrained(
            self.gen_model_path)
        self.gen_model.to(self.device)
        self.gen_model.eval()
    elif self.gen_model_type == 'bart':
        self.gen_model = BARTModel.from_pretrained(
            self.gen_model_path,
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path=self.gen_model_path)
        self.gen_model.to(self.device)
        self.gen_model.eval()

    # NOTE(review): loaded for every generator type here; confirm this model
    # is not meant to be loaded only together with the 'bart' generator.
    self.conv_line = BARTModel.from_pretrained(
        self.conv_line_path,
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path=self.conv_line_path)
    self.conv_line.to(self.device)
    self.conv_line.eval()
def __init__(self, model_path, label_path):
    """Create the single-label BERT predictor and the text preprocessor.

    Args:
        model_path: Passed to the predictor as ``model_path``.
        label_path: Passed to the predictor as ``label_path``
            (location for labels.csv file).
    """
    predictor_options = {
        'model_path': model_path,
        'label_path': label_path,
        'multi_label': False,
        'model_type': 'bert',
        'do_lower_case': False,
    }
    self.predictor = BertClassificationPredictor(**predictor_options)
    self.preprocessor = TextPreprocessor()
def get_predictor_model(cls):
    """Return the predictor cached on ``cls``, loading it on first use.

    On the first call the settings are read from ``PATH/model_config.json``
    and a predictor is built from ``PATH/model_out``; the result is stored in
    ``cls.model`` and returned by every later call.

    Returns:
        The object stored in ``cls.model``.
    """
    if cls.model is None:
        with open(PATH/'model_config.json') as f:
            model_config = json.load(f)

        # Build the predictor once, with the configured settings.  (A
        # default-configured predictor used to be built first and thrown
        # away, which loaded the model twice.)
        cls.model = BertClassificationPredictor(
            PATH/'model_out',
            label_path=PATH,
            multi_label=bool(model_config['multi_label']),
            model_type=model_config['model_type'],
            do_lower_case=bool(model_config['do_lower_case']))
    return cls.model
class WebappConfig(AppConfig):
    """App configuration that also builds the emotion predictor.

    The predictor is created while this class body executes, so the model is
    loaded as soon as the module is imported.
    """
    name = 'fastbert'

    # All paths are relative, so they resolve against the process's current
    # working directory.
    MODEL_PATH = pathlib.Path("model")
    BERT_PRETRAINED_PATH = pathlib.Path("model/uncased_L-12_H-768_A-12/")
    LABEL_PATH = pathlib.Path("label/")

    # Multi-label predictor built from the .bin file under MODEL_PATH.
    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH / "multilabel-emotion-fastbert-basic.bin",
        pretrained_path=BERT_PRETRAINED_PATH,
        label_path=LABEL_PATH,
        multi_label=True)
def get_predictor(train_for):
    """Build the multi-label BERT predictor for one training target.

    Args:
        train_for: Name substituted into ``./models/<name>/output/model_out``
            and ``./labels/<name>/``.

    Returns:
        A ``BertClassificationPredictor`` for that model and label directory.
    """
    # Start building the prediction model.
    model_dir = Path('./models/%s/output/model_out' % train_for)
    labels_dir = Path('./labels/%s/' % train_for)
    return BertClassificationPredictor(
        model_path=model_dir,
        label_path=labels_dir,
        multi_label=True,
        model_type='bert',
        do_lower_case=True,
    )
def predict(self, text):
    """Return the first seven predicted labels for ``text`` as hashtags.

    A new predictor is built from ``self.in_dir`` and ``self.model_dir`` on
    every call.

    Args:
        text: Input to classify; converted with ``str()`` before prediction.

    Returns:
        list[str]: One ``" #<label>"`` string per kept prediction, in the
        order returned by the predictor.
    """
    predictor = BertClassificationPredictor(
        model_path=self.in_dir + '/' + self.model_dir,
        label_path=self.in_dir + '/labels',  # location for labels.csv file
        multi_label=True,
        # model_type='xlnet',
        do_lower_case=True)
    top_entries = predictor.predict(str(text))[:7]
    # The first element of each entry is used as the label.
    return [" #" + str(entry[0]) for entry in top_entries]
def threshold(model, csvs):
    """Print the exact-match accuracy obtained for a list of thresholds.

    For every candidate threshold, a text counts as correct when the set of
    emotions whose score is greater than or equal to the threshold equals the
    set of emotions labelled in the CSV.

    Args:
        model: Not read by this function; the model directory comes from the
            module-level ``args.model_dir``. Kept so existing callers work.
        csvs: Path of the labelled CSV. It must provide a ``text`` column and
            one 0/1 column per emotion label.
    """
    labels = [
        "anger", "anticipation", "disgust", "fear", "joy", "love", "optimism",
        "pessimism", "sadness", "surprise", "trust", "neutral"
    ]
    predictor = BertClassificationPredictor(
        model_path=args.model_dir,
        label_path="D:\\UTD\\Assignment\\NLP\\project\\",  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)
    # NOTE(review): the list is not sorted and 0.25 sits between 0.02 and
    # 0.03 - confirm it is not a typo for 0.025.
    thresholds = [
        0.0005, 0.00077, 0.00079, 0.00083, 0.00087, 0.0009, 0.00093, 0.00095,
        0.00099, 0.001, 0.0012, 0.0015, 0.00155, 0.0016, 0.00166, 0.0017,
        0.0019, 0.002, 0.0021, 0.0023, 0.0025, 0.0028, 0.003, 0.0035, 0.0032,
        0.0037, 0.004, 0.0045, 0.0047, 0.0041, 0.005, 0.0053, 0.0055, 0.0062,
        0.009, 0.007, 0.01, 0.011, 0.013, 0.014, 0.012, 0.015, 0.02, 0.25,
        0.03, 0.035, 0.039
    ]

    # text -> labelled emotions; a repeated text keeps its last labelling.
    inputs = {}
    data = pd.read_csv(csvs)
    for _, row in data.iterrows():
        inputs[row['text']] = [label for label in labels if row[label] == 1]

    multiple_predictions = predictor.predict_batch(list(inputs.keys()))

    # Build the target sets once instead of re-materialising
    # ``list(inputs.values())`` for every text of every threshold.
    targets = [set(emotions) for emotions in inputs.values()]
    total = len(inputs)

    threshold_accs = {}
    for th in thresholds:
        correct = 0
        for out, target in zip(multiple_predictions, targets):
            # Each prediction entry is indexed as (label, score).
            predicted = {emotion[0] for emotion in out if emotion[1] >= th}
            if predicted == target:
                correct += 1
        print("Threshold: ", th, "Correct: ", correct)
        # An empty CSV yields 0.0 rather than a ZeroDivisionError.
        threshold_accs[str(th)] = correct / total if total else 0.0
    print(threshold_accs)
def __init__(self, model_dir, label_to_idx=label_to_idx):
    """Load the predictor described by ``<model_dir>/model_config.json``.

    Args:
        model_dir: Directory holding ``model_config.json``, the ``model_out``
            sub-directory and the labels.csv file.
        label_to_idx: Stored on the instance as ``self.label_to_idx``.
    """
    super().__init__()
    root = Path(model_dir)
    with open(root / 'model_config.json') as config_file:
        settings = json.load(config_file)

    self.model = BertClassificationPredictor(
        model_path=str(root / 'model_out'),
        label_path=str(root),  # location for labels.csv file
        model_type=settings['model_type'],
        multi_label=settings['multi_label'],
        do_lower_case=settings['do_lower_case'],
    )
    self.label_to_idx = label_to_idx
class WebappConfig(AppConfig):
    """App configuration that also builds the RoBERTa predictor.

    The predictor is created while this class body executes, so the model is
    loaded as soon as the module is imported.
    """
    name = 'webapp'
    # Paths below are built from the working directory at import time.
    pwd = os.getcwd()
    # NOTE(review): raises KeyError when the HOME environment variable is not
    # set; HOME is not used anywhere else in this class body.
    HOME = os.environ["HOME"]

    # Earlier BERT-based configuration, kept for reference:
    #MODEL_PATH = Path("required_files")
    #BERT_PRETRAINED_PATH = Path(pwd+"/webapp/required_files/bert_model/uncased_L-12_H-768_A-12")
    #LABEL_PATH = Path(pwd+"/webapp/labels/")
    #predictor = BertClassificationPredictor(model_path=pwd+"/webapp/required_files/emotion_try.bin", pretrained_path=BERT_PRETRAINED_PATH, label_path=LABEL_PATH, multi_label=True)

    LABEL_PATH = Path(pwd + "/webapp/labels/")
    MODEL_PATH = Path(pwd + "/webapp/model_out_roberta")
    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH,
        label_path=LABEL_PATH,  # location for labels.csv file
        multi_label=False,
        model_type='roberta',
        do_lower_case=True)
def classify_bert(text, model_path):
    """Classify genre using fast-bert.

    Parameters
    -----------
    text : <str or list(str) or numpy array>
        A single text for one prediction, or a list / numpy array of texts
        for batch prediction.
    model_path : <str>
        Directory passed to the predictor as both the model path and the
        label path, so it must contain labels.csv AND all model files
        (config.json, pytorch_model.bin, special_tokens_map.json,
        tokenizer_config.json, vocab.txt).

    Returns
    ---------
    dict: the predictor's (label, score) pairs for the text, if ``text`` is
        a str
    list(dict): one such dict per input text, if ``text`` is a list or numpy
        array

    Raises
    ---------
    ValueError: if ``text`` is none of the supported types.
    """
    # Validate the input first so a bad call fails before the (expensive)
    # model load.
    single = isinstance(text, str)
    if not single and not isinstance(text, (list, np.ndarray)):
        raise ValueError("Unexpected type for input argument `text`")

    predictor = BertClassificationPredictor(
        model_path=model_path,
        label_path=model_path,  # location for labels.csv file
        multi_label=True,
        model_type='bert',
        do_lower_case=False)

    if single:
        return dict(predictor.predict(text))
    return [dict(scores) for scores in predictor.predict_batch(text)]
def get_predictor_model(cls):
    """Return the predictor cached on ``cls``, loading it on first use.

    The first call reads ``model_config.json`` under ``PATH``, builds the
    predictor from ``PATH/model_out`` and stores it in ``cls.model``.

    Returns:
        The object stored in ``cls.model``.
    """
    # Fast path: the predictor has already been created.
    if cls.model is not None:
        return cls.model

    config_file = os.path.join(PATH, "model_config.json")
    with open(config_file) as handle:
        settings = json.load(handle)

    cls.model = BertClassificationPredictor(
        os.path.join(PATH, "model_out"),
        label_path=PATH,
        multi_label=bool(settings["multi_label"]),
        model_type=settings["model_type"],
        do_lower_case=bool(settings["do_lower_case"]),
    )
    return cls.model
def prediction(text_list: list, case_type: str):
    """Tag every text of one case type and summarise the predicted labels.

    Args:
        text_list: Texts to classify.
        case_type: One of ``'divorce'``, ``'loan'`` or ``'labor'``; selects
            the model and label directories.

    Returns:
        tuple: ``(final_result_dic, detail_dic)`` where ``final_result_dic``
        maps a tag name to the number of texts scoring above 0.5 for it and
        ``detail_dic`` maps each text to the list of its tag names. Labels
        containing ``'21'`` are left out of both.

    Raises:
        Exception: If ``case_type`` is not one of the supported values.
    """
    # case type -> (model directory, label directory)
    locations = {
        'divorce': ('/home/zf/lyy/Data/divorce/data/model/kedaV3/model_out',
                    '/home/zf/lyy/Data/divorce/data/new_data'),
        'loan': ('/home/zf/lyy/Data/loan/data/model/keda/model_out',
                 '/home/zf/lyy/Data/loan/new_data'),
        'labor': ('/home/zf/lyy/Data/labor/data/model/keda/model_out',
                  '/home/zf/lyy/Data/labor/new_data'),
    }
    if case_type not in locations:
        raise Exception('No this type')
    model_path, label_path = locations[case_type]

    tag_dict = build_tags_dict(case_type)
    predictor = BertClassificationPredictor(model_path=model_path,
                                            label_path=label_path,
                                            multi_label=True,
                                            model_type='bert')
    output = predictor.predict_batch(text_list)

    detail_dic = {}
    label_counts = {}
    for index, scored_labels in enumerate(output):
        tags_for_text = []
        for entry in scored_labels:
            label, score = entry[0], entry[1]
            if not float(score) > 0.5:
                continue
            label_counts[label] = label_counts.get(label, 0) + 1
            if '21' not in label:
                tags_for_text.append(tag_dict[label])
        detail_dic[text_list[index]] = tags_for_text

    final_result_dic = {
        tag_dict[label]: count
        for label, count in label_counts.items()
        if '21' not in label
    }
    return final_result_dic, detail_dic
def predict_bert(experiment_parameters):
    """Score ``test.csv`` and write ground truth next to predicted scores.

    The raw predictions are first dumped to the results file; that file is
    then overwritten with the test data merged with the per-label scores,
    where the first label of ``LABEL_COLS`` is exposed as ``ground_truth``
    (value from the test data) and ``pred_prob`` (value from the predictor).

    Args:
        experiment_parameters: Object providing ``MODEL_PATH``,
            ``LABEL_PATH``, ``MODEL_TYPE``, ``DATA_PATH``, ``PRED_PATH`` and
            ``RESULTS_FILENAME``.
    """
    params = experiment_parameters

    # Create the predictor object.
    model_dir = (params.MODEL_PATH/'output/model_out').absolute().as_posix()
    predictor = BertClassificationPredictor(
        model_path=model_dir,
        label_path=params.LABEL_PATH,
        multi_label=True,
        model_type=params.MODEL_TYPE,
        do_lower_case=True)

    # Predict the test labels.
    texts = list(pd.read_csv(params.DATA_PATH/'test.csv')['comment_text'].values)
    output = predictor.predict_batch(texts)

    # Dump the raw results.
    pd.DataFrame(output).to_csv(params.PRED_PATH/params.RESULTS_FILENAME)

    # One row per text, one column per label.
    preds = pd.DataFrame(
        [dict((item[0], item[1]) for item in pred) for pred in output])
    print(preds.head())

    # Load the test data again and join it with the scores by row index.
    df_test = pd.read_csv(params.DATA_PATH/'test.csv')
    print(df_test.head())
    df_pred = pd.merge(df_test,
                       preds,
                       how='left',
                       left_index=True,
                       right_index=True)
    del df_pred['comment_text']

    # The merge suffixes the clashing label column with _x (test data) and
    # _y (predictions); move both to the end under clearer names.
    truth_col = '%s_x'%LABEL_COLS[0]
    prob_col = '%s_y'%LABEL_COLS[0]
    df_pred['ground_truth'] = df_pred[truth_col]
    df_pred['pred_prob'] = df_pred[prob_col]
    df_pred = df_pred.drop(columns=[truth_col, prob_col])
    print(df_pred.head())

    # Write the final results, replacing the raw dump.
    df_pred.to_csv(params.PRED_PATH/params.RESULTS_FILENAME, index=None)
return app if __name__ == '__main__': path = 'models/model_out/pytorch_model.bin' bucket_path = 'https://storage.cloud.google.com/boast-trained-models/activity_classifier/pytorch_model.bin' # fetch model from google storage if not exist if bucket_path is not None and not os.path.exists(path): # set env key if 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ: os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'gcp_auth.json' client = storage.Client() bucket = client.get_bucket('boast-trained-models') blob = bucket.get_blob('activity_classifier/pytorch_model.bin') print('Downloading model...') with open(path, 'wb') as file_obj: blob.download_to_file(file_obj) predictor = BertClassificationPredictor( model_path='models/model_out', label_path='train', multi_label=False, model_type='distilbert', do_lower_case=True) serve(create_app(predictor), host='0.0.0.0', port=5000)
# if eos_token_id is not None: # input_ids[i] = input_ids[i] + [eos_token_id] length = [len(ids) for ids in input_ids] max_length = max(length) for i in range(len(input_ids)): while len(input_ids[i]) < max_length: input_ids[i].append(eos_token_id) return np.array(input_ids), np.array(length) # load trained emotion classifier predictor = BertClassificationPredictor( model_path=FLAGS.clf_output_dir, label_path=FLAGS.clf_label_dir, # location for labels.csv file multi_label=True, model_type='bert', do_lower_case=True) def main(_): """ Builds the model and runs """ if FLAGS.distributed: import horovod.tensorflow as hvd hvd.init() tf.logging.set_verbosity(tf.logging.INFO) if len(config_train.name) > 0:
args.multi_gpu = True else: args.multi_gpu = False label_cols = ["functionality", "range_anxiety", "availability", "cost", "ui", "location", "service_time", "dealership"] databunch = BertDataBunch(args['data_dir'], LABEL_PATH, args.model_name, train_file='train_final.csv', val_file='valid_final.csv', test_data='test_final.csv', text_col="review", label_col=label_cols, batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type) databunch.train_dl.dataset[0][3] num_labels = len(databunch.labels) print(num_labels) metrics = [] metrics.append({'name': 'accuracy', 'function': accuracy}) metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh}) metrics.append({'name': 'roc_auc', 'function': roc_auc}) metrics.append({'name': 'fbeta', 'function': fbeta}) print(device) MODEL_PATH = '../models/output/model_out/' LABEL_PATH = '.' predictor = BertClassificationPredictor( model_path=MODEL_PATH, label_path=LABEL_PATH, # location for labels.csv file multi_label=True, model_type='xlnet', do_lower_case=False) predictions = predictor.predict_batch(list(pd.read_csv('test_final.csv')['review'].values))
from fast_bert.prediction import BertClassificationPredictor
from pathlib import Path

# Locations of the data, labels, trained model and logs for this experiment.
DATA_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/data/')
LABEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/labels/')
MODEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/models/')
LOG_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/logs/')

# location for the pretrained BERT models
BERT_PRETRAINED_PATH = Path(
    '../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12/')

predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                        pretrained_path=BERT_PRETRAINED_PATH,
                                        label_path=LABEL_PATH,
                                        multi_label=False)

# Single prediction
single_prediction = predictor.predict("just get me result for this text")

# Batch predictions: a list of texts goes through predict_batch(), which is
# the method the other call sites use for lists; predict() is used for one
# text at a time.
texts = ["this is the first text", "this is the second text"]
multiple_predictions = predictor.predict_batch(texts)
default=True) parser.add_argument( '--dataset', type=str, help= 'which dataset is used in Alexa topical dataset for testing, options can be train, valid_rare, valida_freq, test_freq, test_rare', required=True, choices=['train', 'valid_rare', 'valid_freq', 'test_freq', 'test_rare']) args = parser.parse_args() MODEL_DIR = args.model_dir #sys.argv[1] MODEL_PATH = path.join(MODEL_DIR, 'model_out') predictor = BertClassificationPredictor( model_path=MODEL_PATH, label_path=args.label_dir, #sys.argv[2], # directory for labels.csv file multi_label=False, model_type='bert', do_lower_case=True) INPUT = os.path.join('data', args.input_name + '.csv') texts = list(csv.reader(open(INPUT, 'rt'))) # sys.argv[3] batchsize = args.batch_size multiple_predictions = [] for i in tqdm(range(1, len(texts), batchsize)): batch_texts = [] if i + batchsize > len(texts): for j in range(i, len(texts)): batch_texts.append(texts[j][0]) tmp_pred = predictor.predict_batch(batch_texts) multiple_predictions.extend(tmp_pred) else: for j in range(i, i + batchsize):
'AVERAGE_2': { 'precission': 0.0, 'recall': 0.0, 'f1': 0.0 }, 'micro': { 'precission': 0.0, 'recall': 0.0, 'f1': 0.0 } #'micro_2': {'precission': 0.0, 'recall': 0.0, 'f1': 0.0} } predictor = BertClassificationPredictor(model_path=MODEL_PATH, label_path=LABEL_PATH, multi_label=False, model_type='bert', do_lower_case=False) if args.file_in[-3:] == 'csv': df_in = pd.read_csv(args.file_in, encoding='utf-8') truth_file = args.truth generate_metrics_report(df_in, truth_file, name=args.model_path) else: *_, truth_files = list(next(os.walk(args.truth))) *_, test_files = list(next(os.walk(args.file_in))) print(f'lenght of truth_files: {len(truth_files)}') print(f'lenght of test_files: {len(test_files)}') print(f'tests: {test_files}, \ntruth: {truth_files}') TEST_REPORT = pd.DataFrame()
import re

import pandas as pd
from fast_bert.prediction import BertClassificationPredictor
from flask import Flask, jsonify, request

app = Flask(__name__)
app.config.from_object(__name__)

MODEL_PATH = 'model/'

# Loaded once at import time and shared by every request.
predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                        label_path='',
                                        multi_label=True,
                                        use_fast_tokenizer=False,
                                        model_type='bert',
                                        do_lower_case=False)

# Text after this marker (and before any repetition of it) is classified.
_MARKER = 'bertmedicalstring '


@app.route('/predict', methods=['POST'])
def predict():
    """Classify the text that follows the marker in the request body.

    The raw body is decoded as ASCII and every run of non-word characters is
    collapsed to a single space before the marker is looked up.

    Returns:
        A JSON object with the two first predictions (``Clase1``/``Puntaje1``
        and ``Clase2``/``Puntaje2``), or a JSON error with status 400 when
        the body is not ASCII or does not contain the marker.
    """
    # The route only accepts POST, so no method check is needed here.
    try:
        texto = request.data.decode('ASCII')
    except UnicodeDecodeError:
        return jsonify({'error': 'request body must be ASCII text'}), 400

    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    s = re.sub(r'\W+', ' ', texto)
    parts = s.split(_MARKER)
    if len(parts) < 2:
        return jsonify({'error': 'marker "bertmedicalstring" not found'}), 400

    respuesta = predictor.predict(parts[1])
    return jsonify({
        'Clase1': respuesta[0][0],
        'Puntaje1': respuesta[0][1],
        'Clase2': respuesta[1][0],
        'Puntaje2': respuesta[1][1]
    })
# 获取 gpu 的数目 device = torch.device('cuda') if torch.cuda.device_count() > 1: args.multi_gpu = True else: args.multi_gpu = False # 设定索要的标签 label_cols = (sentence_labels if train_for == 'sentence' else fine_grained_labels) # 开始构建预测模型 predictor = BertClassificationPredictor(model_path=args.output_dir / 'model_out', label_path=LABEL_PATH, multi_label=True, model_type=args.model_type, do_lower_case=True) # 获取测试数据 output = predictor.predict_batch( list( pd.read_csv(str( DATA_PATH.joinpath('test.csv').absolute()))['text'].values)) # 将预测结果输出 pd.DataFrame(output).to_csv( str(DATA_PATH.joinpath('output_bert.csv').absolute())) # 预测结果读入 results = pd.read_csv(str(DATA_PATH.joinpath('output_bert.csv').absolute()))
# Script: run the labor-case multi-label classifier over the test set and
# print the raw predictions.
from fast_bert.prediction import BertClassificationPredictor
import pandas as pd
import csv
import json
import copy

# Both directories are relative to the current working directory.
predictor = BertClassificationPredictor(
    model_path='./Data/labor/data/model/keda/model_out',
    label_path='./Data/labor/new_data',
    multi_label=True,
    model_type='bert')

# Every value of the 'text' column is classified in one batch call.
text_list = list(pd.read_csv("./Data/labor/new_data/test.csv")['text'].values)
output = predictor.predict_batch(text_list)
print(output)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys
import pandas as pd
from fast_bert.prediction import BertClassificationPredictor
import pickle
import json

MODEL_PATH = 'output/model_out/'

predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path='./',  # location for labels.csv file
    multi_label=False,
    model_type='bert',
    do_lower_case=False)

# Predict every text of dev.csv and append one tab-separated line per row to
# predictions.tsv.
test_data = pd.read_csv('dev.csv')

# Open the output once for the whole run instead of once per row, and pair
# ids with texts directly instead of keeping a manual counter.
with open('predictions.tsv', 'a') as fp:
    for row_id, item in zip(test_data.id, test_data.text):
        prediction = predictor.predict(item)
        # NOTE(review): the triple index is kept unchanged; confirm it
        # selects the label/score intended for this fast-bert version, since
        # both indexed values must be strings for the concatenation to work.
        fp.write(
            str(row_id) + '\t' + prediction[0][0][0] + '\t' +
            prediction[0][0][1] + '\n')
def main(model_uri: Param("S3 uri with NLP model", str),
         data_uri: Param("S3 uri with input csv file", str),
         result_uri: Param(
             "S3 uri where to put output csv file with added "
             "inference columns", str),
         inference_columns: Param(
             "text columns separated in the csv file on "
             "which inference will be run", str)):
    """Download a model and a CSV, run inference and upload the result.

    For every requested column, the non-null values are classified and the
    scores are added to the CSV as ``<column>_<label>`` columns. The enriched
    CSV overwrites the downloaded copy and is uploaded to ``result_uri``.

    Args:
        model_uri: URI of the gzipped tar archive holding the model.
        data_uri: URI of the input CSV file.
        result_uri: URI the enriched CSV is uploaded to.
        inference_columns: Comma-separated names of the text columns to run
            inference on.

    Exits with status 2 when a download fails or a requested column is
    missing, and with status 1 when the archive cannot be extracted or the
    CSV cannot be parsed.
    """
    # ``except Exception`` (not a bare ``except``) so Ctrl-C and SystemExit
    # are not reported as download failures.
    try:
        local_model = download_uri(model_uri)
    except Exception as err:
        print(f"Failed to download NLP model ({err}). Exiting...")
        sys.exit(2)
    try:
        local_csv = download_uri(data_uri)
    except Exception as err:
        print(f"Failed to download input csv file ({err}). Exiting...")
        sys.exit(2)

    model_dir = Path("/tmp/model")
    model_dir.mkdir(exist_ok=True)

    # Judge the extraction by tar's exit status: tar may print warnings on
    # stderr and still succeed.
    extraction = subprocess.run(
        ['tar', 'xzf', str(local_model), '-C', str(model_dir)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    tar_messages = extraction.stderr.decode('utf-8', errors='replace')
    if extraction.returncode != 0:
        print(tar_messages)
        print("Model extaction error. Exiting...")
        sys.exit(1)
    if tar_messages:
        print(tar_messages)
    print("Model extacted sucessfully")

    model_config = model_dir / 'model_config.json'
    with open(model_config) as f:
        config = json.load(f)

    print("Loading model")
    predictor = BertClassificationPredictor(
        model_path=str(model_dir / 'model_out'),
        label_path=str(model_dir),  # location for labels.csv file
        model_type=config['model_type'],
        multi_label=config['multi_label'],
        do_lower_case=config['do_lower_case'],
    )

    try:
        print("Loading input csv")
        df = pd.read_csv(local_csv)
    except Exception as err:
        print(f"Failed to load input csv file ({err}). Exiting...")
        sys.exit(1)

    # Validate every requested column before starting any inference.
    inference_columns = inference_columns.split(',')
    for c in inference_columns:
        if c not in df.columns:
            print(f"{c} is not a column name in input csv file. Exiting...")
            sys.exit(2)

    for c in inference_columns:
        print(f"Starting inference for {c} column")
        start = time.time()
        has_text = ~df[c].isna()
        text = df.loc[has_text, c].tolist()
        out = predictor.predict_batch(text)
        # One row per classified text, one column per label.
        result = pd.DataFrame(list(map(dict, out)))
        for r in result.columns:
            df.loc[has_text, f"{c}_{r}"] = result[r].tolist()
        print(f"Inference time for {len(text)} rows was {time.time() - start}")

    df.to_csv(local_csv, index=False)
    upload_uri(local_csv, result_uri)
    print("We are done with inference!")
def emotion_evaluation(path, arc_path=None, binarized=True, method=None):
    """Evaluate the emotions of generated stories after training.

    Generated stories are read from a tab-separated file, cut into three
    segments each, scored with the emotion classifier and, when an arc file
    is given, compared with the target emotion arcs.

    Args:
        path: Tab-separated file; the second field of each row is the
            generated text.
        arc_path: Optional text file with one whitespace-separated emotion
            arc per line. Its name minus the last four characters plus
            ``.npy`` is used as a cache for the encoded arcs.
        binarized: When truthy, compute arc/segment accuracies on one-hot
            encoded scores.
        method: Not used in this function.

    Returns:
        The ``metrics`` object after it has been updated.

    NOTE(review): ``metrics`` and ``label_path`` are never assigned inside
    this function, so both must exist at module level for it to run.
    """
    #load emotion classifier
    LABEL_PATH = "emotion_classifier/"
    MODEL_PATH = "emotion_classifier/checkpoint/bert/model_out/"
    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH,
        label_path=LABEL_PATH,  # location for labels.csv file
        multi_label=True,
        model_type='bert',
        do_lower_case=True)
    # load and process generated file[]
    # NOTE(review): the cache looked up here ("..._rl_fine.npy") is not the
    # file written further down ("..._rl_base_k40.npy") - confirm intended.
    if os.path.exists("numpy_files_v3/generated_em_dist_rl_fine.npy"):
        print("Loading computed emotion dist for generated stories...")
        generated_emotion_scores = np.load(
            "numpy_files_v3/generated_em_dist_rl_fine.npy")
    else:
        print("Start loading and processing generated stories...")
        _all_text = []
        with open(path) as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            for row in reader:
                # trim prefix context and suffix EOS
                # (str.strip removes any leading/trailing ' ' and '|'
                # characters, not the literal " | " sequence)
                txt = row[1].strip(" | ")
                ind = txt.find(" <|endoftext|>")
                txt = txt[:ind] if ind != -1 else txt
                _all_text.append(txt)
        clf_input = []
        comet_input = []
        for txt in _all_text:
            sample_story = nltk.sent_tokenize(txt)  #should be list of len 5
            # the comet input keeps the untouched sentence list
            comet_input.append(sample_story)
            # for some reason the model rarely generates not exactly 5 sentences
            # Every case below reduces the story to three segments:
            # first / middle / last.
            if len(sample_story) == 0:
                sample_story = ["", "", ""]
            elif len(sample_story) > 5:
                # more than five sentences: only the first five are used
                sample_story = [
                    sample_story[0],
                    ' '.join(sample_story[j] for j in range(1, 4)),
                    sample_story[4]
                ]
            elif len(sample_story) > 1:
                # two to five sentences: the middle is everything between
                # the first and the last (empty for two sentences)
                sample_story = [
                    sample_story[0], ' '.join(j for j in sample_story[1:-1]),
                    sample_story[-1]
                ]
            else:
                # a single sentence is used for all three segments
                sample_story = [
                    sample_story[0], sample_story[0], sample_story[0]
                ]
            clf_input.append(sample_story[:5])
        print("Start classifying generated stories...")
        generated_emotion_scores = get_emotion_dist(
            predictor, clf_input, preprint=False)  # np array (data_size, 3 * 5)
        np.save("numpy_files_v3/generated_em_dist_rl_base_k40.npy",
                generated_emotion_scores)
        print("Classification finished !")
    if arc_path is not None:
        # NOTE(review): the file object opened here is never closed
        # explicitly.
        test_arc = [i.strip().split() for i in open(arc_path)]
        print("Start computing emotion probability score")
        emo_prob_score = get_emotion_prob(generated_emotion_scores,
                                          test_arc,
                                          batch_normalize=True)
        print("clf_prob score: ", emo_prob_score)
        metrics.update({"classifier probablity score: ": emo_prob_score})
    # NOTE(review): this section reads ``test_arc`` and ``arc_path``, which
    # are only usable when ``arc_path`` was given - confirm callers never
    # combine ``binarized=True`` with ``arc_path=None``.
    if binarized:
        data_size = len(test_arc)
        # reshape the scores to (data_size, 3, -1)
        generated_emotion_scores = np.reshape(generated_emotion_scores,
                                              (data_size, 3, -1))
        # 1.0 wherever a score equals the maximum of its last axis, else 0.0
        generated_emotion_scores_bn = (generated_emotion_scores.max(
            axis=-1, keepdims=1) == generated_emotion_scores).astype(float)
        # arc_path minus its last four characters, plus ".npy", caches the
        # encoded target arcs
        if os.path.exists(arc_path[:-4] + ".npy"):
            true_emotion_scores_bn = np.load(arc_path[:-4] + ".npy")
        else:
            true_emotion_scores_bn = np.zeros_like(generated_emotion_scores)
            assert (generated_emotion_scores.shape[:2] == (len(test_arc),
                                                           len(test_arc[0])))
            # set the entry selected by EMOTION_MAP for each arc position
            for i in range(true_emotion_scores_bn.shape[0]):
                for j in range(true_emotion_scores_bn.shape[1]):
                    true_emotion_scores_bn[i][j][EMOTION_MAP[test_arc[i]
                                                             [j]]] = 1.0
            np.save(arc_path[:-4] + ".npy", true_emotion_scores_bn)
        arc_emotion_accuracy, seg_emotion_accuracy = compute_emotion_accuracy(
            generated_emotion_scores_bn, true_emotion_scores_bn)
        print("arc_emotion_accuracy: {}\n segment_emotion_accuracy: {}".format(
            arc_emotion_accuracy, seg_emotion_accuracy))
        metrics.update({
            "arc_acc": arc_emotion_accuracy,
            "segment_acc": seg_emotion_accuracy
        })
        # NOTE(review): ``label_path`` is lower-case here while the local
        # defined above is ``LABEL_PATH``.
        dic_dir = os.path.dirname(label_path)
        per_arc_accuracy = compute_per_arc_accuracy(
            generated_emotion_scores_bn, true_emotion_scores_bn, dic_dir)
        metrics.update(per_arc_accuracy)
    # compute comet-based emotion evaluation metric (Ec-Em)
    if arc_path is not None:
        test_arc_file = [i.strip().split() for i in open(arc_path)]
        print("Start generating comet inferences ...")
        # NOTE(review): ``comet_input`` is only assigned when the scores were
        # computed above, not when they were loaded from the cache file.
        comet_prediction = get_comet_prediction(comet_input)
        print("Finished generating comet inferences ...")
        comet_score = compute_edit_distance(comet_prediction,
                                            test_arc_file,
                                            batch_normalize=True)
        print("comet score: {}".format(comet_score))
        metrics.update({"comet_score: ": comet_score})
    return metrics