Example #1
def run(model, csvs, threshold, evaluation):
    labels = [
        "anger", "anticipation", "disgust", "fear", "joy", "love", "optimism",
        "pessimism", "sadness", "surprise", "trust", "neutral"
    ]

    predictor = BertClassificationPredictor(
        model_path=args.model_dir,
        label_path="D:\\UTD\\Assignment\\NLP\\project\\",  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)

    inputs = {}
    ids = []
    data = pd.read_csv(csvs)
    # print(data.head())
    for idx, row in data.iterrows():
        temp = []
        for label in labels:
            if row[label] == 1:
                temp.append(label)
        inputs[row['text']] = temp
        ids.append(row['id'])

    multiple_predictions = predictor.predict_batch(list(inputs.keys()))
    outputs = []
    out_file = open(os.path.join(os.path.dirname(csvs), "model_output.csv"),
                    "w",
                    encoding="utf-8",
                    newline="")
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(["id", "text", "emotions", "target"])

    for i, out in enumerate(multiple_predictions):
        temp = []
        for emotion in out:
            if emotion[1] > threshold:  # greater than threshold
                temp.append(emotion[0])
        csv_writer.writerow(
            [ids[i],
             list(inputs.keys())[i], temp,
             list(inputs.values())[i]])
        outputs.append(temp)

    out_file.close()
    print("****************\n")
    print("Predictions saved in a file: ",
          os.path.join(os.path.dirname(csvs), "model_output.csv"))
    if evaluation:
        print("\n\n Running Model Evaluation\n")
        y_true = list(inputs.values())
        y_pred = outputs
        # fit one binarizer on the fixed label set so y_true and y_pred share
        # the same column order
        mlb = MultiLabelBinarizer(classes=labels)
        y_true_encoded = mlb.fit_transform(y_true)
        y_pred_encoded = mlb.transform(y_pred)
        print(classification_report(y_true_encoded, y_pred_encoded))
        print(
            classification_report(y_true_encoded,
                                  y_pred_encoded,
                                  target_names=labels))
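A minimal, self-contained sketch (toy labels, illustrative names) of why the evaluation above fits a single MultiLabelBinarizer with a fixed classes list: separate binarizers fitted on y_true and y_pred can order their columns differently, while a shared label vocabulary keeps both encodings aligned.

from sklearn.preprocessing import MultiLabelBinarizer

labels = ["anger", "joy", "sadness"]          # toy label vocabulary
y_true = [["joy"], ["anger", "sadness"]]      # gold label sets
y_pred = [["joy"], ["sadness"]]               # predicted label sets

mlb = MultiLabelBinarizer(classes=labels)     # fix the column order to `labels`
y_true_enc = mlb.fit_transform(y_true)        # [[0 1 0], [1 0 1]]
y_pred_enc = mlb.transform(y_pred)            # [[0 1 0], [0 0 1]] -- same column order
print(y_true_enc)
print(y_pred_enc)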
Example #2
    def predict_proba(self, x):
        print('\tpredicting probabilities...')
        # create pandas dataframe
        df = pd.DataFrame({'comment_text': x})
        df.to_csv(self.experiment_parameters.DATA_PATH/'test.csv')

        # create predictor object
        output_dir = 'output-%d/model_out' % self.experiment_parameters.random_state
        predictor = BertClassificationPredictor(
            model_path=(self.experiment_parameters.MODEL_PATH/output_dir).absolute().as_posix(),
            label_path=self.experiment_parameters.LABEL_PATH,
            multi_label=True,
            model_type=self.experiment_parameters.MODEL_TYPE,
            do_lower_case=True)

        # predict test labels
        output = predictor.predict_batch(list(pd.read_csv(self.experiment_parameters.DATA_PATH/'test.csv')['comment_text'].values))

        # dump results
        pd.DataFrame(output).to_csv(self.experiment_parameters.PRED_PATH/self.experiment_parameters.RESULTS_FILENAME)

        # clean output
        preds = pd.DataFrame([{item[0]: item[1] for item in pred} for pred in output])
        print(preds.head(5))

        y_pred_prob = preds.values.reshape(-1)
        y_pred_prob = pd.Series(y_pred_prob)
        return y_pred_prob
Example #3
    def __init__(self, args):
        self.gen_model_type = args.gen_model_type
        self.gen_model_path = args.gen_model_path
        self.conv_line_path = args.conv_line_path
        self.gen_length = args.length
        self.temperature = args.temperature
        self.top_k = args.top_k
        self.top_p = args.top_p
        self.stop_token = args.stop_token
        self.repetition_penalty = args.repetition_penalty
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.gen_model_type = self.gen_model_type.lower()
        self.lookup = {
            '1': 'Fashion',
            '2': 'Politics',
            '3': 'Books',
            '4': 'Sports',
            '5': 'General Entertainment',
            '6': 'Music',
            '7': 'Science & Technology',
            '8': 'Movie',
            '9': 'General'
        }
        self.topic_cls = BertClassificationPredictor(
            model_path=args.topic_cls_path,
            label_path=args.label_dir,  # sys.argv[2]: directory for labels.csv file
            multi_label=False,
            model_type='bert',
            do_lower_case=True)

        self.entity_ext_model = AutoModelForTokenClassification.from_pretrained(
            "dbmdz/bert-large-cased-finetuned-conll03-english")
        self.entity_ext_model.to(self.device)
        self.entity_ext_tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-cased")

        if self.gen_model_type == 'dialogpt':
            self.gen_tokenizer = AutoTokenizer.from_pretrained(
                self.gen_model_path)
            self.gen_model = AutoModelWithLMHead.from_pretrained(
                self.gen_model_path)
            self.gen_model.to(self.device)  # use the device selected above (falls back to CPU)
            self.gen_model.eval()
        elif self.gen_model_type == 'bart':
            self.gen_model = BARTModel.from_pretrained(
                self.gen_model_path,
                checkpoint_file='checkpoint_best.pt',
                data_name_or_path=self.gen_model_path)
            self.gen_model.to(self.device)
            self.gen_model.eval()

        self.conv_line = BARTModel.from_pretrained(
            self.conv_line_path,
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path=self.conv_line_path)
        self.conv_line.to(self.device)
        self.conv_line.eval()
Example #4
 def __init__(self, model_path, label_path):
     self.predictor = BertClassificationPredictor(
                     model_path=model_path,
                     label_path=label_path, # location for labels.csv file
                     multi_label=False,
                     model_type='bert',
                     do_lower_case=False)
     self.preprocessor = TextPreprocessor()
Example #5
 def get_predictor_model(cls):

     #print(cls.searching_all_files(PATH))
     # Get model predictor (cached on the class after the first call)
     if cls.model is None:
         with open(PATH/'model_config.json') as f:
             model_config = json.load(f)

         # build the predictor once, using the settings stored in model_config.json
         predictor = BertClassificationPredictor(PATH/'model_out', label_path=PATH,
                                                 multi_label=bool(model_config['multi_label']),
                                                 model_type=model_config['model_type'],
                                                 do_lower_case=bool(model_config['do_lower_case']))
         cls.model = predictor

     return cls.model
Example #6
class WebappConfig(AppConfig):
    name = 'fastbert'
    MODEL_PATH = pathlib.Path("model")
    BERT_PRETRAINED_PATH = pathlib.Path("model/uncased_L-12_H-768_A-12/")
    LABEL_PATH = pathlib.Path("label/")
    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH / "multilabel-emotion-fastbert-basic.bin",
        pretrained_path=BERT_PRETRAINED_PATH,
        label_path=LABEL_PATH,
        multi_label=True)
Example #7
def get_predictor(train_for):
    # Build the prediction model
    output_dir = Path('./models/%s/output/model_out' % train_for)
    label_dir = Path('./labels/%s/' % train_for)
    predictor = BertClassificationPredictor(model_path=output_dir,
                                            label_path=label_dir,
                                            multi_label=True,
                                            model_type='bert',
                                            do_lower_case=True)
    return predictor
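A brief usage sketch for the helper above, assuming a task directory such as 'sentence' exists under ./models and ./labels (the name is illustrative); the returned fast-bert predictor exposes predict and predict_batch.

predictor = get_predictor('sentence')  # 'sentence' is a placeholder task name
results = predictor.predict_batch(["sample text one", "sample text two"])
print(results)  # one list of (label, probability) pairs per input text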
Example #8
 def predict(self, text):
     predictor = BertClassificationPredictor(
         model_path=self.in_dir + '/' + self.model_dir,
         label_path=self.in_dir + '/labels',  # location for labels.csv file
         multi_label=True,
         # model_type='xlnet',
         do_lower_case=True)
     prediction = predictor.predict(str(text))[:7]
     rst_list = []
     for i in range(len(prediction)):
         rst_list.append(" #" + str(prediction[i][0]))
     return rst_list
Example #9
def threshold(model, csvs):
    labels = [
        "anger", "anticipation", "disgust", "fear", "joy", "love", "optimism",
        "pessimism", "sadness", "surprise", "trust", "neutral"
    ]

    predictor = BertClassificationPredictor(
        model_path=args.model_dir,
        label_path="D:\\UTD\\Assignment\\NLP\\project\\",  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)
    thresholds = [
        0.0005, 0.00077, 0.00079, 0.00083, 0.00087, 0.0009, 0.00093, 0.00095,
        0.00099, 0.001, 0.0012, 0.0015, 0.00155, 0.0016, 0.00166, 0.0017,
        0.0019, 0.002, 0.0021, 0.0023, 0.0025, 0.0028, 0.003, 0.0035, 0.0032,
        0.0037, 0.004, 0.0045, 0.0047, 0.0041, 0.005, 0.0053, 0.0055, 0.0062,
        0.009, 0.007, 0.01, 0.011, 0.013, 0.014, 0.012, 0.015, 0.02, 0.25,
        0.03, 0.035, 0.039
    ]
    # targets = []
    inputs = {}
    data = pd.read_csv(csvs)
    # print(data.head())
    for idx, row in data.iterrows():
        temp = []
        for label in labels:
            if row[label] == 1:
                temp.append(label)
        inputs[row['text']] = temp

    multiple_predictions = predictor.predict_batch(list(inputs.keys()))
    threshold_accs = {}

    for th in thresholds:
        correct = 0
        # print(list(inputs.values())[0])
        outputs = []
        for out in multiple_predictions:
            temp = []
            for emotion in out:
                if emotion[1] >= th:  # score at or above the candidate threshold
                    temp.append(emotion[0])
            outputs.append(temp)
        # print(outputs[0])
        for i in range(len(inputs)):
            if (set(outputs[i]) == set(list(inputs.values())[i])):
                correct += 1
        print("Threshold: ", th, "Correct: ", correct)
        threshold_accs[str(th)] = correct / len(inputs)
    print(threshold_accs)
Example #10
    def __init__(self, model_dir, label_to_idx=label_to_idx):
        super().__init__()
        model_dir = Path(model_dir)
        model_config = model_dir / 'model_config.json'
        with open(model_config) as f:
            config = json.load(f)

        self.model = BertClassificationPredictor(
            model_path=str(model_dir / 'model_out'),
            label_path=str(model_dir),  # location for labels.csv file
            model_type=config['model_type'],
            multi_label=config['multi_label'],
            do_lower_case=config['do_lower_case'],
        )
        self.label_to_idx = label_to_idx
Example #11
class WebappConfig(AppConfig):
    name = 'webapp'
    pwd = os.getcwd()
    HOME = os.environ["HOME"]
    #MODEL_PATH = Path("required_files")
    #BERT_PRETRAINED_PATH = Path(pwd+"/webapp/required_files/bert_model/uncased_L-12_H-768_A-12")
    #LABEL_PATH = Path(pwd+"/webapp/labels/")
    #predictor = BertClassificationPredictor(model_path=pwd+"/webapp/required_files/emotion_try.bin", pretrained_path=BERT_PRETRAINED_PATH, label_path=LABEL_PATH, multi_label=True)
    LABEL_PATH = Path(pwd + "/webapp/labels/")
    MODEL_PATH = Path(pwd + "/webapp/model_out_roberta")
    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH,
        label_path=LABEL_PATH,  # location for labels.csv file
        multi_label=False,
        model_type='roberta',
        do_lower_case=True)
def classify_bert(text, model_path):
    """Classify genre using fast-bert.

    Fast-bert automatically uses GPU if `torch.cuda.is_available() == True`

    Parameters
    -----------
    text : <str or list(str)> a single text for one prediction, or a list of texts for batch prediction
    model_path : <str> must contain labels.csv (I've put one in the uploaded version)
            AND all model files (config.json, pytorch_model.bin, special_tokens_map.json, tokenizer_config.json, vocab.txt)

    Returns
    ---------
    str: if type(text) == str
    list: if type(text) == list or numpy array

    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    predictor = BertClassificationPredictor(
        model_path=model_path,
        label_path=model_path,  # location for labels.csv file
        multi_label=True,
        model_type='bert',
        do_lower_case=False)
    # predictor.to(device)

    if isinstance(text, str):
        # Single prediction
        pred = predictor.predict(text)
        pred = dict(pred)
        # single_prediction = predictor.predict("just get me result for this text")
    elif isinstance(text, (list, np.ndarray)):
        pred = predictor.predict_batch(text)
        # # Batch predictions
        # texts = [
        #     "this is the first text",
        #     "this is the second text"
        #     ]
        for i in range(len(pred)):
            pred[i] = dict(pred[i])

        # multiple_predictions = predictor.predict_batch(texts)
    else:
        raise ValueError("Unexpected type for input argument `text`")
    return pred
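A short usage sketch for classify_bert above; the directory name is a hypothetical placeholder and must point at a folder containing labels.csv plus the model files listed in the docstring.

model_dir = "models/genre_model_out"  # hypothetical path; must hold labels.csv and the model files

# a single string returns a dict of {label: probability}
print(classify_bert("an upbeat synth-pop track with melancholy lyrics", model_dir))

# a list (or numpy array) of strings returns one dict per input
print(classify_bert(["a gritty crime thriller", "a lighthearted romantic comedy"], model_dir))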
Example #13
    def get_predictor_model(cls):

        # print(cls.searching_all_files(PATH))
        # Get model predictor
        if cls.model is None:
            with open(os.path.join(PATH, "model_config.json")) as f:
                model_config = json.load(f)

            predictor = BertClassificationPredictor(
                os.path.join(PATH, "model_out"),
                label_path=PATH,
                multi_label=bool(model_config["multi_label"]),
                model_type=model_config["model_type"],
                do_lower_case=bool(model_config["do_lower_case"]),
            )
            cls.model = predictor

        return cls.model
def prediction(text_list: list, case_type: str):
    if case_type == 'divorce':
        model_path = '/home/zf/lyy/Data/divorce/data/model/kedaV3/model_out'
        label_path = '/home/zf/lyy/Data/divorce/data/new_data'
    elif case_type == 'loan':
        model_path = '/home/zf/lyy/Data/loan/data/model/keda/model_out'
        label_path = '/home/zf/lyy/Data/loan/new_data'
    elif case_type == 'labor':
        model_path = '/home/zf/lyy/Data/labor/data/model/keda/model_out'
        label_path = '/home/zf/lyy/Data/labor/new_data'
    else:
        raise Exception('No such case type')
    tag_dict = build_tags_dict(case_type)
    predictor = BertClassificationPredictor(model_path=model_path,
                                            label_path=label_path,
                                            multi_label=True,
                                            model_type='bert')
    output = predictor.predict_batch(text_list)

    detail_dic = {}
    result_dic = {}
    for i in range(len(output)):
        lab_list = []
        for key in output[i]:
            if float(key[1]) > 0.5:
                if key[0] not in result_dic.keys():
                    result_dic[key[0]] = 1
                else:
                    result_dic[key[0]] += 1
                if '21' not in key[0]:
                    lab_list.append(tag_dict[key[0]])
        detail_dic[text_list[i]] = lab_list
    final_result_dic = {}
    for key in result_dic.keys():
        if '21' not in key:
            final_result_dic[tag_dict[key]] = result_dic[key]
    return final_result_dic, detail_dic
Example #15
def predict_bert(experiment_parameters):
    # create predictor object
    predictor = BertClassificationPredictor(
        model_path=(experiment_parameters.MODEL_PATH/'output/model_out').absolute().as_posix(),
        label_path=experiment_parameters.LABEL_PATH,
        multi_label=True,
        model_type=experiment_parameters.MODEL_TYPE,
        do_lower_case=True)

    # predict test labels
    output = predictor.predict_batch(list(pd.read_csv(experiment_parameters.DATA_PATH/'test.csv')['comment_text'].values))

    # dump results
    pd.DataFrame(output).to_csv(experiment_parameters.PRED_PATH/experiment_parameters.RESULTS_FILENAME)

    # clean output
    preds = pd.DataFrame([{item[0]: item[1] for item in pred} for pred in output])
    print(preds.head())

    # load test data
    df_test = pd.read_csv(experiment_parameters.DATA_PATH/'test.csv')
    print(df_test.head())

    # merge dataframes
    df_pred = pd.merge(df_test, preds, how='left', left_index=True, right_index=True)
    del df_pred['comment_text']

    #df_pred = df_pred['id', 'obscene']
    df_pred['ground_truth'] = df_pred['%s_x'%LABEL_COLS[0]]
    df_pred['pred_prob'] = df_pred['%s_y'%LABEL_COLS[0]]
    del df_pred['%s_x'%LABEL_COLS[0]]
    del df_pred['%s_y'%LABEL_COLS[0]]
    print(df_pred.head())

    # write results to file
    df_pred.to_csv(experiment_parameters.PRED_PATH/experiment_parameters.RESULTS_FILENAME, index=None)
    return
Example #16
    return app


if __name__ == '__main__':
    path = 'models/model_out/pytorch_model.bin'
    bucket_path = 'https://storage.cloud.google.com/boast-trained-models/activity_classifier/pytorch_model.bin'

    # fetch model from google storage if not exist
    if bucket_path is not None and not os.path.exists(path):
        # set env key
        if 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'gcp_auth.json'

        client = storage.Client()
        bucket = client.get_bucket('boast-trained-models')
        blob = bucket.get_blob('activity_classifier/pytorch_model.bin')

        print('Downloading model...')
        with open(path, 'wb') as file_obj:
            blob.download_to_file(file_obj)

    predictor = BertClassificationPredictor(
        model_path='models/model_out',
        label_path='train',
        multi_label=False,
        model_type='distilbert',
        do_lower_case=True)

    serve(create_app(predictor), host='0.0.0.0', port=5000)
    #     if eos_token_id is not None:
    #         input_ids[i] = input_ids[i] + [eos_token_id]

    length = [len(ids) for ids in input_ids]
    max_length = max(length)
    for i in range(len(input_ids)):
        while len(input_ids[i]) < max_length:
            input_ids[i].append(eos_token_id)

    return np.array(input_ids), np.array(length)


# load trained emotion classifier
predictor = BertClassificationPredictor(
    model_path=FLAGS.clf_output_dir,
    label_path=FLAGS.clf_label_dir,  # location for labels.csv file
    multi_label=True,
    model_type='bert',
    do_lower_case=True)


def main(_):
    """
    Builds the model and runs
    """
    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    if len(config_train.name) > 0:
    args.multi_gpu = True
else:
    args.multi_gpu = False

label_cols = ["functionality", "range_anxiety", "availability", "cost", "ui", "location", "service_time", "dealership"]

databunch = BertDataBunch(args['data_dir'], LABEL_PATH, args.model_name, train_file='train_final.csv', val_file='valid_final.csv',
                          test_data='test_final.csv',
                          text_col="review", label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], 
                          multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type)

databunch.train_dl.dataset[0][3]
num_labels = len(databunch.labels)
print(num_labels)
metrics = []
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})
print(device)
MODEL_PATH = '../models/output/model_out/'
LABEL_PATH = '.'
predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path=LABEL_PATH,  # location for labels.csv file
    multi_label=True,
    model_type='xlnet',
    do_lower_case=False)
predictions = predictor.predict_batch(list(pd.read_csv('test_final.csv')['review'].values))
from fast_bert.prediction import BertClassificationPredictor
from pathlib import Path

DATA_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/data/')
LABEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/labels/')
MODEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/models/')
LOG_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/logs/')

# location for the pretrained BERT models
BERT_PRETRAINED_PATH = Path(
    '../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12/')

predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                        pretrained_path=BERT_PRETRAINED_PATH,
                                        label_path=LABEL_PATH,
                                        multi_label=False)

# Single prediction
single_prediction = predictor.predict("just get me result for this text")

# Batch predictions
texts = ["this is the first text", "this is the second text"]

multiple_predictions = predictor.predict_batch(texts)
Example #20
                    default=True)
parser.add_argument(
    '--dataset',
    type=str,
    help='which split of the Alexa topical dataset to use for testing; options are train, valid_rare, valid_freq, test_freq, test_rare',
    required=True,
    choices=['train', 'valid_rare', 'valid_freq', 'test_freq', 'test_rare'])
args = parser.parse_args()
MODEL_DIR = args.model_dir  #sys.argv[1]

MODEL_PATH = path.join(MODEL_DIR, 'model_out')

predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path=args.label_dir,  #sys.argv[2], # directory for labels.csv file
    multi_label=False,
    model_type='bert',
    do_lower_case=True)
INPUT = os.path.join('data', args.input_name + '.csv')
texts = list(csv.reader(open(INPUT, 'rt')))  # sys.argv[3]
batchsize = args.batch_size
multiple_predictions = []
for i in tqdm(range(1, len(texts), batchsize)):
    batch_texts = []
    if i + batchsize > len(texts):
        for j in range(i, len(texts)):
            batch_texts.append(texts[j][0])
        tmp_pred = predictor.predict_batch(batch_texts)
        multiple_predictions.extend(tmp_pred)
    else:
        for j in range(i, i + batchsize):
Example #21
        'AVERAGE_2': {
            'precission': 0.0,
            'recall': 0.0,
            'f1': 0.0
        },
        'micro': {
            'precission': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }
        #'micro_2': {'precission': 0.0, 'recall': 0.0, 'f1': 0.0}
    }

    predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                            label_path=LABEL_PATH,
                                            multi_label=False,
                                            model_type='bert',
                                            do_lower_case=False)

    if args.file_in[-3:] == 'csv':
        df_in = pd.read_csv(args.file_in, encoding='utf-8')
        truth_file = args.truth
        generate_metrics_report(df_in, truth_file, name=args.model_path)
    else:
        *_, truth_files = list(next(os.walk(args.truth)))
        *_, test_files = list(next(os.walk(args.file_in)))
        print(f'length of truth_files: {len(truth_files)}')
        print(f'length of test_files: {len(test_files)}')
        print(f'tests: {test_files}, \ntruth: {truth_files}')

        TEST_REPORT = pd.DataFrame()
Example #22
import pandas as pd
from fast_bert.prediction import BertClassificationPredictor
from flask import Flask, jsonify, request
import re

app = Flask(__name__)
app.config.from_object(__name__)

MODEL_PATH = 'model/'

predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                        label_path='',
                                        multi_label=True,
                                        use_fast_tokenizer=False,
                                        model_type='bert',
                                        do_lower_case=False)


@app.route('/predict', methods=['POST'])
def predict():
    if request.method == 'POST':
        texto = request.data
        s = re.sub(r'\W+', ' ', texto.decode('ASCII'))
        respuesta = predictor.predict(s.split('bertmedicalstring ')[1])
        return jsonify({
            'Clase1': respuesta[0][0],
            'Puntaje1': respuesta[0][1],
            'Clase2': respuesta[1][0],
            'Puntaje2': respuesta[1][1]
        })
Example #23
# Get the number of GPUs
device = torch.device('cuda')
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

# Set the labels to use
label_cols = (sentence_labels
              if train_for == 'sentence' else fine_grained_labels)

# Build the prediction model
predictor = BertClassificationPredictor(model_path=args.output_dir / 'model_out',
                                        label_path=LABEL_PATH,
                                        multi_label=True,
                                        model_type=args.model_type,
                                        do_lower_case=True)

# Read the test data and run batch prediction
output = predictor.predict_batch(
    list(
        pd.read_csv(str(
            DATA_PATH.joinpath('test.csv').absolute()))['text'].values))

# Write out the prediction results
pd.DataFrame(output).to_csv(
    str(DATA_PATH.joinpath('output_bert.csv').absolute()))

# Read the prediction results back in
results = pd.read_csv(str(DATA_PATH.joinpath('output_bert.csv').absolute()))
Example #24
from fast_bert.prediction import BertClassificationPredictor
import pandas as pd
import csv
import json
import copy

predictor = BertClassificationPredictor(
    model_path='./Data/labor/data/model/keda/model_out',
    label_path='./Data/labor/new_data',
    multi_label=True,
    model_type='bert')

text_list = list(pd.read_csv("./Data/labor/new_data/test.csv")['text'].values)
output = predictor.predict_batch(text_list)

print(output)
Example #25
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys
import pandas as pd
from fast_bert.prediction import BertClassificationPredictor
import pickle
import json

MODEL_PATH = 'output/model_out/'

predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path='./',  # location for labels.csv file
    multi_label=False,
    model_type='bert',
    do_lower_case=False)

# predict each row of the dev set, one at a time
test_data = pd.read_csv('dev.csv')

x = 0
for item in test_data.text:
    prediction = predictor.predict(item)
    # predictor.predict returns (label, score) pairs sorted by score;
    # write the row id, the top label and its score
    with open('predictions.tsv', 'a') as fp:
        fp.write(
            str(test_data.id[x]) + '\t' + str(prediction[0][0]) + '\t' +
            str(prediction[0][1]) + '\n')
    x = x + 1
Example #26
def main(model_uri: Param("S3 uri with NLP model", str),
         data_uri: Param("S3 uri with input csv file", str),
         result_uri: Param("S3 uri where to put output csv file with added inference columns", str),
         inference_columns: Param("comma-separated text columns in the csv file on which inference will be run", str)):
    try:
        local_model = download_uri(model_uri)
    except Exception:
        print("Failed to download NLP model. Exiting...")
        sys.exit(2)

    try:
        local_csv = download_uri(data_uri)
    except Exception:
        print("Failed to download input csv file. Exiting...")
        sys.exit(2)

    model_dir = Path("/tmp/model")
    model_dir.mkdir(exist_ok=True)

    out = subprocess.Popen(['tar', 'xzf', local_model, '-C', model_dir],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)

    stdout, stderr = out.communicate()
    if not stderr:
        print("Model extacted sucessfully")
    else:
        print(stderr.decode('ascii'))
        print(f"Model extaction error. Exiting...")
        sys.exit(1)

    model_config = model_dir / 'model_config.json'
    with open(model_config) as f:
        config = json.load(f)

    print("Loading model")

    predictor = BertClassificationPredictor(
        model_path=str(model_dir / 'model_out'),
        label_path=str(model_dir),  # location for labels.csv file
        model_type=config['model_type'],
        multi_label=config['multi_label'],
        do_lower_case=config['do_lower_case'],
    )
    try:
        print("Loading input csv")
        df = pd.read_csv(local_csv)
    except Exception:
        print("Failed to load input csv file. Exiting...")
        sys.exit(1)

    inference_columns = inference_columns.split(',')
    for c in inference_columns:
        if c not in df.columns:
            print(f"{c} is not a column name in input csv file. Exiting...")
            sys.exit(2)

    for c in inference_columns:

        print(f"Starting inference for {c} column")

        start = time.time()

        text = df.loc[~df[c].isna(), c].tolist()

        out = predictor.predict_batch(text)
        result = pd.DataFrame(list(map(dict, out)))
        for r in result.columns:
            df.loc[~df[c].isna(), f"{c}_{r}"] = result[r].tolist()

        print(f"Inference time for {len(text)} rows was {time.time() - start}")

    df.to_csv(local_csv, index=False)

    upload_uri(local_csv, result_uri)

    print("We are done with inference!")
Example #27
def emotion_evaluation(path, arc_path=None, binarized=True, method=None):
    """
    for test after finishing training
    """

    #load emotion classifier
    LABEL_PATH = "emotion_classifier/"
    MODEL_PATH = "emotion_classifier/checkpoint/bert/model_out/"

    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH,
        label_path=LABEL_PATH,  # location for labels.csv file
        multi_label=True,
        model_type='bert',
        do_lower_case=True)

    # load and process the generated stories file
    if os.path.exists("numpy_files_v3/generated_em_dist_rl_fine.npy"):
        print("Loading computed emotion dist for generated stories...")
        generated_emotion_scores = np.load(
            "numpy_files_v3/generated_em_dist_rl_fine.npy")
    else:
        print("Start loading and processing generated stories...")
        _all_text = []
        with open(path) as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            for row in reader:
                # trim prefix context and suffix EOS
                txt = row[1].strip(" | ")
                ind = txt.find(" <|endoftext|>")
                txt = txt[:ind] if ind != -1 else txt
                _all_text.append(txt)

        clf_input = []
        comet_input = []
        for txt in _all_text:
            sample_story = nltk.sent_tokenize(txt)  # should be a list of 5 sentences
            comet_input.append(sample_story)
            # occasionally the model does not generate exactly 5 sentences,
            # so fall back to a (first, middle, last) 3-segment split below
            if len(sample_story) == 0:
                sample_story = ["", "", ""]
            elif len(sample_story) > 5:
                sample_story = [
                    sample_story[0],
                    ' '.join(sample_story[j] for j in range(1, 4)),
                    sample_story[4]
                ]
            elif len(sample_story) > 1:
                sample_story = [
                    sample_story[0], ' '.join(j for j in sample_story[1:-1]),
                    sample_story[-1]
                ]
            else:
                sample_story = [
                    sample_story[0], sample_story[0], sample_story[0]
                ]

            clf_input.append(sample_story[:5])

        print("Start classifying generated stories...")
        generated_emotion_scores = get_emotion_dist(
            predictor, clf_input,
            preprint=False)  # np array (data_size, 3 * 5)
        np.save("numpy_files_v3/generated_em_dist_rl_base_k40.npy",
                generated_emotion_scores)
        print("Classification finished !")

    if arc_path is not None:
        test_arc = [i.strip().split() for i in open(arc_path)]
        print("Start computing emotion probability score")
        emo_prob_score = get_emotion_prob(generated_emotion_scores,
                                          test_arc,
                                          batch_normalize=True)
        print("clf_prob score: ", emo_prob_score)
        metrics.update({"classifier probablity score: ": emo_prob_score})

    if binarized:
        data_size = len(test_arc)
        generated_emotion_scores = np.reshape(generated_emotion_scores,
                                              (data_size, 3, -1))

        generated_emotion_scores_bn = (generated_emotion_scores.max(
            axis=-1, keepdims=1) == generated_emotion_scores).astype(float)

        if os.path.exists(arc_path[:-4] + ".npy"):
            true_emotion_scores_bn = np.load(arc_path[:-4] + ".npy")
        else:
            true_emotion_scores_bn = np.zeros_like(generated_emotion_scores)
            assert generated_emotion_scores.shape[:2] == (len(test_arc), len(test_arc[0]))
            for i in range(true_emotion_scores_bn.shape[0]):
                for j in range(true_emotion_scores_bn.shape[1]):
                    true_emotion_scores_bn[i][j][EMOTION_MAP[test_arc[i][j]]] = 1.0

            np.save(arc_path[:-4] + ".npy", true_emotion_scores_bn)

        arc_emotion_accuracy, seg_emotion_accuracy = compute_emotion_accuracy(
            generated_emotion_scores_bn, true_emotion_scores_bn)
        print("arc_emotion_accuracy: {}\n segment_emotion_accuracy: {}".format(
            arc_emotion_accuracy, seg_emotion_accuracy))
        metrics.update({
            "arc_acc": arc_emotion_accuracy,
            "segment_acc": seg_emotion_accuracy
        })

        dic_dir = os.path.dirname(label_path)
        per_arc_accuracy = compute_per_arc_accuracy(
            generated_emotion_scores_bn, true_emotion_scores_bn, dic_dir)
        metrics.update(per_arc_accuracy)

    # compute comet-based emotion evaluation metric (Ec-Em)
    if arc_path is not None:
        test_arc_file = [i.strip().split() for i in open(arc_path)]
        print("Start generating comet inferences ...")
        comet_prediction = get_comet_prediction(comet_input)
        print("Finished generating comet inferences ...")
        comet_score = compute_edit_distance(comet_prediction,
                                            test_arc_file,
                                            batch_normalize=True)
        print("comet score: {}".format(comet_score))
        metrics.update({"comet_score: ": comet_score})

    return metrics