Example #1
    def __init__(self,
                 language,
                 label2ind,
                 student_args,
                 tokenizer=None,
                 manual_seed=1993):
        print("Compiling Student")
        self.name = student_args['model_name']
        self.num_labels = len(label2ind)
        self.label2ind = label2ind
        self.manual_seed = manual_seed
        self.language = language

        if self.name == 'bert':
            bert_name = 'camembert' if language == 'french' else 'bert'
            student_args['model_name'] = bert_name
            if self.language == 'japanese':
                student_args["use_multiprocessing"] = False
            self.clf = ClassificationModel(bert_name,
                                           monolingualbert[self.language],
                                           num_labels=self.num_labels,
                                           args=student_args)
            if self.language == 'japanese':
                from transformers import AutoTokenizer, AutoModel
                japanese_tokenizer = AutoTokenizer.from_pretrained(
                    'bert-base-japanese')
                japanese_model = AutoModel.from_pretrained(
                    'bert-base-japanese')
                self.clf.tokenizer = japanese_tokenizer
                self.clf.model.bert = japanese_model
                model_to_save = self.clf.model.module if hasattr(
                    self.clf.model, "module") else self.clf.model
                model_to_save.save_pretrained(self.clf.args["output_dir"])
                self.clf.tokenizer.save_pretrained(self.clf.args["output_dir"])
        elif self.name == 'mbert':
            student_args['model_name'] = 'bert'
            self.clf = ClassificationModel('bert',
                                           'bert-base-multilingual-cased',
                                           num_labels=self.num_labels,
                                           args=student_args)
        elif self.name == 'logreg':
            self.tokenizer = tokenizer
            if not self.tokenizer:
                raise ValueError(
                    "Need to define tokenizer for student={}".format(
                        self.name))
            self.vectorizer = TfidfVectorizer(sublinear_tf=True,
                                              min_df=5,
                                              max_df=0.9,
                                              norm='l2',
                                              ngram_range=(1, 2),
                                              analyzer='word',
                                              tokenizer=identity_fn,
                                              preprocessor=identity_fn,
                                              token_pattern=None)
            self.clf = LogisticRegression(random_state=self.manual_seed,
                                          max_iter=int(1e6))
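Example #1 references a monolingualbert mapping and an identity_fn helper that the listing does not show. A minimal sketch of what they plausibly look like; the model ids below are assumptions, not taken from the original repository:

monolingualbert = {
    'english': 'bert-base-cased',
    'french': 'camembert-base',
    'japanese': 'cl-tohoku/bert-base-japanese',
}

def identity_fn(tokens):
    # The TfidfVectorizer above is fed pre-tokenized text, so both the
    # tokenizer and the preprocessor are identity functions.
    return tokens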
Example #2
    def __init__(self, path: str):
        """Create/Load a new Model

        Args:
            path (str): if this path exists, the model is loaded from it; otherwise a new model is created at the path.
        """
        self.path = path
        try:
            self.model = ClassificationModel("distilbert", path, use_cuda=False)
        except Exception:
            # no saved model at the path; fall back to the base checkpoint
            self.model = ClassificationModel("distilbert", "distilbert-base-uncased", num_labels=len(relation_list), use_cuda=False)
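Because of that fallback, the same constructor call works for both a fresh start and a resumed run. A hypothetical usage, assuming the surrounding class is called Model as in the docstring:

m = Model("outputs/relation_model")  # loads the saved model if the directory
                                     # exists, otherwise starts from
                                     # distilbert-base-uncased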
Example #3
def get_bert_base(train_sequences,
                  dev_sequences,
                  train_targets,
                  dev_targets,
                  time_constraint=1,
                  num_cpu=1,
                  max_features=1000,
                  model="bert-base",
                  weights_dir="transformers_trained",
                  cuda=False):

    # build a dataframe with the 'text' and 'labels' columns simpletransformers expects
    total_sequences_training = train_sequences.values.tolist(
    ) + dev_sequences.values.tolist()

    total_labels_training = train_targets.tolist() + dev_targets.tolist()

    train_df = pd.DataFrame()
    train_df['text'] = total_sequences_training
    train_df['labels'] = total_labels_training

    # Create a ClassificationModel
    if model == "bert-base":
        model = ClassificationModel('bert',
                                    'bert-base-cased',
                                    num_labels=len(set(total_labels_training)),
                                    args={
                                        'reprocess_input_data': True,
                                        'overwrite_output_dir': True,
                                        "output_hidden_states": True
                                    },
                                    use_cuda=cuda)

    elif model == "roberta-base":
        model = ClassificationModel('roberta',
                                    'roberta-base',
                                    num_labels=len(set(total_labels_training)),
                                    args={
                                        'output_hidden_states': True,
                                        'reprocess_input_data': True,
                                        'overwrite_output_dir': True
                                    },
                                    use_cuda=cuda)

    model.args['num_train_epochs'] = 1
    model.args['max_seq_length'] = 256
    model.args['save_eval_checkpoints'] = False
    model.args['save_model_every_epoch'] = False
    model.args['output_dir'] = weights_dir
    model.args['save_steps'] = 400

    # Train the model
    model.train_model(train_df)
    return model
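A hypothetical invocation of get_bert_base, assuming a CSV with text and label columns split beforehand (the file name and column names are illustrative):

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('train.csv')  # assumed to have 'text' and 'label' columns
train_txt, dev_txt, train_y, dev_y = train_test_split(
    df['text'], df['label'], test_size=0.1, random_state=42)
model = get_bert_base(train_txt, dev_txt, train_y, dev_y,
                      model="bert-base", weights_dir="bert_out", cuda=False)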
Example #4
    def build_model(self):
        if self.load_from_save and self.model_path:
            model = ClassificationModel('roberta',
                                        self.model_path,
                                        use_cuda=self.CUDA,
                                        num_labels=num_labels)
        else:
            model = ClassificationModel('roberta',
                                        'roberta-base',
                                        use_cuda=self.CUDA,
                                        num_labels=num_labels)
        return model
Example #5
def main(path, valid_in_cat_path, valid_out_of_cat_path):
    steam_df = load_steam_data()
    i = 1
    print("starting training, using fold " + str(i))

    train, test = load_fold_data(path, i)
    # Train the model using roberta model
    args_dict = {'output_dir': '../../models/roberta-base-bs8-e6-fold' + str(i),
                 'use_cached_eval_features': False,
                 'reprocess_input_data': True,
                 'train_batch_size': 8,
                 'num_train_epochs': 6,
                 'fp16': False,
                 'overwrite_output_dir': True}
    model = ClassificationModel('roberta', 'roberta-base', num_labels=2, args=args_dict)
    model.train_model(train)
    print("done training model fold " + str(i))
    result, model_outputs, wrong_predictions = model.eval_model(test, acc=accuracy_score, f1=f1_score)
    acc = result['acc']
    f1 = result['f1']
    print(f"acc: {acc} , f1: {f1}")

    # Make predictions with the model
    save_path = '../../reports/steam-prediction.csv'
    print("predicting...")
    predictions, raw_outputs = model.predict(steam_df["sentence"].tolist())
    print(f"predicting finished - saved to {save_path}" )
    steam_df['prediction'] = predictions
    steam_df.to_csv(save_path, index=False)
Example #6
    def __init__(self,
                 model_type: str,
                 model_name_or_path: Union[str, Path],
                 output_dir: Path,
                 class_weights: Optional[List[float]] = None
                 ):
        print('class weights: {}'.format(class_weights))
        self.output_dir = output_dir
        self.cache_dir = output_dir / 'cache/'
        self.tensorboard_dir = output_dir / 'runs/'
        self.best_model_dir = output_dir / 'output/best_model/'

        self.model_type = model_type
        self.model_name_or_path = model_name_or_path

        self.model = ClassificationModel(self.model_type,
                                         str(self.model_name_or_path),
                                         cache_dir='/media/sarthak/HDD/data_science/fnp_resources/pretrained_models/',
                                         args={'fp16': True,
                                               'output_dir': str(self.output_dir),
                                               'cache_dir': str(self.cache_dir),
                                               'tensorboard_dir': str(self.tensorboard_dir),
                                               'best_model_dir': str(self.best_model_dir)},
                                         weight=class_weights
                                         )

        self.class_weights = class_weights
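The weight argument passes per-class loss weights through to the underlying model. A hypothetical instantiation (the class name here is an assumption, not shown in the listing):

from pathlib import Path

clf = WeightedClassifier('roberta', 'roberta-base', Path('outputs'),
                         class_weights=[0.3, 0.7])  # up-weight class 1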
Example #7
def main():
    f_path = 'Breast Cancer(Raw_data_2_Classes).csv'
    data = loadDataAsDataFrame(f_path)
    X = data
    y = data['Class'].tolist()
    training_set_size = int(0.8 * len(X))
    training_rows, test_rows, training_classes, test_classes = train_test_split(
        X, y, train_size=training_set_size, random_state=42069)
    model_args = {'overwrite_output_dir': True}
    # Create a TransformerModel
    model = ClassificationModel('roberta',
                                'roberta-base',
                                use_cuda=False,
                                args=model_args)
    #model = ClassificationModel('roberta', 'roberta-base', use_cuda=True, args=model_args)

    #change our data into a format that simpletransformers can process
    training_rows['text'] = training_rows['Text']
    training_rows['labels'] = training_rows['Class']
    test_rows['text'] = test_rows['Text']
    test_rows['labels'] = test_rows['Class']

    # Train the model
    model.train_model(training_rows)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_rows)

    print("f1 score")
    precision = result['tp'] / (result['tp'] + result['fp'])
    recall = result['tp'] / (result['tp'] + result['fn'])
    f1score = 2 * precision * recall / (precision + recall)
    print(f1score)
Example #8
def test_binary_classification(model_type, model_name):
    # Train and Evaluation data needs to be in a Pandas Dataframe of two columns.
    # The first column is the text with type str, and the second column is the
    # label with type int.
    train_data = [
        ["Example sentence belonging to class 1", 1],
        ["Example sentence belonging to class 0", 0],
    ]
    train_df = pd.DataFrame(train_data)

    eval_data = [
        ["Example eval sentence belonging to class 1", 1],
        ["Example eval sentence belonging to class 0", 0],
    ]
    eval_df = pd.DataFrame(eval_data)

    # Create a ClassificationModel
    model = ClassificationModel(
        model_type,
        model_name,
        use_cuda=False,
        args={
            "reprocess_input_data": True,
            "overwrite_output_dir": True
        },
    )

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
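A typical invocation, using a small checkpoint so the test runs quickly on CPU:

test_binary_classification("distilbert", "distilbert-base-uncased")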
Example #9
def train(train_df, max_sub_len, output_dir):
    model_type = 'distilbert'
    lr = 2e-5
    sent_length = max_sub_len
    OUTPUT_DIR = output_dir \
                 + str(datetime.datetime.now())[:19] + '_' + model_type + '_' + str(sent_length) + '_' + str(lr)
    print("model is saved at: {}".format(OUTPUT_DIR))
    training_config = {
        'output_dir': OUTPUT_DIR,
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'num_train_epochs': 2,
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'learning_rate': lr,
        'max_seq_length': sent_length
    }
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)
    model = ClassificationModel(model_type,
                                'distilbert-base-cased',
                                num_labels=4,
                                args=training_config)
    torch.cuda.empty_cache()
    model.train_model(train_df)
    return model
Example #10
    def train(self, split=0.7, num_epochs=10):
        self.le = preprocessing.LabelEncoder()
        print(list(self.dat.keys()))
        self.le.fit(list(self.dat.keys()))

        train_data = []
        eval_data = []
        for k, v in self.dat.items():
            len_train = int(round(len(v) * split))
            train_data.extend([[i, self.le.transform([k])[0]]
                               for i in v[:len_train]])

            eval_data.extend([[i, self.le.transform([k])[0]]
                              for i in v[len_train:]])

        print(train_data, eval_data)
        train_df = pd.DataFrame(train_data)
        eval_df = pd.DataFrame(eval_data)
        train_args = {
            'overwrite_output_dir': True,
            'num_train_epochs': num_epochs,
        }
        self.model = ClassificationModel(self.model_type,
                                         self.model_name,
                                         num_labels=len(list(self.dat.keys())),
                                         use_cuda=self.use_cuda,
                                         cuda_device=0,
                                         args=train_args)
        # Train the model
        self.model.train_model(train_df, eval_df=eval_df)

        # Evaluate the model
        result, model_outputs, wrong_predictions = self.model.eval_model(
            eval_df, acc=sklearn.metrics.accuracy_score)
Example #11
    def train(self, train_data: object, eval_data: object) -> object:
        """
        Create and train the chosen model based on the args

        Parameters
        ----------
        train_data : object
            train split of the train_data.
        eval_data : object
            validation split of the train_data.

        Returns
        -------
        object
            model.

        """

        # Create a ClassificationModel
        # ClassificationModel takes (model_type, model_name) in that order
        model = ClassificationModel(
            self.model_type,
            self.model_name,
            args=self.model_args,
            use_cuda=self.cuda,
            num_labels=len(self.labels) - 1,
        )
        # Train the model
        model.train_model(train_df=train_data,
                          eval_df=eval_data,
                          accuracy=accuracy_score)
        return model
Example #12
    def train(self, args=None, cleanFN=CleanText().cleanText):
        self.logger.debug("Train Simpletransformer")
        args = args if args is not None else {}
        isCudaAvailable = torch.cuda.is_available()

        if not isCudaAvailable:
            self.logger.warning("Training on CPU!")

        _modelArgs = self.modelArgs(args)

        self.logger.debug("ModelArgs: ")
        self.logger.debug("\n" + pformat(_modelArgs))
        self.loadData(cleanFN, _modelArgs)

        self.model = ClassificationModel(model_type=self.model_type,
                                         model_name=self.model_name,
                                         args=_modelArgs,
                                         use_cuda=isCudaAvailable,
                                         num_labels=2)

        if _modelArgs["lazy_loading"]:
            if not (isinstance(self.trainData, str)
                    and isinstance(self.testData, str)):
                self.logger.error("Lazy loading requires a string to a path.")
                self.logger.error(f"Train-Data-Type: {type(self.trainData)}")
                self.logger.error(f"Test-Data-Type: {type(self.testData)}")
                return None

        return self.model.train_model(train_df=self.trainData,
                                      eval_df=self.testData)
Example #13
def predict_export(data):

    X = data[args.predict_partition]['text']
    predictions = {}

    for class_name in ['arousal', 'valence', 'topic']:

        if class_name in ['arousal', 'valence']:
            class_no = 3
        else:
            class_no = 10

        trained_model_path = os.path.join('experiments/best_model/',
                                          class_name + str(False))
        model = ClassificationModel(args.model_type,
                                    trained_model_path,
                                    num_labels=class_no)
        predictions['prediction_' + class_name], _ = model.predict(X)

    predictions['id'] = data[args.predict_partition]['id']
    predictions['segment_id'] = data[args.predict_partition]['segment_id']

    df = pd.DataFrame.from_dict(predictions)  # , orient='index' .T
    header_names = [
        'id', 'segment_id', 'prediction_arousal', 'prediction_valence',
        'prediction_topic'
    ]
    df[header_names].to_csv(output_path + args.predict_partition + '.csv',
                            header=header_names,
                            index=False)
Example #14
def objective(args):
    pbar.update(1)
    try:
        # cast np values to python and convert list to dict
        args = list(map(int, args[:3])) + list(map(float, args[3:]))
        # note: 'learning_rate' appears twice below, so only the second
        # sampled value survives the dict() conversion
        args = dict(
            zip([
                'train_batch_size', 'gradient_accumulation_steps',
                'weight_decay', 'learning_rate', 'learning_rate',
                'adam_epsilon', 'warmup_ratio', 'max_grad_norm'
            ], args))
        args['overwrite_output_dir'] = True
        args['eval_batch_size'] = args['train_batch_size']
        model = ClassificationModel('albert', 'albert-base-v1', num_labels=5)

        # train model, find reverse f1, force garbage collection
        model.train_model(train, args=args)
        result, *_ = model.eval_model(test,
                                      f1=f1_multiclass,
                                      acc=accuracy_score)
        del model
        return 1. - result['f1']
    except Exception:
        print('skip')
        return 1.
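The snippet assumes a surrounding optimization loop and a pbar progress bar that the listing does not show. A sketch of one way to drive it, using scikit-optimize, whose gp_minimize passes a plain list of sampled values exactly as objective expects (the search ranges are assumptions):

from skopt import gp_minimize
from skopt.space import Integer, Real
from tqdm import tqdm

pbar = tqdm(total=50)  # the progress bar the objective updates

space = [
    Integer(8, 32),                         # train_batch_size
    Integer(1, 4),                          # gradient_accumulation_steps
    Integer(0, 1),                          # weight_decay (cast to int above)
    Real(1e-5, 1e-4, prior='log-uniform'),  # learning_rate
    Real(1e-5, 1e-4, prior='log-uniform'),  # learning_rate (duplicated name)
    Real(1e-9, 1e-7, prior='log-uniform'),  # adam_epsilon
    Real(0.0, 0.2),                         # warmup_ratio
    Real(0.5, 2.0),                         # max_grad_norm
]
result = gp_minimize(objective, space, n_calls=50)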
Example #15
def run_trainers(bucket_dir, train_args=None):
    if train_args is None:
        train_args = {}

    os.makedirs('irl_models', exist_ok=True)

    # create the progress file on first run, then read the completed entries
    if not os.path.isfile('completed_irl.txt'):
        open('completed_irl.txt', 'a').close()
    with open("completed_irl.txt", 'r') as f:
        done = [d.replace('\n', '') for d in f.readlines()]
    for train_file in os.listdir(bucket_dir):
        print(train_file[5:])
        print(done)
        if train_file[5:] not in done:
            train_df = pd.read_csv(bucket_dir + '/' + train_file +
                                   '/data_all.tsv',
                                   sep='\t')
            train_args['output_dir'] = f'irl_models/{train_file[5:]}/'
            train_args['cache_dir'] = f'cache_{train_file[5:]}/'

            train_args.update({'wandb_kwargs': {'name': train_file[5:]}})

            model = ClassificationModel('roberta',
                                        'roberta-base',
                                        args=train_args)
            print(train_df.head())
            model.train_model(train_df)

            with open("completed_irl.txt", 'a') as f:
                f.write(f"{train_file[5:]}\n")
            exit()

    with open("done.runs", 'w') as f:
        f.write(f"Done at {datetime.datetime.now()}")
Example #16
def main():
    # load train & test data
    df_train = pd.read_csv("sentiment_train.csv")
    df_test = pd.read_csv("sentiment_test.csv")

    # set random seed
    seed = 42

    # Train test split
    X_train, X_val, y_train, y_val = train_test_split(df_train['Sentence'],
                                                      df_train['Polarity'],
                                                      test_size=0.10,
                                                      random_state=seed)
    train_dataset = pd.concat([X_train, y_train], axis=1)
    val_dataset = pd.concat([X_val, y_val], axis=1)

    # Load a pre-trained model and train it with our data.
    # See all available models: https://huggingface.co/transformers/pretrained_models.html
    # Create the model; args holds the training parameters
    args = {
        'reprocess_input_data': True,
        'max_seq_length': 300,
        'num_train_epochs': 1,
        'fp16': False,
        'train_batch_size': 4,
        'overwrite_output_dir': True
    }
    my_model = ClassificationModel('roberta',
                                   'distilroberta-base',
                                   num_labels=2,
                                   use_cuda=True,
                                   cuda_device=0,
                                   args=args)
    # Train the model
    my_model.train_model(train_dataset)

    # Evaluate the model
    result, model_outputs, wrong_predictions = my_model.eval_model(
        val_dataset, acc=f1_score)
    pred_val = np.argmax(model_outputs, axis=1).tolist()

    print("Results on evaluation:")
    print("----------------------")
    print("F1 Score = {:.6f}\n".format(
        f1_score(y_val, pred_val, average='micro') * 100))

    print(classification_report(y_val, pred_val))
    print(confusion_matrix(y_val, pred_val))

    # get results on test set
    pred_test, _ = my_model.predict(df_test['Sentence'].tolist())

    # print f1 score
    print(f1_score(df_test.Polarity, pred_test))

    # print accuracy score
    print(accuracy_score(df_test.Polarity, pred_test))

    # save input/ground truth/prediction as one csv
    df_test['prediction'] = pred_test
    df_test.to_csv('q3_ans.csv', index=False)
Example #17
def train(human_file, gen_file, our_gen_file, output_dir):
    data = []
    data += [(i.strip(), 1) for i in open(human_file, 'r').readlines()]
    data += [(i.strip(), 0) for i in open(gen_file, 'r').readlines()]
    data += [(i.strip(), 0) for i in open(our_gen_file, 'r').readlines()]

    all_df = pd.DataFrame(data)

    # note: evaluate_during_training normally requires an eval_df to be
    # passed to train_model; the call below trains on all_df alone
    train_args = {
        'overwrite_output_dir': True,
        'num_train_epochs': 10,
        'process_count': 10,
        'train_batch_size': 10,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        'reprocess_input_data': True,
        'learning_rate': 1e-5,
        'evaluate_during_training': True,
        'use_early_stopping': True,
        'early_stopping_patience': 3,
        'early_stopping_metric': 'eval_loss',
        'early_stopping_metric_minimize': True,
        'no_cache': True,
        'output_dir': output_dir
    }

    model = ClassificationModel('roberta', "roberta-base", args=train_args) # You can set class weights by using the optional weight argument

    # Train the model

    model.train_model(all_df)
    print("finish the training")
Example #18
def eval(model_path, our_gen_file, human_file):
    gen = open(our_gen_file, 'r').readlines()
    gen = [i.strip() for i in gen]
    human = open(human_file, 'r').readlines()
    human = [i.strip() for i in human]

    assert len(human) == len(gen), "please balance the eval file"

    # labels assumed from the training convention above: generated=0, human=1;
    # eval_model needs a label column alongside the text
    test_df = pd.DataFrame([(t, 0) for t in gen] + [(t, 1) for t in human])
    test_input = test_df.sample(frac=1, random_state=123)

    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 400,
        'max_seq_length': 300,
        'fp16': False
    }

    model = ClassificationModel('roberta', model_path, num_labels=4, use_cuda=True, cuda_device=0, args=train_args)

    result, model_outputs, wrong_predictions = model.eval_model(test_input)
    print(result)
Example #19
def generate_prob_matrix(arguments):
	my_args = {
		"max_seq_length": 256,
		"train_batch_size": 16,
		"eval_batch_size": 16,
		"do_lower_case": True,
		"manual_seed": 17
	}

	model = ClassificationModel('bert', "relation_processing/model/bert", use_cuda=False, args=my_args)
	num_arguments = len(arguments)
	prob_matrix = np.zeros((num_arguments, num_arguments))
	for rel_from in range(1, num_arguments):
		for rel_to in arguments[rel_from].compare_list:
			if rel_from == rel_to:
				continue
			logging.info("calculating: " + str(rel_from) + "-->" + str(rel_to))
			
			timer = datetime.now()
			predictions, raw_outputs = model.predict([[arguments[rel_to].sentence, arguments[rel_from].sentence]])
			rel = softmax(raw_outputs, axis=1)
			Stats.h_bert_time += datetime.now() - timer
			Stats.h_bert += 1
			
			logging.debug(rel)
			prob_matrix[rel_to][rel_from] = rel[0][1]
	return prob_matrix
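The nested list passed to predict is the sentence-pair input format simpletransformers expects. A minimal standalone illustration (the model directory here is an assumption):

from scipy.special import softmax
from simpletransformers.classification import ClassificationModel

pair_model = ClassificationModel('bert', 'outputs/pair_model', use_cuda=False)
predictions, raw_outputs = pair_model.predict([["premise sentence", "hypothesis sentence"]])
probabilities = softmax(raw_outputs, axis=1)  # per-pair class probabilities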
Example #20
def loadmodels():  # type: () -> None
    """
    Loads in-memory resources for classification and search
    """

    global accsearch, unaccsearch, eulamodel

    accsearch = [
        row for row in helpers.accExamples
        if helpers.goodsize(row['Clause Text'])
    ]
    accsearch = [addtoks(row) for row in accsearch]
    unaccsearch = [
        row for row in helpers.unaccExamples
        if helpers.goodsize(row['Clause Text'])
    ]
    unaccsearch = [addtoks(row) for row in unaccsearch]
    modeldir = helpers.getmodelfolder()
    accargs = buildbertargs()
    accargs.output_dir = modeldir
    eulamodel = ClassificationModel('roberta',
                                    modeldir,
                                    args=accargs,
                                    weight=[2, 1],
                                    use_cuda=False)
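buildbertargs() is not shown in the listing. Since the snippet assigns output_dir as an attribute, the helper presumably returns an attribute-style args object; a minimal sketch using the ClassificationArgs class from newer simpletransformers releases (an assumption about the original):

from simpletransformers.classification import ClassificationArgs

def buildbertargs():
    # hypothetical reconstruction; only the attribute style is implied by the snippet
    args = ClassificationArgs()
    args.reprocess_input_data = True
    args.overwrite_output_dir = True
    return args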
Example #21
    def __init__(self, model_type, model_name, model_args):
        self.train_df = pd.read_pickle("D:/Language Models/train_df_500000")
        self.eval_df = pd.read_pickle("D:/Language Models/test_df_500000")
        self.model = ClassificationModel(model_type,
                                         model_name,
                                         use_cuda=False,
                                         args=model_args)
Example #22
def train_stance_clf(data_dir, output_dir, **kwargs):
    headlines, bodies, labels = fnc(
        os.path.join(data_dir, 'combined_stances_train.csv'),
        os.path.join(data_dir, 'combined_bodies_train.csv'))

    list_of_tuples = list(zip(headlines, bodies, labels))
    df = pd.DataFrame(list_of_tuples, columns=['text_a', 'text_b', 'label'])
    train_df, val_df = train_test_split(df, random_state=123)
    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        "fp16": False,
        'output_dir': output_dir
    }

    model = ClassificationModel('roberta',
                                "roberta-base",
                                num_labels=4,
                                use_cuda=True,
                                cuda_device=0,
                                args=train_args)

    model.train_model(train_df)
Example #23
    def __init__(self,
                 dir_path,
                 model_path,
                 resources_path,
                 use_cuda,
                 debugging=False):
        self.dir_path = dir_path
        self.model_path = model_path
        self.resources_path = resources_path
        self.debugging = debugging
        self.mapper = FineGrainedClassifier(self.resources_path)
        if not os.path.exists(model_path):
            print('Model Path not found!')
            return
        # initializing models
        bert_model_path = os.path.join(model_path, "bert_model")
        self.bert_model = ClassificationModel('bert',
                                              bert_model_path,
                                              use_cuda=use_cuda,
                                              args={'from_tf': False})
        print("Initialized BERT model")
        self.svm_est_model = pickle.load(
            open(os.path.join(model_path, 'svm_estimator.sav'), 'rb'))
        print("Initialized SVM model.")
        self.lr_model = pickle.load(
            open(os.path.join(model_path, 'lr.sav'), 'rb'))
        print("Initialized LR model.")
Example #24
def eval_stance_clf(model_path, src_path, gen_path, **kwargs):
    src = open(src_path, 'r').readlines()
    gen = open(gen_path, 'r').readlines()
    gen = [i.strip() for i in gen]
    src = [i.strip() for i in src]

    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 400,
        'max_seq_length': 300,
        "fp16": False
    }

    model = ClassificationModel('roberta',
                                model_path,
                                num_labels=4,
                                use_cuda=True,
                                cuda_device=0,
                                args=train_args)

    pairs = [[i, j] for i, j in zip(src, gen)]
    predictions, raw_outputs = model.predict(pairs)
    counts = Counter(predictions)
    counts = sorted(counts.items(), key=lambda x: x[0])
    print(counts)
Example #25
def transformer(train_df, eval_df, datafile):

    #tokenizer = BertTokenizer.from_pretrained("bert-base-dutch-cased")
    model = ClassificationModel(
        "bert", "bert-base-dutch-cased", use_cuda=False, num_labels=2
    )  # You can set class weights by using the optional weight argument

    # Train the model
    model.train_model(train_df)

    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
    print(model_outputs)

    predlist = []
    model1_outputs = model_outputs.tolist()
    for output in model1_outputs:
        if output[0] > output[1]:
            prediction = 0
        else:
            prediction = 1
        predlist.append(prediction)

    labels = eval_df["labels"].tolist()
    print(labels)
    print(predlist)

    print(classification_report(labels, predlist))
    print(confusion_matrix(labels, predlist))
    print(accuracy_score(labels, predlist))
Example #26
def get_evaluation_parameter(model):
    eval_df = pd.read_csv("data/reviews/new_test.csv", header=None)
    eval_df.columns = ["text", "labels"]

    best_model_dir = f'outputs/{model}/best_model'
    model = ClassificationModel(model, best_model_dir)
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
    print('Results:', result)
    print('Outputs:', model_outputs)

    plots = []
    differences = []
    max_difference = 0
    min_difference = 5
    for i in range(len(model_outputs)):
        value = round(abs(model_outputs[i] - eval_df['labels'][i]), 2)
        actual = round(eval_df['labels'][i], 2)
        plots.append([actual, model_outputs[i], value])

        if value > max_difference:
            max_difference = value
        if value < min_difference:
            min_difference = value

        differences.append(value)

    print('Max Difference:', max_difference)  # 3.8447265625
    print('Min Difference:', min_difference)  # 0.0

    parameter = sum(differences) / len(differences)
    print('Parameter:', parameter)  # 0.40202807008058644

    pd.DataFrame(differences).to_csv("test.csv", index=False)
    pd.DataFrame(plots).to_csv("plots.csv", index=False)
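The parameter computed above is simply the mean absolute error between the regression outputs and the labels; assuming the same arrays, numpy gives it directly:

import numpy as np

mae = np.mean(np.abs(np.asarray(model_outputs) - eval_df['labels'].to_numpy()))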
Example #27
def fake_classify(train_set, eval_set, test_set, seed):

    # Create a TransformerModel

    model = ClassificationModel('bert',
                                'bert-base-multilingual-uncased',
                                args={
                                    'num_train_epochs': 3,
                                    'overwrite_output_dir': True,
                                    'manual_seed': seed
                                },
                                use_cuda=True)
    print(model.args)
    # Train the model
    model.train_model(train_set)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score)
    #print('Evaluation results = ', results(results))

    #save the model

    #import torch
    #torch.save(model, path) --> no need to do this, model gets saved in output dir

    return result, model_outputs, wrong_predictions
Example #28
def init(model_path):
    """
    Loads Bert Model
    :param model_path: Path of BERT Model to load
    :return: model
    """
    os.environ['WANDB_MODE'] = 'dryrun'
    logging.set_verbosity_warning()  # transformers' logging module, not the stdlib
    train_args = {
        "reprocess_input_data": True,
        "fp16": False,
        "num_train_epochs": 30,
        "overwrite_output_dir": True,
        "save_model_every_epoch": True,
        "save_eval_checkpoints": True,
        "learning_rate": 5e-7,  # default 5e-5
        "save_steps": 5000,
        #"output_dir": output_dir,
        "warmup_steps": 2000,
        #"best_model_dir": output_dir + "/best_model/"
    }
    model = ClassificationModel("bert",
                                model_path,
                                num_labels=2,
                                args=train_args)
    print(model.device)
    return model
Example #29
def fake_classify(train_set, eval_set, test_set, seed):

    # Create a TransformerModel

    model = ClassificationModel('bert',
                                'bert-base-multilingual-uncased',
                                args={
                                    'max_seq_length': 512,
                                    'num_train_epochs': 3,
                                    'overwrite_output_dir': True,
                                    'manual_seed': seed
                                },
                                use_cuda=True)
    print(model.args)

    # Train the model
    model.train_model(train_set)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score)
    #print('Evaluation results = ', results(results))

    return result, model_outputs, wrong_predictions
Example #30
def create_model(model_class, model_type, model_name, num_labels, weight, args,
                 use_cuda, cuda_device, **kwargs):
    if model_class == "ClassificationModel":
        return ClassificationModel(model_type, model_name, num_labels, weight,
                                   args, use_cuda, cuda_device, **kwargs)
    elif model_class == "MultiLabelClassificationModel":
        return MultiLabelClassificationModel(model_type, model_name,
                                             num_labels, weight, args,
                                             use_cuda, cuda_device, **kwargs)
    elif model_class == "QuestionAnsweringModel":
        return QuestionAnsweringModel(model_type, model_name, args, use_cuda,
                                      cuda_device, **kwargs)
    elif model_class == "NERModel":
        return NERModel(model_type,
                        model_name,
                        args=args,
                        use_cuda=use_cuda,
                        cuda_device=cuda_device,
                        **kwargs)
    elif model_class == "T5Model":
        args = T5Args()
        args.use_multiprocessed_decoding = False
        return T5Model(model_type,
                       model_name,
                       args=args,
                       use_cuda=use_cuda,
                       cuda_device=cuda_device,
                       **kwargs)
    else:
        raise ValueError(
            "{} is either invalid or not yet implemented.".format(model_class))