Example #1
def eval(model_path, our_gen_file, human_file):
    gen = open(our_gen_file, 'r').readlines()
    gen = [i.strip() for i in gen]
    human = open(human_file, 'r').readlines()
    human = [i.strip() for i in human]

    assert len(human) == len(gen), "please balance the eval file"

    # eval_model needs a labels column: generated text is labeled 0 and
    # human text 1, matching the convention in the train() example below
    test_df = pd.DataFrame([(t, 0) for t in gen] + [(t, 1) for t in human])
    test_input = test_df.sample(frac=1, random_state=123)

    train_args = {
        'learning_rate':3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 400,
        'max_seq_length': 300,
        "fp16":False
    }

    model = ClassificationModel('roberta', model_path, num_labels=4, use_cuda=True, cuda_device=0, args=train_args)

    result, model_outputs, wrong_predictions = model.eval_model(test_input)
    print(result)
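A minimal sketch of calling eval() above; the checkpoint path and file names are illustrative assumptions, and each file holds one sample per line:

# Hypothetical paths: a previously finetuned checkpoint plus two text files,
# one sample per line (generated text -> label 0, human text -> label 1).
eval('outputs/best_model', 'data/generated.txt', 'data/human.txt')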
Example #2
def train_model(args, output_dir, cache_dir):
    """
    Train a SimpleTransformers model based on the given arguments, save and return it.
    :param args: Arguments as processed by parse_args() containing architecture and epochs.
    :param output_dir: Path to the directory in which the model should be stored.
    :param cache_dir: Path to the directory in which the cache should be stored.
    :return: SimpleTransformers model trained based on the given arguments.
    """
    print('=> Training model...')

    # Set model arguments
    model_args = {
        'num_train_epochs': args.num_epochs,
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'output_dir': output_dir,
        'cache_dir': cache_dir
    }

    # Train the model
    pretrained = get_transformer_model(args.arch)
    model = ClassificationModel(args.arch,
                                pretrained,
                                use_cuda=True,
                                args=model_args)
    train = load_corpus('train')
    model.train_model(train)

    return model
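get_transformer_model and load_corpus are project helpers that the snippet does not include; a minimal sketch of what they might provide (the checkpoint mapping and the toy rows are assumptions):

import pandas as pd

def get_transformer_model(arch):
    # Map an architecture name to a default pretrained checkpoint (illustrative).
    return {'bert': 'bert-base-uncased', 'roberta': 'roberta-base'}[arch]

def load_corpus(split):
    # simpletransformers expects a two-column DataFrame: text (str), labels (int).
    rows = [("an example sentence from class 1", 1),
            ("an example sentence from class 0", 0)]
    return pd.DataFrame(rows, columns=['text', 'labels'])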
Example #3
def main():
    f_path = 'Breast Cancer(Raw_data_2_Classes).csv'
    data = loadDataAsDataFrame(f_path)
    X = data
    y = data['Class'].tolist()
    training_set_size = int(0.8 * len(X))
    training_rows, test_rows, training_classes, test_classes = train_test_split(
        X, y, train_size=training_set_size, random_state=42069)
    model_args = {'overwrite_output_dir': True}
    # Create a TransformerModel
    model = ClassificationModel('roberta',
                                'roberta-base',
                                use_cuda=False,
                                args=model_args)
    #model = ClassificationModel('roberta', 'roberta-base', use_cuda=True, args=model_args)

    #change our data into a format that simpletransformers can process
    training_rows['text'] = training_rows['Text']
    training_rows['labels'] = training_rows['Class']
    test_rows['text'] = test_rows['Text']
    test_rows['labels'] = test_rows['Class']

    # Train the model
    model.train_model(training_rows)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_rows)

    print("f1 score")
    precision = result['tp'] / (result['tp'] + result['fp'])
    recall = result['tp'] / (result['tp'] + result['fn'])
    f1score = 2 * precision * recall / (precision + recall)
    print(f1score)
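Instead of deriving F1 from the raw confusion counts, eval_model also accepts metric callables as keyword arguments (later examples in this collection do exactly that); a sketch continuing from main() above:

import sklearn.metrics

# eval_model forwards extra keyword arguments as named metrics
# computed over (true labels, predicted labels).
result, model_outputs, wrong_predictions = model.eval_model(
    test_rows, f1=sklearn.metrics.f1_score)
print(result['f1'])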
Example #4
            def train():
                wandb.init(project=WAND_PROJECT_NAME)
                modelArgs = { "max_seq_length": self.maxSeqLength, "output_dir": self.modelOutputDir, "overwrite_output_dir": True, "best_model_dir": self.bestModelOutputDir,
                              "wandb_project": WAND_PROJECT_NAME, "num_training_epochs": wandb.config.epochs, "learning_rate": wandb.config.learning_rate,
                              "do_lower_case": True, "cache_dir": self.modelCacheDir, "encoding": "utf-8", "train_batch_size": 5, "eval_batch_size": 5,
                              "evaluate_during_training_steps": 50, "evaluate_during_training_verbose": True, "logging_steps": 5, "sliding_window": True,
                              "reprocess_input_data": True, "evaluate_during_training": True, "use_multiprocessing": True,
                              "labels_list": SECTOR_LABELS }

                model = ClassificationModel(self.modelType, self.modelNameOrPath, args=modelArgs, sweep_config=wandb.config, use_cuda=torch.cuda.is_available(), num_labels=len(SECTOR_LABELS), )

                # Training and evaluation
                try:
                    log.info(f"Started training/finetuning BERT on multi-class classification task..")
                    model.train_model(train_df=self.trainDataset, eval_df=self.evalDataset, show_running_loss=True,
                                      output_dir=self.modelOutputDir,
                                      mcc=sklearn.metrics.matthews_corrcoef,
                                      acc=sklearn.metrics.balanced_accuracy_score, )
                    log.info(f"Finished finetuning and evaluating our fine-tuned model on multi-class classification task. Check the folder '{self.modelOutputDir}' for finetuned weights.")
                    log.info(f"It took {round((time.time() - startTime) / 3600, 1)} hours to finetune and evaluate our fine-tuned model on multi-class classification task.")
                except Exception:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    err = f"Error occurred while training and evaluating the finetuned model on multi-class classification task. Error is: {exc_type}; {exc_value}."
                    log.error(err)

                wandb.join()
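train() above reads wandb.config.epochs and wandb.config.learning_rate, so it is meant to run under a wandb sweep; a minimal sketch of a matching sweep driver (the search method, ranges, and metric are assumptions):

import wandb

sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'eval_loss', 'goal': 'minimize'},
    'parameters': {
        'epochs': {'values': [2, 3, 5]},
        'learning_rate': {'min': 1e-5, 'max': 5e-5},
    },
}
sweep_id = wandb.sweep(sweep_config, project=WAND_PROJECT_NAME)
wandb.agent(sweep_id, function=train)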
Example #5
def train_model(train_df, num_labels):
    model_type, model_name = MODELNAME.split(";")
    model_output = 'models/{}-{}-{}'.format(TAG, model_type, model_name.replace("/", "-"))
    if OVERWRITE is False and os.path.exists(model_output):
        logging.info("Skipping training of {}".format(model_name))
        sys.exit(0)
    logging.info("Starting training of {}".format(model_name))
    run = wandb.init(project=model_output.split("/")[-1], reinit=True)

    model = ClassificationModel(
        model_type, model_name, num_labels=num_labels, args={
            'output_dir': model_output,
            'overwrite_output_dir': OVERWRITE,
            'best_model_dir': '{}/best'.format(model_output),
            'evaluate_during_training': False,
            'manual_seed': 42,
            'num_train_epochs': 4,
            # 'learning_rate': 2e-5,  # For BERT, 5e-5, 3e-5, 2e-5
            # For BERT the usual batch sizes are 16 or 32; gradient accumulation
            # below keeps the effective batch size equivalent for large models
            'train_batch_size': 8 if "large" in model_name else 32,
            'eval_batch_size': 8 if "large" in model_name else 32,
            # Doubles the effective batch size: gradients accumulate and weights
            # update once every 2 steps
            'gradient_accumulation_steps': 2 if "large" in model_name else 1,
            'max_seq_length': 256,
            'sliding_window': False,
            'wandb_project': model_output.split("/")[-1],
            # "adam_epsilon": 3e-5,  # 1e-8
            "silent": False,
            "fp16": False,  # By default it uses 32 bit floating point
            "n_gpu": 1,
    })
    # train the model
    model.train_model(train_df)
    return model, run
Example #6
 def load_models(self):
     args = {'eval_batch_size': 32, 'silent': True}
     self.model = ClassificationModel(self.model_emb,
                                      self.tr_path,
                                      num_labels=self.num_classes,
                                      args=args,
                                      use_cuda=False)
Example #7
def train(human_file, gen_file, our_gen_file, output_dir):
    data = []
    data += [(i.strip(), 1) for i in open(human_file,'r').readlines()]
    data += [(i.strip(), 0) for i in open(gen_file,'r').readlines()]
    data += [(i.strip(), 0) for i in open(our_gen_file,'r').readlines()]

    all_df = pd.DataFrame(data)

    train_args = {
        'overwrite_output_dir': True,
        'num_train_epochs': 10,
        'process_count': 10,
        'train_batch_size': 10,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        'reprocess_input_data': True,
        'learning_rate': 1e-5,
        'evaluate_during_training': True,
        'use_early_stopping': True,
        'early_stopping_patience': 3,
        'early_stopping_metric': 'eval_loss',
        'early_stopping_metric_minimize': True,
        'no_cache': True,
        'output_dir': output_dir
    }

    model = ClassificationModel('roberta', "roberta-base", args=train_args) # You can set class weights by using the optional weight argument

    # Train the model

    model.train_model(all_df)
    print("finish the training")
Example #8
def train_stance_clf(data_dir, output_dir, **kwargs):
    headlines, bodies, labels = fnc(
        os.path.join(data_dir, 'combined_stances_train.csv'),
        os.path.join(data_dir, 'combined_bodies_train.csv'))

    list_of_tuples = list(zip(headlines, bodies, labels))
    df = pd.DataFrame(list_of_tuples, columns=['text_a', 'text_b', 'label'])
    train_df, val_df = train_test_split(df, random_state=123)
    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        "fp16": False,
        'output_dir': output_dir
    }

    model = ClassificationModel('roberta',
                                "roberta-base",
                                num_labels=4,
                                use_cuda=True,
                                cuda_device=0,
                                args=train_args)

    model.train_model(train_df)
Example #9
def fake_classify(train_set, eval_set, test_set, seed):

    # Create a TransformerModel

    model = ClassificationModel('bert',
                                'bert-base-multilingual-uncased',
                                args={
                                    'max_seq_length': 512,
                                    'num_train_epochs': 3,
                                    'overwrite_output_dir': True,
                                    'manual_seed': seed
                                },
                                use_cuda=True)
    print(model.args)

    # Train the model
    model.train_model(train_set)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score)
    #print('Evaluation results = ', results(results))

    return result, model_outputs, wrong_predictions
Example #10
 def __init__(self, model_type, model_name, model_args):
     self.train_df = pd.read_pickle("D:/Language Models/train_df_500000")
     self.eval_df = pd.read_pickle("D:/Language Models/test_df_500000")
     self.model = ClassificationModel(model_type,
                                      model_name,
                                      use_cuda=False,
                                      args=model_args)
Example #11
    def train(self, split=0.7, num_epochs=10):
        self.le = preprocessing.LabelEncoder()
        print(list(self.dat.keys()))
        self.le.fit(list(self.dat.keys()))

        train_data = []
        eval_data = []
        for k, v in self.dat.items():
            len_train = int(round(len(v) * split))
            train_data.extend([[i, self.le.transform([k])[0]]
                               for i in v[:len_train]])

            eval_data.extend([[i, self.le.transform([k])[0]]
                              for i in v[len_train:]])

        print(train_data, eval_data)
        train_df = pd.DataFrame(train_data)
        eval_df = pd.DataFrame(eval_data)
        train_args = {
            'overwrite_output_dir': True,
            'num_train_epochs': num_epochs,
        }
        self.model = ClassificationModel(self.model_type,
                                         self.model_name,
                                         num_labels=len(list(self.dat.keys())),
                                         use_cuda=self.use_cuda,
                                         cuda_device=0,
                                         args=train_args)
        # Train the model
        self.model.train_model(train_df, eval_df=eval_df)

        # Evaluate the model
        result, model_outputs, wrong_predictions = self.model.eval_model(
            eval_df, acc=sklearn.metrics.accuracy_score)
Example #12
def generate_prob_matrix(arguments):
	my_args = {
		"max_seq_length": 256,
		"train_batch_size": 16,
		"eval_batch_size": 16,
		"do_lower_case": True,
		"manual_seed": 17
	}

	model = ClassificationModel('bert', "relation_processing/model/bert", use_cuda=False, args=my_args)
	num_arguments = len(arguments)
	prob_matrix = np.zeros((num_arguments, num_arguments))
	for rel_from in range(1, num_arguments):
		for rel_to in arguments[rel_from].compare_list:
			if rel_from == rel_to:
				continue
			logging.info("calculating: " + str(rel_from) + "-->" + str(rel_to))
			
			timer = datetime.now()
			predictions, raw_outputs = model.predict([[arguments[rel_to].sentence, arguments[rel_from].sentence]])
			rel = softmax(raw_outputs, axis=1)
			Stats.h_bert_time += datetime.now() - timer
			Stats.h_bert += 1
			
			logging.debug(rel)
			prob_matrix[rel_to][rel_from] = rel[0][1]
	return prob_matrix
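generate_prob_matrix expects argument objects exposing a .sentence string and a .compare_list of indices to score against; a minimal sketch of such inputs (SimpleNamespace is just a stand-in for the project's argument class):

from types import SimpleNamespace

arguments = [
    SimpleNamespace(sentence="Root claim.", compare_list=[]),
    SimpleNamespace(sentence="First supporting claim.", compare_list=[2]),
    SimpleNamespace(sentence="Second supporting claim.", compare_list=[1]),
]
prob_matrix = generate_prob_matrix(arguments)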
Example #13
    def train(self, train_data: object, eval_data: object) -> object:
        """
        Create and train the chosen model based on the args

        Parameters
        ----------
        train_data : object
            train split of the train_data.
        eval_data : object
            validation split of the train_data.

        Returns
        -------
        object
            model.

        """

        # Create a ClassificationModel
        model = ClassificationModel(
            self.model_name,
            self.model_type,
            args=self.model_args,
            use_cuda=self.cuda,
            num_labels=len(self.labels) - 1,
        )
        # Train the model
        model.train_model(train_df=train_data,
                          eval_df=eval_data,
                          accuracy=accuracy_score)
        return model
Example #14
def test_binary_classification(model_type, model_name):
    # Train and Evaluation data needs to be in a Pandas Dataframe of two columns.
    # The first column is the text with type str, and the second column is the
    # label with type int.
    train_data = [
        ["Example sentence belonging to class 1", 1],
        ["Example sentence belonging to class 0", 0],
    ]
    train_df = pd.DataFrame(train_data)

    eval_data = [
        ["Example eval sentence belonging to class 1", 1],
        ["Example eval sentence belonging to class 0", 0],
    ]
    eval_df = pd.DataFrame(eval_data)

    # Create a ClassificationModel
    model = ClassificationModel(
        model_type,
        model_name,
        use_cuda=False,
        args={
            "reprocess_input_data": True,
            "overwrite_output_dir": True
        },
    )

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
Example #15
def run_trainers(bucket_dir, train_args=None):

    if train_args is None:
        train_args = {}

    os.makedirs('irl_models', exist_ok=True)

    # make sure the progress file exists, then read the completed runs
    if not os.path.isfile('completed_irl.txt'):
        open('completed_irl.txt', 'a').close()
    with open("completed_irl.txt", 'r') as f:
        done = [d.replace('\n', '') for d in f.readlines()]
    for train_file in os.listdir(bucket_dir):
        print(train_file[5:])
        print(done)
        if train_file[5:] not in done:
            train_df = pd.read_csv(bucket_dir + '/' + train_file +
                                   '/data_all.tsv',
                                   sep='\t')
            train_args['output_dir'] = f'irl_models/{train_file[5:]}/'
            train_args['cache_dir'] = f'cache_{train_file[5:]}/'

            train_args.update({'wandb_kwargs': {'name': train_file[5:]}})

            model = ClassificationModel('roberta',
                                        'roberta-base',
                                        args=train_args)
            print(train_df.head())
            model.train_model(train_df)

            with open("completed_irl.txt", 'a') as f:
                f.write(f"{train_file[5:]}\n")
            exit()

    with open("done.runs", 'w') as f:
        f.write(f"Done at {datetime.datetime.now()}")
Example #16
def eval_stance_clf(model_path, src_path, gen_path, **kwargs):
    src = open(src_path, 'r').readlines()
    gen = open(gen_path, 'r').readlines()
    gen = [i.strip() for i in gen]
    src = [i.strip() for i in src]

    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 400,
        'max_seq_length': 300,
        "fp16": False
    }

    model = ClassificationModel('roberta',
                                model_path,
                                num_labels=4,
                                use_cuda=True,
                                cuda_device=0,
                                args=train_args)

    pairs = [[i, j] for i, j in zip(src, gen)]  # avoid shadowing the built-in input()
    predictions, raw_outputs = model.predict(pairs)
    th = Counter(predictions)
    th = sorted(th.items(), key=lambda x: x[0])
    print(th)
Example #17
def transformer(train_df, eval_df, datafile):

    #tokenizer = BertTokenizer.from_pretrained("bert-base-dutch-cased")
    model = ClassificationModel(
        "bert", "bert-base-dutch-cased", use_cuda=False, num_labels=2
    )  # You can set class weights by using the optional weight argument

    # Train the model
    model.train_model(train_df)

    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
    print(model_outputs)

    predlist = []
    model1_outputs = model_outputs.tolist()
    for output in model1_outputs:
        if output[0] > output[1]:
            prediction = 0
        else:
            prediction = 1
        predlist.append(prediction)

    labels = eval_df["labels"].tolist()
    print(labels)
    print(predlist)

    print(classification_report(labels, predlist))
    print(confusion_matrix(labels, predlist))
    print(accuracy_score(labels, predlist))
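The manual comparison loop above is a hand-rolled argmax; a sketch of the equivalent NumPy one-liner, assuming model_outputs is the usual (n_samples, n_classes) array returned by eval_model:

import numpy as np

predlist = np.argmax(model_outputs, axis=1).tolist()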
Example #18
def predict_export(data):

    X = data[args.predict_partition]['text']
    predictions = {}

    for class_name in ['arousal', 'valence', 'topic']:

        if class_name in ['arousal', 'valence']:
            class_no = 3
        else:
            class_no = 10

        trained_model_path = os.path.join('experiments/best_model/',
                                          class_name + str(False))
        model = ClassificationModel(args.model_type,
                                    trained_model_path,
                                    num_labels=class_no)
        predictions['prediction_' + class_name], _ = model.predict(X)

    predictions['id'] = data[args.predict_partition]['id']
    predictions['segment_id'] = data[args.predict_partition]['segment_id']

    df = pd.DataFrame.from_dict(predictions)  # , orient='index' .T
    header_names = [
        'id', 'segment_id', 'prediction_arousal', 'prediction_valence',
        'prediction_topic'
    ]
    df[header_names].to_csv(output_path + args.predict_partition + '.csv',
                            header=header_names,
                            index=False)
Example #19
    def __init__(self,
                 use_cuda=torch.cuda.is_available(),
                 cuda_device=0,
                 batch_size=16):
        self.model_type = "empathy"
        train_args["eval_batch_size"] = batch_size

        model_path = os.path.join(os.path.dirname(__file__), "models/empathy/")
        model_file = os.path.join(os.path.dirname(__file__),
                                  "models/empathy.tar.gz")
        if not os.path.isdir(model_path):
            model = f'{self.model_type}_model'
            if not os.path.isfile(model_file):
                logger.info(
                    f'Model {self.model_type} does not exist at {model_path}. Attempting to download it.'
                )
                fetch_pretrained_model(model, model_file)
            unzip_simple_transformer_model(model, model_path, model_file)

        # Create a ClassificationModel
        self.model = ClassificationModel('roberta',
                                         model_path,
                                         num_labels=1,
                                         use_cuda=use_cuda,
                                         cuda_device=cuda_device,
                                         args=train_args)
Example #20
    def train(self, args={}, cleanFN=CleanText().cleanText):
        self.logger.debug("Train Simpletransformer")
        isCudaAvailable = torch.cuda.is_available()

        if not isCudaAvailable:
            self.logger.warning("Training on CPU!")

        _modelArgs = self.modelArgs(args)

        self.logger.debug("ModelArgs: ")
        self.logger.debug("\n" + pformat(_modelArgs))
        self.loadData(cleanFN, _modelArgs)

        self.model = ClassificationModel(model_type=self.model_type,
                                         model_name=self.model_name,
                                         args=_modelArgs,
                                         use_cuda=isCudaAvailable,
                                         num_labels=2)

        if _modelArgs["lazy_loading"]:
            if not (isinstance(self.trainData, str)
                    and isinstance(self.testData, str)):
                self.logger.error("Lazy loading requires a string to a path.")
                self.logger.error(f"Train-Data-Type: {type(self.trainData)}")
                self.logger.error(f"Test-Data-Type: {type(self.testData)}")
                return None

        return self.model.train_model(train_df=self.trainData,
                                      eval_df=self.testData)
Example #21
class TransformerModel:
    """
    This class provides the Machine Learning model and classifies tenders based on previous training data.
    """
    def load_model(self):
        if not self.model:
            from simpletransformers.classification import ClassificationModel
            try:
                self.model = ClassificationModel('bert',
                                                 './outputs/',
                                                 use_cuda=False,
                                                 args=args)
            except Exception as ex:
                logger.error(
                    f"could not load model from /outputs due to {str(ex)}, creating new model"
                )
                self.create_new_model()

    def __init__(self):
        self.model = None

    def __convert_to_input(self, tenders):
        titles = list(map(lambda x: x.get_title("DE"), tenders))
        return titles

    def classify(self, tenders):
        self.load_model()

        titles = self.__convert_to_input(tenders)
        predictions, raw_output = self.model.predict(titles)
        tuples = zip(tenders, predictions)

        selected_tenders = [t for t, p in tuples if p == 1]
        return selected_tenders

    def train(self, labelled_tenders):
        self.load_model()

        tenders = [i for i, j in labelled_tenders]
        tenders = self.__convert_to_input(tenders)
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))

        self.model.train_model(data_input)

        labels_pred, raw_output = self.model.predict(tenders_test)
        tn, fp, fn, tp = confusion_matrix(labels_test, labels_pred).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")

    def create_new_model(self):
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=False,
                                         args=args)
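load_model and create_new_model both reference a module-level args dict that the snippet does not define; a sketch of plausible contents (all values are assumptions):

args = {
    'output_dir': './outputs/',
    'overwrite_output_dir': True,
    'num_train_epochs': 1,
    'train_batch_size': 16,
    'reprocess_input_data': True,
}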
Example #22
def objective(args):
    pbar.update(1)
    try:
        # cast np values to python and convert list to dict
        args = list(map(int, args[:3])) + list(map(float, args[3:]))
        # NB: 'learning_rate' appears twice below, so the second sampled
        # value silently overwrites the first when the dict is built
        args = dict(
            zip([
                'train_batch_size', 'gradient_accumulation_steps',
                'weight_decay', 'learning_rate', 'learning_rate',
                'adam_epsilon', 'warmup_ratio', 'max_grad_norm'
            ], args))
        args['overwrite_output_dir'] = True
        args['eval_batch_size'] = args['train_batch_size']
        model = ClassificationModel('albert', 'albert-base-v1', num_labels=5)

        # train model, find reverse f1, force garbage collection
        model.train_model(train, args=args)
        result, *_ = model.eval_model(test,
                                      f1=f1_multiclass,
                                      acc=accuracy_score)
        del model
        return 1. - result['f1']
    except Exception:
        print('skip')
        return 1.
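objective() receives its hyperparameters as a flat list (the first three cast to int), which matches a scikit-optimize style driver; a sketch under that assumption, with illustrative search ranges:

from skopt import gp_minimize
from skopt.space import Integer, Real

space = [
    Integer(8, 32),     # train_batch_size
    Integer(1, 4),      # gradient_accumulation_steps
    Integer(0, 1),      # weight_decay (cast to int by objective)
    Real(1e-5, 1e-4),   # learning_rate (first of the duplicated keys)
    Real(1e-5, 1e-4),   # learning_rate (overwrites the value above)
    Real(1e-9, 1e-7),   # adam_epsilon
    Real(0.0, 0.2),     # warmup_ratio
    Real(0.5, 2.0),     # max_grad_norm
]
best = gp_minimize(objective, space, n_calls=20)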
Example #23
 def __init__(self,
              dir_path,
              model_path,
              resources_path,
              use_cuda,
              debugging=False):
     self.dir_path = dir_path
     self.model_path = model_path
     self.resources_path = resources_path
     self.debugging = debugging
     self.mapper = FineGrainedClassifier(self.resources_path)
     if not os.path.exists(model_path):
         print('Model Path not found!')
         return
     #initializing models
     bert_model_path = os.path.join(model_path, "bert_model")
     self.bert_model = ClassificationModel('bert',
                                           bert_model_path,
                                           use_cuda=use_cuda,
                                           args={'from_tf': False})
     print("Initialized BERT model")
     self.svm_est_model = pickle.load(
         open(os.path.join(model_path, 'svm_estimator.sav'), 'rb'))
     print("Initialized SVM model.")
     self.lr_model = pickle.load(
         open(os.path.join(model_path, 'lr.sav'), 'rb'))
     print("Initialized LR model.")
Example #24
def get_evaluation_parameter(model):
    eval_df = pd.read_csv("data/reviews/new_test.csv", header=None)
    eval_df.columns = ["text", "labels"]

    best_model_dir = f'outputs/{model}/best_model'
    model = ClassificationModel(model, best_model_dir)
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
    print('Results:', result)
    print('Outputs:', model_outputs)

    plots = []
    differences = []
    max_difference = 0
    min_difference = 5
    for i in range(len(model_outputs)):
        value = round(abs(model_outputs[i] - eval_df['labels'][i]), 2)
        actual = round(eval_df['labels'][i], 2)
        plots.append([actual, model_outputs[i], value])

        if value > max_difference:
            max_difference = value
        if value < min_difference:
            min_difference = value

        differences.append(value)

    print('Max Difference:', max_difference)  # 3.8447265625
    print('Min Difference:', min_difference)  # 0.0

    parameter = sum(differences) / len(differences)
    print('Parameter:', parameter)  # 0.40202807008058644

    pd.DataFrame(differences).to_csv("test.csv", index=None)
    pd.DataFrame(plots).to_csv("plots.csv", index=None)
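Because model_outputs here are apparently scalar regression scores (the loop treats model_outputs[i] as a number), the difference loop above can be vectorized; a sketch continuing from the same variables:

import numpy as np

diffs = np.round(np.abs(model_outputs - eval_df['labels'].to_numpy()), 2)
print('Max Difference:', diffs.max())
print('Min Difference:', diffs.min())
print('Parameter:', diffs.mean())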
Example #25
def train(train_df, max_sub_len, output_dir):
    model_type = 'distilbert'
    lr = 2e-5
    sent_length = max_sub_len
    OUTPUT_DIR = output_dir \
                 + str(datetime.datetime.now())[:19] + '_' + model_type + '_' + str(sent_length) + '_' + str(lr)
    print("model is saved at: {}".format(OUTPUT_DIR))
    training_config = {
        'output_dir': OUTPUT_DIR,
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'num_train_epochs': 2,
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'learning_rate': lr,
        'max_seq_length': sent_length
    }
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)
    model = ClassificationModel(model_type,
                                'distilbert-base-cased',
                                num_labels=4,
                                args=training_config)
    torch.cuda.empty_cache()
    model.train_model(train_df)
    return model
Example #26
def bert_predictions(tweet: pd.DataFrame, model: ClassificationModel):
    """
    Bert Inference for prediction.
    :param tweet: dataframe with tweets
    :param model: Bert Model
    :return: Counter of predicted labels
    """
    tweet = tweet.values.tolist()
    try:
        predictions, raw_outputs = model.predict(tweet)
    except Exception:
        # fall back to one-by-one prediction to locate the failing element
        for element in tweet:
            model.predict([element])
        print("STOPP")
    auswertung = collections.Counter(predictions)
    gc.collect()

    # df = pd.DataFrame(raw_outputs)
    # df['predictions'] = pd.DataFrame(predictions)
    # df['tweets'] = pd.DataFrame(tweet)
    # df = df.replace(r'\n', ' ', regex=True)
    # df_softmax = pd.DataFrame(softmax(raw_outputs, axis=1))
    # df['softmax0'] = df_softmax[0]
    # df['softmax1'] = df_softmax[1]
    # db_functions.df_to_sql(df, 'temp_table', 'replace')

    return auswertung
Example #27
def fake_classify(train_set, eval_set, test_set, seed):

    # Create a TransformerModel

    model = ClassificationModel('bert',
                                'bert-base-multilingual-uncased',
                                args={
                                    'num_train_epochs': 3,
                                    'overwrite_output_dir': True,
                                    'manual_seed': seed
                                },
                                use_cuda=True)
    print(model.args)
    # Train the model
    model.train_model(train_set)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score)
    #print('Evaluation results = ', results(results))

    # no need to save the model explicitly (e.g. via torch.save);
    # train_model already writes it to the output dir

    return result, model_outputs, wrong_predictions
Example #28
    def __init__(self,
                 model_type: str,
                 model_name_or_path: Union[str, Path],
                 output_dir: Path,
                 class_weights: Optional[List[float]] = None
                 ):
        print('class weights: {}'.format(class_weights))
        self.output_dir = output_dir
        self.cache_dir = output_dir / 'cache/'
        self.tensorboard_dir = output_dir / 'runs/'
        self.best_model_dir = output_dir / 'output/best_model/'

        self.model_type = model_type
        self.model_name_or_path = model_name_or_path

        self.model = ClassificationModel(self.model_type,
                                         str(self.model_name_or_path),
                                         cache_dir='/media/sarthak/HDD/data_science/fnp_resources/pretrained_models/',
                                         args={'fp16': True,
                                               'output_dir': str(self.output_dir),
                                               'cache_dir': str(self.cache_dir),
                                               'tensorboard_dir': str(self.tensorboard_dir),
                                               'best_model_dir': str(self.best_model_dir)},
                                         weight=class_weights
                                         )

        self.class_weights = class_weights
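A usage sketch for the wrapper above; the class name is hypothetical (the snippet shows only its __init__) and the weights are illustrative:

from pathlib import Path

clf = StanceClassifier(             # hypothetical name for the class above
    model_type='roberta',
    model_name_or_path='roberta-base',
    output_dir=Path('outputs'),
    class_weights=[0.3, 0.7],       # up-weight the rarer of two classes
)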
Example #29
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/roberta_finetune_nogptneo",
         best_model_dir='outputs/eval2/best_model_roberta_finetune_nogptneo',
         n_train=240000,
         n_valid=4000,
         n_test=4000,
         n_epochs=10,
         learning_rate=4e-05,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=2000,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):

    # import pdb; pdb.set_trace()
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_df = data_loading.load_split(data_dir,
                                       source_test,
                                       'valid',
                                       n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Optional model configuration
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                model_name="roberta-large",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
Example #30
class TransformerModel(TenderClassClassifier):
    """
    This class provides the Machine Learning model and classifies tenders based on previous training data.
    """
    def __init__(self):
        self.model = None

    def load(self, name):
        self.model = ClassificationModel('bert',
                                         './outputs/',
                                         use_cuda=cuda_available,
                                         args=args)

    def save(self, name):
        pass

    def __convert_to_input(self, tenders):
        titles = list(map(lambda x: x.get_title("DE"), tenders))
        return titles

    def classify(self, tenders):
        titles = self.__convert_to_input(tenders)
        predictions, raw_output = self.model.predict(titles)
        tuples = zip(tenders, predictions)

        selected_tenders = [t for t, p in tuples if p == 1]
        return selected_tenders

    def train(self, labelled_tenders):
        tenders = [i for i, j in labelled_tenders]
        tenders = self.__convert_to_input(tenders)
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))

        start = time.time()
        self.model.train_model(data_input)
        end = time.time()

        print(end - start)

        labels_pred, raw_output = self.model.predict(tenders_test)
        tn, fp, fn, tp = confusion_matrix(labels_test, labels_pred).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")

        logger.info(
            f"Accuracy Score: {accuracy_score(labels_test, labels_pred)}")

    def create_new_model(self):
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=cuda_available,
                                         args=args)