Beispiel #1
0
    def __init__(self):
        """Build the ICD prediction model and load its label dictionary.

        Sets four attributes: ``self.model`` (a BERT multi-label classifier
        created from ``./model/checkpoint-108117-epoch-3/`` with 501 labels,
        CPU only), ``self.text`` (empty string), ``self.output`` (empty dict)
        and ``self.icd_dictionary`` (``./icd_dictionary.csv`` read into a
        DataFrame).  Start and finish messages are printed, the latter with
        the elapsed wall-clock time.
        """
        print('ICD Prediction : Initializing')
        started_at = time.time()

        # NOTE(review): most of these are training hyper-parameters; presumably
        # they mirror how the checkpoint was trained -- confirm they are needed
        # for inference.
        model_args = {
            'train_batch_size': 2,
            'gradient_accumulation_steps': 16,
            'learning_rate': 3e-5,
            'num_train_epochs': 3,
            'max_seq_length': 512,
            'reprocess_input_data': True,
        }
        self.model = MultiLabelClassificationModel(
            'bert',
            "./model/checkpoint-108117-epoch-3/",
            num_labels=501,
            use_cuda=False,
            args=model_args,
        )
        self.text = ''
        self.output = {}
        self.icd_dictionary = pd.read_csv('./icd_dictionary.csv')

        elapsed = time.time() - started_at
        print(
            f'ICD Prediction : Initializing : Finished in {elapsed:7.3f} Seconds\n'
        )
Beispiel #2
0
def main():
    """Fine-tune and evaluate a 3-label BERT multi-label classifier.

    Reads the pickled DataFrame at ``../data/preprocessed_dataset``, builds a
    ``text`` column from ``questionText`` and ``questionTitle`` and a
    ``labels`` column from ``root_multi_label``, splits the rows 70/30 with a
    fixed random state, trains on the larger part and evaluates on the rest.
    Nothing is returned.
    """
    df = pd.read_pickle("../data/preprocessed_dataset")

    df["text"] = (df.questionText + ' ' + df.questionTitle).astype(str)
    df["labels"] = df.root_multi_label

    # defining model
    model = MultiLabelClassificationModel('bert',
                                          'bert-base-uncased',
                                          num_labels=3,
                                          use_cuda=False)

    # processing train and test data for multilabel classification
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=333)
    # drop=True throws the shuffled index away directly.  The previous
    # reset_index() + drop(['index']) pair raised KeyError whenever the
    # pickled frame carried a *named* index, because the materialised column
    # is then not called 'index'.
    train_df = train_df[['text', 'labels']].reset_index(drop=True)
    test_df = test_df[['text', 'labels']].reset_index(drop=True)

    model.train_model(train_df,
                      args={
                          'learning_rate': 1e-4,
                          'num_train_epochs': 10,
                          'reprocess_input_data': True,
                          'overwrite_output_dir': True,
                          "train_batch_size": 14
                      })

    # evaluate; the three results are only bound for inspection, not returned
    result, model_outputs, wrong_predictions = model.eval_model(test_df)
Beispiel #3
0
def evaluate(
    test_pkl,
    model_type,
    model_name,
    model_outputs_path,
    wrong_predictions_path,
):
    """
    Run a fine-tuned multi-label classifier over a pickled test set and pickle its outputs.

    The second and third values returned by ``eval_model`` (model outputs and
    wrong predictions) are written to the two given paths; the first value
    (the metrics) is not used here.  The original documentation states that
    ``eval_model`` itself stores the metrics (LRAP and eval_loss) in an
    ``eval_results.txt`` file in the model directory -- that happens inside
    the library, not in this function.

    Parameters
    ----------
    test_pkl: str
        path to a pickled df with the test data; per the original
        documentation it must contain the columns 'text' and 'labels', the
        labels being multi-hot lists such as [1, 0, 0, 1, 0, 0, 0, 0, 1]
    model_type: str
        type of the pre-trained model, e.g. bert, roberta, electra
    model_name: str
        path to a directory containing the model files
    model_outputs_path: str
        where the pickled model outputs are written
    wrong_predictions_path: str
        where the pickled wrong predictions are written

    Returns
    -------
    None
    """
    frame = pd.read_pickle(test_pkl)

    use_gpu = torch.cuda.is_available()
    if not use_gpu:
        # Replaces the process-wide warning formatter so that only the bare
        # message (plus a newline) is shown.
        warnings.formatwarning = lambda msg, *args, **kwargs: str(msg) + '\n'
        warnings.warn('CUDA device not available; running on a CPU!')

    classifier = MultiLabelClassificationModel(
        model_type,
        model_name,
        use_cuda=use_gpu,
    )

    _, outputs, mistakes = classifier.eval_model(frame)

    # Same order as before: model outputs first, wrong predictions second.
    for destination, payload in (
        (model_outputs_path, outputs),
        (wrong_predictions_path, mistakes),
    ):
        with open(destination, 'wb') as sink:
            pickle.dump(payload, sink)
    def eval(self, data: pd.DataFrame, pretrained_model: str = None):
        """
        **Evaluate the model with sklearn's classification_report
         (multi-label classification).**

        parameters:
        ----------
            :param data: A Pandas DataFrame with at least two columns.
                type: pd.DataFrame.
                "text" column (the input text; per the original documentation,
                    after farasa_tokenization).
                "labels" column (list of binary labels per row) - the ground truth.
                Any index is accepted; the result keeps it.
            :param pretrained_model:
                type: str.
                Location of a pretrained BERT model. If None,
                self.model_args['output_dir'] is used.

        returns:
        --------
            :return: (predict_df, evaluation_report)
            type: A tuple of a dataframe and a string:

                predict_df: A Pandas DataFrame with 3 columns:
                    "text" column : the input text.
                    "ground_truth" column : comma-separated names of the true labels.
                    "predictions" column : comma-separated names of the predicted labels.
                evaluation_report:
                    type: str.
                    Output of sklearn.metrics.classification_report.

        Side effects: replaces self.model with the freshly loaded model and
        prints the report when self.verbose is truthy.
        """
        if pretrained_model is None:
            pretrained_model = self.model_args['output_dir']

        self.model = MultiLabelClassificationModel(
            'bert',
            pretrained_model,
            num_labels=self.num_of_features,
            use_cuda=self.use_cuda)

        # Hand the library a plain list so positional access inside predict()
        # does not depend on the DataFrame's index labels.
        predictions, _ = self.model.predict(data['text'].to_list())
        ground_truth = data['labels'].to_list()
        evaluation_report = classification_report(
            ground_truth,
            predictions,
            target_names=self.emotion_list,
            zero_division=0)
        if self.verbose:
            print(evaluation_report)

        label_suffixes = pd.Index(self.emotion_list) + ', '

        def _as_label_names(rows):
            # Multiplying each 0/1 flag with "<label>, " and summing per row
            # concatenates the names of the active labels.  The frame reuses
            # data.index so the assignment below aligns row by row; with the
            # default RangeIndex used previously, any other index on `data`
            # produced NaN in both text columns.
            flags = pd.DataFrame(rows,
                                 columns=self.emotion_list,
                                 index=data.index)
            return flags.dot(label_suffixes).str.strip(', ')

        predict_df = data.copy()
        predict_df['predictions'] = _as_label_names(predictions)
        predict_df['ground_truth'] = _as_label_names(ground_truth)

        return predict_df[['text', 'ground_truth',
                           'predictions']], evaluation_report
Beispiel #5
0
def predict_df(
    data_pkl,
    model_type,
    model_name,
):
    """
    Add a fine-tuned multi-label model's predictions to a pickled DataFrame.

    The texts are read from the 'text' column of the pickled df.  The
    predictions are stored, one per row, in a new column named
    ``pred_<stem of model_name>`` and the df is pickled back to the same
    path, overwriting the input file.

    Parameters
    ----------
    data_pkl: str
        path to a pickled df with the data, which must contain the column 'text'
    model_type: str
        type of the pre-trained model, e.g. bert, roberta, electra
    model_name: str
        path to a directory containing the model files; its stem names the
        prediction column

    Returns
    -------
    None
    """
    frame = pd.read_pickle(data_pkl)

    gpu_ready = torch.cuda.is_available()
    if not gpu_ready:
        # Replaces the process-wide warning formatter so that only the bare
        # message (plus a newline) is shown.
        warnings.formatwarning = lambda msg, *args, **kwargs: str(msg) + '\n'
        warnings.warn('CUDA device not available; running on a CPU!')

    classifier = MultiLabelClassificationModel(
        model_type,
        model_name,
        use_cuda=gpu_ready,
    )

    print("Generating predictions. This might take a while...")
    predictions, _ = classifier.predict(frame['text'].to_list())

    frame[f"pred_{Path(model_name).stem}"] = predictions

    # Write the enriched frame back over the input pickle.
    frame.to_pickle(data_pkl)
    print(
        f"A column with predictions was added.\nThe updated df is saved: {data_pkl}"
    )
 def load_model(self):
     """Restore the pickled encoder and the XLNet classifier from ``self.model_path``.

     On success ``self.one_hot`` holds the object unpickled from
     ``<model_path>/encoder.pkl`` and ``self.model`` the classifier built
     from ``self.model_path``.  Loading is best effort: any exception is
     printed and swallowed, in which case the attributes not yet assigned
     stay as they were.  Returns None.
     """
     encoder_path = self.model_path + "/encoder.pkl"
     print("loading model from: ", self.model_path)
     try:
         # NOTE: unpickling runs code embedded in the file -- only point
         # model_path at artifacts you trust.
         with open(encoder_path, 'rb') as encoder_file:
             self.one_hot = pickle.load(encoder_file)
         self.model = MultiLabelClassificationModel(
             'xlnet',
             self.model_path,
             use_cuda=self.use_cuda,
         )
     except Exception as load_error:
         print("error:", load_error)
         print("couldn't load models from disk")
Beispiel #7
0
    def train():
        """Train one model inside a Weights & Biases run.

        Takes no arguments; ``model_type``, ``model_name``, ``labels``,
        ``model_args``, ``cuda_available``, ``train_data`` and ``eval_data``
        are read from the enclosing scope.  The run is opened with
        ``wandb.init()`` and closed with ``wandb.join()``.
        """
        wandb.init()

        classifier = MultiLabelClassificationModel(
            model_type,
            model_name,
            num_labels=len(labels),
            use_cuda=cuda_available,
            args=model_args,
        )
        classifier.train_model(train_data, eval_df=eval_data)

        wandb.join()
Beispiel #8
0
    def nn(self, langu, text):
        """Check whether the transformer model confirms the parsed intent for ``text``.

        The label names for ``langu`` come from ``models/<langu>.keys`` (one
        per line) and the model from ``models/<langu>_transformer``; both are
        cached in the module-level dicts ``transformer_keys`` and
        ``transformer_engines`` after first use.

        :param langu: language code used to locate the key file and the model.
        :param text: the utterance to classify.
        :return: True if a label predicted as 1 equals
            ``self.nlu_parsing["intent"]["intentName"]`` (the label and its
            raw score are printed), False if none does, None if anything
            raised (the exception is printed).
        """
        # The caches are only mutated, never rebound, so no `global`
        # declaration is needed.
        try:
            if langu in transformer_keys:
                keys = transformer_keys[langu]
            else:
                # `with` guarantees the handle is closed even when read()
                # raises; the manual open/close pair leaked it in that case.
                with open("models/" + langu + ".keys", "r") as keys_file:
                    keys = keys_file.read().split("\n")
                transformer_keys[langu] = keys
            # The last split element is not counted as a label (presumably
            # the empty string after the file's trailing newline -- confirm).
            label_count = len(keys) - 1

            if langu in transformer_engines:
                model = transformer_engines[langu]
            else:
                model = MultiLabelClassificationModel(
                    'roberta',
                    "models/" + langu + '_transformer',
                    num_labels=label_count,
                    use_cuda=False,
                    args={
                        'reprocess_input_data': True,
                        'overwrite_output_dir': True,
                        'num_train_epochs': 15,
                        "train_batch_size": 16,
                        "eval_batch_size": 16,
                        'no_cache': True,
                        'use_cached_eval_features': False,
                        'save_model_every_epoch': False
                    })
                transformer_engines[langu] = model

            predictions, raw_outputs = model.predict([text])
            for x in range(label_count):
                if predictions[0][x] != 1:
                    continue
                if str(self.nlu_parsing["intent"]["intentName"]) == str(
                        keys[x]):
                    print("Transformer: " + keys[x] + ": " +
                          str(raw_outputs[0][x]))
                    return True

            return False

        except Exception as e:
            # Deliberate best effort: report the problem and signal "unknown".
            print(e)
            return None
Beispiel #9
0
 def __init__(self, enable_cuda):
     """Create the 4-label RoBERTa multi-label model stored under ``model/epoch``.

     :param enable_cuda: forwarded as ``use_cuda`` to the model constructor.

     Besides ``self.model`` the configuration is kept on the instance as
     ``model_type``, ``path``, ``num_labels`` and ``args``.
     """
     self.model_type = 'roberta'
     self.path = 'model/epoch'
     self.num_labels = 4
     self.args = dict(
         reprocess_input_data=True,
         overwrite_output_dir=True,
         num_train_epochs=1,
         fp16=False,
     )
     self.model = MultiLabelClassificationModel(
         model_type=self.model_type,
         model_name=self.path,
         num_labels=self.num_labels,
         args=self.args,
         use_cuda=enable_cuda,
     )
Beispiel #10
0
def create_model(model_class, model_type, model_name, num_labels, weight, args,
                 use_cuda, cuda_device, **kwargs):
    """Instantiate the model class named by ``model_class``.

    ``model_class`` is one of "ClassificationModel",
    "MultiLabelClassificationModel", "QuestionAnsweringModel", "NERModel" or
    "T5Model".  ``num_labels`` and ``weight`` are only forwarded to the two
    classification models.  For "T5Model" the ``args`` argument is ignored
    and replaced by a fresh ``T5Args`` with multiprocessed decoding disabled.
    Extra keyword arguments are passed through to the constructor.

    Raises ValueError for any other ``model_class``.
    """
    if model_class == "ClassificationModel":
        return ClassificationModel(model_type, model_name, num_labels, weight,
                                   args, use_cuda, cuda_device, **kwargs)

    if model_class == "MultiLabelClassificationModel":
        return MultiLabelClassificationModel(model_type, model_name,
                                             num_labels, weight, args,
                                             use_cuda, cuda_device, **kwargs)

    if model_class == "QuestionAnsweringModel":
        return QuestionAnsweringModel(model_type, model_name, args, use_cuda,
                                      cuda_device, **kwargs)

    if model_class == "NERModel":
        return NERModel(model_type,
                        model_name,
                        args=args,
                        use_cuda=use_cuda,
                        cuda_device=cuda_device,
                        **kwargs)

    if model_class == "T5Model":
        # The caller's args are intentionally not used for T5.
        t5_args = T5Args()
        t5_args.use_multiprocessed_decoding = False
        return T5Model(model_type,
                       model_name,
                       args=t5_args,
                       use_cuda=use_cuda,
                       cuda_device=cuda_device,
                       **kwargs)

    raise ValueError(
        "{} is either invalid or not yet implemented.".format(model_class))
Beispiel #11
0
class Epoch:
    """4-label RoBERTa multi-label classifier loaded from ``model/epoch``."""

    def __init__(self, enable_cuda):
        """Load the model; ``enable_cuda`` is forwarded as ``use_cuda``."""
        self.model_type = 'roberta'
        self.path = 'model/epoch'
        self.num_labels = 4
        self.args = dict(
            reprocess_input_data=True,
            overwrite_output_dir=True,
            num_train_epochs=1,
            fp16=False,
        )
        self.model = MultiLabelClassificationModel(
            model_type=self.model_type,
            model_name=self.path,
            num_labels=self.num_labels,
            args=self.args,
            use_cuda=enable_cuda,
        )

    def preprocess(self, text):
        """Join the strings in ``text`` with ' </br> ' between them.

        An empty iterable gives the empty string.
        """
        return ' </br> '.join(text)

    def predict(self, text):
        """Return the model's prediction for the lines in ``text`` joined into one input."""
        joined = self.preprocess(text)
        predictions, _ = self.model.predict([joined])
        return predictions[0]
Beispiel #12
0
    def predict(self, algorithm):
        """Run ``multiple_sentences`` over ``self.df_new`` with the RoBERTa checkpoint and return its result.

        :param algorithm: currently unused (see the TODO below).
        """
        # TODO: once several algorithms are available, select between them
        # here based on `algorithm`.
        model_args = {
            "reprocess_input_data": True,
            'use_cached_eval_features': False,
        }
        classifier = MultiLabelClassificationModel(
            'roberta',
            'checkpoint-17315-epoch-5',
            num_labels=5,
            args=model_args,
            use_cuda=False,
        )
        return multiple_sentences(self.df_new, classifier)
Beispiel #13
0
def TrainModelForMultiLabel(algorithm,
                            base,
                            training_df,
                            num_labels,
                            args,
                            weight=None):
    """Create a multi-label classification model, train it and return it.

    ``algorithm`` and ``base`` are passed as the first two constructor
    arguments, ``weight`` as ``pos_weight``.  The model is trained on
    ``training_df`` before being returned.
    """
    # Imported lazily so that importing this module does not require
    # simpletransformers.
    from simpletransformers.classification import MultiLabelClassificationModel

    trained = MultiLabelClassificationModel(
        algorithm,
        base,
        num_labels=num_labels,
        args=args,
        pos_weight=weight,
    )
    trained.train_model(training_df)
    return trained
Beispiel #14
0
	def __init__(self):
		"""Load the RoBERTa intent classifier and define its class names.

		The model is created from ``../models/checkpoint-3115-epoch_5`` with
		multiprocessing disabled and runs on the GPU whenever torch reports
		one.  ``self.class_name`` lists the 21 ATIS intent names.
		"""
		# True exactly when the previous device-string check selected 'cuda'.
		use_cuda = torch.cuda.is_available()

		self.model = MultiLabelClassificationModel(
			'roberta',
			'../models/checkpoint-3115-epoch_5',
			use_cuda=use_cuda,
			args={"use_multiprocessing": False},
		)
		self.class_name = [
			'atis_flight',
			'atis_flight_time',
			'atis_airfare',
			'atis_aircraft',
			'atis_ground_service',
			'atis_airport',
			'atis_airline',
			'atis_distance',
			'atis_abbreviation',
			'atis_ground_fare',
			'atis_quantity',
			'atis_city',
			'atis_flight_no',
			'atis_capacity',
			'atis_flight#atis_airfare',
			'atis_meal',
			'atis_restriction',
			'atis_airline#atis_flight_no',
			'atis_ground_service#atis_ground_fare',
			'atis_airfare#atis_flight_time',
			'atis_cheapest',
		]
Beispiel #15
0
    def __init__(self,
                 num_labels,
                 train_df=None,
                 pos_weight=None,
                 path_to_model='./path_to_model/',
                 force_retrain=True,
                 num_train_epochs=3):
        """Train (when needed) and load a BERT multi-label classifier.

        :param num_labels: number of labels of the classifier.
        :param train_df: training data, passed through
            ``self.process_data_frame`` before training; only used when a
            model is trained.
        :param pos_weight: optional per-label weights; an empty or missing
            value leaves any ``pos_weight`` already defined on the class in
            effect, otherwise None is used.
        :param path_to_model: directory the model is written to and loaded
            from, with or without a trailing slash.
        :param force_retrain: train even when ``config.json`` already exists
            in ``path_to_model``.
        :param num_train_epochs: number of training epochs.

        A model is trained from ``bert-base-cased`` when ``force_retrain`` is
        true or no ``config.json`` exists in ``path_to_model``.  In every
        case ``self.model`` ends up as the model loaded from
        ``path_to_model``.
        """
        print('Initializing ... ')
        self.num_labels = num_labels
        self.path_to_model = path_to_model
        self.num_train_epochs = num_train_epochs
        # Always define the attribute: previously it was only set for a
        # non-empty argument, so reading it below could raise AttributeError.
        self.pos_weight = (pos_weight if pos_weight else getattr(
            self, 'pos_weight', None))

        # NOTE(review): simpletransformers documents pos_weight as a
        # constructor argument of MultiLabelClassificationModel; confirm that
        # the value placed in `args` is actually picked up.
        model_args = {
            'output_dir': self.path_to_model,
            'reprocess_input_data': True,
            'overwrite_output_dir': True,
            'pos_weight': self.pos_weight,
            'num_train_epochs': self.num_train_epochs
        }

        # os.path.join copes with a missing trailing slash; plain string
        # concatenation looked for e.g. './modelconfig.json' and therefore
        # always retrained.
        config_path = os.path.join(path_to_model, 'config.json')
        if force_retrain or not os.path.exists(config_path):
            train_df = self.process_data_frame(train_df)

            self.model = MultiLabelClassificationModel(
                model_type='bert',
                model_name='bert-base-cased',
                num_labels=self.num_labels,
                args=dict(model_args))
            self.train(train_df)

        # Load (or reload, after training) the model stored in path_to_model.
        self.model = MultiLabelClassificationModel(
            model_type='bert',
            model_name=self.path_to_model,
            num_labels=self.num_labels,
            args=dict(model_args))
        print('Initializing Finished!')
Beispiel #16
0
def train_eval(train_df, eval_df, output_dirp):
    """
    Train a multi-label model and evaluate it, keeping every artifact in one directory.

    Both frames are written to ``output_dirp`` as TSV (``trainset.tsv`` and
    ``testset.tsv``) and read back with the ``labels`` column parsed by
    ``literal_eval``, so training and evaluation use exactly what is on
    disk.  After evaluation the eval frame, extended by a ``y_pred`` column,
    is written to ``testset_with_predictions.tsv`` and the metrics to
    ``result.json``.

    :param train_df: training data
    :param eval_df: evaluation data
    :param output_dirp: directory for datasets, tensorboard logs, cache and results
    :return: tuple of the evaluation result and the model outputs
    """
    print(train_df.head())

    # Model type, name and training arguments come from the project settings;
    # `num_labels` is read from the enclosing module.
    model = MultiLabelClassificationModel(
        settings.MODEL_SETTINGS["model_type"],
        settings.MODEL_SETTINGS["model_name"],
        num_labels=num_labels,
        args=settings.MODEL_SETTINGS["train_args"],
    )

    run_dir = Path(output_dirp)
    run_dir.mkdir(parents=True, exist_ok=True)

    def _tsv_round_trip(frame, file_name):
        # Write the frame and immediately load it again from disk.
        target = run_dir / file_name
        frame.to_csv(target, sep="\t", index=False)
        return pd.read_csv(target, sep="\t", converters={"labels": literal_eval})

    train_df = _tsv_round_trip(train_df, "trainset.tsv")
    eval_df = _tsv_round_trip(eval_df, "testset.tsv")

    # Point every model directory at this run's directory.
    model.args["tensorboard_dir"] = run_dir / "tensorboard/"
    model.args["cache_dir"] = run_dir / "cache/"  # to ensure no weights are shared
    model.args["output_dir"] = output_dirp  # is redundant

    print(f"Training model with args: {model.args}")
    model.train_model(train_df, output_dir=output_dirp)

    result, model_outputs, _ = model.eval_model(eval_df)

    eval_df["y_pred"] = model_outputs.tolist()
    eval_df.to_csv(run_dir / "testset_with_predictions.tsv", sep="\t", index=False)

    with open(run_dir / "result.json", "wt") as result_out:
        json.dump(result, result_out)

    return result, model_outputs
Beispiel #17
0
    def __init__(self,
                 num_labels,
                 train_df,
                 path_to_model='./path_to_model/',
                 force_retrain=False,
                 num_train_epochs=5):
        """Train (when needed) and load a BERT multi-label classifier.

        :param num_labels: number of labels of the classifier.
        :param train_df: training data, passed through
            ``self.process_data_frame`` before training; only used when a
            model is trained.
        :param path_to_model: directory the model is written to and loaded
            from, with or without a trailing slash.
        :param force_retrain: train even when ``config.json`` already exists
            in ``path_to_model``.
        :param num_train_epochs: number of training epochs.

        A model is trained from ``bert-base-cased`` when ``force_retrain`` is
        true or no ``config.json`` exists in ``path_to_model``; the training
        model is then released before the stored model is loaded into
        ``self.model``.
        """
        print('Initializing ... ')
        self.num_labels = num_labels
        self.path_to_model = path_to_model
        self.num_train_epochs = num_train_epochs

        model_args = {
            'output_dir': self.path_to_model,
            'reprocess_input_data': True,
            'overwrite_output_dir': True,
            'num_train_epochs': self.num_train_epochs
        }

        # os.path.join copes with a missing trailing slash; plain string
        # concatenation looked for e.g. './modelconfig.json' and therefore
        # always retrained.
        config_path = os.path.join(path_to_model, 'config.json')
        if force_retrain or not os.path.exists(config_path):
            train_df = self.process_data_frame(train_df)
            print(train_df.head())

            self.model = MultiLabelClassificationModel(
                model_type='bert',
                model_name='bert-base-cased',
                num_labels=self.num_labels,
                args=dict(model_args))
            self.train(train_df)
            # Drop the training model and free cached GPU memory before the
            # stored model is loaded below.
            del self.model
            gc.collect()
            torch.cuda.empty_cache()

        self.model = MultiLabelClassificationModel(
            model_type='bert',
            model_name=self.path_to_model,
            num_labels=self.num_labels,
            args=dict(model_args))
        print('Initializing Finished!')
Beispiel #18
0
    def train_model(self, training_data, training_args, base_model_path, test_data):
        """Train an 8-label BERT multi-label classifier and return it.

        :param training_data: iterable of samples; each is converted with
            ``self.reader.assign_label`` into a (text, labels) pair.
        :param training_args: forwarded to the model as ``args``.
        :param base_model_path: model name or path to start from.
        :param test_data: iterable of samples used as the evaluation set
            during training, converted the same way.
        :return: the trained model.
        """
        model = MultiLabelClassificationModel(
            "bert",
            base_model_path,
            num_labels=8,
            use_cuda=False,  # Highly recommended to set use_cuda = True to utilize a GPU (if available) for training
            args=training_args,
        )

        columns = ['text', 'labels']
        train_rows = [self.reader.assign_label(s) for s in training_data]
        eval_rows = [self.reader.assign_label(s) for s in test_data]

        # The separate text/label lists built here before were never used.
        eval_df = pd.DataFrame(eval_rows, columns=columns)
        train_df = pd.DataFrame(train_rows, columns=columns)

        # f1_evaluate is reported by the library as the extra metric "f1_macro".
        model.train_model(train_df, eval_df=eval_df, f1_macro=f1_evaluate)

        return model
def load_model(args, task="mlc", name="roberta", from_path="roberta-base"):
    """
    Build the model for the requested task.

    "mlm" gives a LanguageModelingModel, "mlc" a 3-label
    MultiLabelClassificationModel; both are created from ``name`` and
    ``from_path`` with ``args``.  Any other task raises NotImplementedError.
    """
    if task == "mlc":
        return MultiLabelClassificationModel(name, from_path, num_labels=3, args=args)

    if task == "mlm":
        return LanguageModelingModel(name, from_path, args=args)

    raise NotImplementedError(
        "Choose 'mlm' for Masked Language Modeling or 'mlc' for Multilabel Classification!"
    )
def test_multilabel_classification(model_type, model_name):
    """Smoke-test training, evaluation and prediction of a 6-label multi-label model.

    The data is two tiny hand-written frames of text and multi-hot label
    lists.  The training frame names its columns 'text' and 'labels'; the
    evaluation frame is deliberately left with default column names.  The
    test passes when nothing raises.
    """
    train_df = pd.DataFrame(
        [
            ["Example sentence 1 for multilabel classification.", [1, 1, 1, 1, 0, 1]],
            ["This is another example sentence. ", [0, 1, 1, 0, 0, 0]],
        ],
        columns=["text", "labels"],
    )
    eval_df = pd.DataFrame(
        [
            ["Example eval sentence for multilabel classification.", [1, 1, 1, 1, 0, 1]],
            ["Example eval senntence belonging to class 2", [0, 1, 1, 0, 0, 0]],
        ]
    )

    # One epoch, nothing saved: just enough to exercise the code paths.
    model_args = {
        "no_save": True,
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "num_train_epochs": 1,
    }
    model = MultiLabelClassificationModel(
        model_type,
        model_name,
        num_labels=6,
        args=model_args,
        use_cuda=False,
    )

    model.train_model(train_df)

    result, model_outputs, wrong_predictions = model.eval_model(eval_df)

    predictions, raw_outputs = model.predict(["This thing is entirely different from the other thing. "])
 def fit(self, text_series, labels):
     """Train the multi-label model on the given texts and labels.

     ``labels`` is first converted by ``self.one_hot_encoder(labels, True)``;
     the width of the result gives the number of labels.  The trained model
     is kept in ``self.model``, the encoder is saved through
     ``self.save_enoder()`` and the output directory is returned.
     """
     encoded = self.one_hot_encoder(labels, True)
     print("fitting model, artifacts will be saved to: ", self.output_path)

     self.model = MultiLabelClassificationModel(
         self.model_type,
         self.model_path,
         num_labels=np.array(encoded).shape[1],
         use_cuda=self.use_cuda,
         args={
             'reprocess_input_data': True,
             'output_dir': self.output_path,
             'overwrite_output_dir': True,
             'num_train_epochs': self.ephocs,
             'save_steps': 0,
         },
     )
     self.model.train_model(
         pd.DataFrame({"text": text_series, "labels": encoded.tolist()}))

     self.save_enoder()
     return self.output_path
Beispiel #22
0
 def __init__(self):
     """Create the RoBERTa multi-label model and its BCE-with-logits loss.

     The number of labels is the length of the first entry of ``df.label``;
     ``MODEL`` and ``df`` are read from the enclosing module.  The model is
     always created with ``use_cuda=True``.
     """
     super().__init__()

     training_args = {
         'reprocess_input_data': True,
         'overwrite_output_dir': True,
         'num_train_epochs': 5,
     }
     self.model = MultiLabelClassificationModel(
         'roberta',
         MODEL,
         use_cuda=True,
         num_labels=len(df.label.iloc[0]),
         args=training_args,
     )
     self.loss = torch.nn.BCEWithLogitsLoss()
Beispiel #23
0
def run_experiment(model_name, pretrain_name):
    """Train a 2-label multi-label model and write per-split prediction CSVs.

    The model is trained on the module-level ``train_df``.  The train and
    test splits are always scored; the predict split only when
    ``model_name`` is "roberta".  Each scored frame gets a ``predictions``
    column and is written to
    ``<data_path>/<split>_<model_name>_pred.csv``.
    """
    training_args = {
        "train_batch_size": 8,
        "eval_batch_size": 8,
        "num_train_epochs": 3,
        "learning_rate": 1e-5,
        "warmup_ratio": 0.1,
        "warmup_steps": 100,
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
    }
    model = MultiLabelClassificationModel(model_name, pretrain_name, num_labels=2, args=training_args)

    model.train_model(train_df)

    frames_by_split = {"train": train_df, "test": test_df, "predict": predict_df}
    for split, df in frames_by_split.items():
        if split == "predict" and model_name != "roberta":
            continue
        # NOTE(review): the *second* value returned by predict() is stored as
        # "predictions"; confirm that the raw outputs are what is wanted.
        _, predictions = model.predict(df["text"])
        df["predictions"] = [list(p) for p in predictions]
        df.to_csv(os.path.join(data_path, split + "_" + model_name + "_pred.csv"), index=False)
Beispiel #24
0
def build_or_load_model(model_name, frame_type, seed):
    """Return the classifier selected by ``model_name``.

    Supported names:

    * ``dummy_random``, ``dummy_stratified``, ``dummy_frequent`` -- a
      ``DummyClassifier`` with the matching strategy.
    * ``logreg_unigram``, ``logreg_bigram`` -- a ``MultiOutputClassifier``
      around a saga ``LogisticRegression`` (both names build the same
      estimator here).
    * ``roberta_finetune``, ``roberta_baseline`` -- a fine-tuned RoBERTa
      model loaded from the shared model directory for ``frame_type`` and
      ``seed``, on CUDA device 1.

    ``seed`` is used as the random state of the sklearn estimators and as
    part of the RoBERTa directory name.

    Raises ValueError for any other ``model_name``; previously that case
    failed with UnboundLocalError at the final return.
    """
    dummy_strategies = {
        'dummy_random': 'uniform',
        'dummy_stratified': 'stratified',
        'dummy_frequent': 'most_frequent',
    }
    # Directory name of each fine-tuned RoBERTa run, below <base>/<frame_type>/.
    roberta_runs = {
        'roberta_finetune': '11-03-20_60_epochs_default_thresh_{seed}_seed',
        'roberta_baseline':
        'roberta_baseline_11-05-20_60_epochs_default_thresh_{seed}_seed',
    }
    # The 'roberta_all' variant was disabled in the original code and is not
    # offered here either.

    if model_name in dummy_strategies:
        return DummyClassifier(strategy=dummy_strategies[model_name],
                               random_state=seed)

    if model_name in ('logreg_unigram', 'logreg_bigram'):
        return MultiOutputClassifier(
            LogisticRegression(solver='saga', random_state=seed))

    if model_name in roberta_runs:
        model_path = os.path.join(
            '/shared/2/projects/framing/models/classify/', frame_type,
            roberta_runs[model_name].format(seed=seed))
        return MultiLabelClassificationModel(
            'roberta',
            model_path,
            cuda_device=1,
            args={"reprocess_input_data": True, 'no_cache': True})

    known = sorted(
        list(dummy_strategies) + ['logreg_unigram', 'logreg_bigram'] +
        list(roberta_runs))
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected one of {known}")
Beispiel #25
0
def return_out(url, justLoad=False):
    """Predict the flair of a Reddit submission and return it with the actual flair.

    On the first call the DistilBERT model is created from the
    ``model_weights`` directory next to this file, its weights are replaced
    by the checkpoint at ``model_path`` and the model is cached in the
    module-level ``model_`` (``loaded`` records that).

    :param url: URL of the Reddit submission.
    :param justLoad: when true and the model had to be loaded by this call,
        return -1 right after loading.
    :return: ``(actual, predicted)`` -- the submission's flair text and the
        ``enc`` entry selected by the model -- or -1 as described above.
    """
    global loaded
    global model_
    if not loaded:
        weights_dir = str(os.path.dirname(__file__)) + '/model_weights/'
        model_ = MultiLabelClassificationModel('distilbert',
                                               weights_dir,
                                               use_cuda=False,
                                               num_labels=12,
                                               args=args)
        state = torch.load(model_path, map_location='cpu')
        model_.model.load_state_dict(state)
        model_.model.eval()
        loaded = True
        print("\n\nModel Loaded\n\n")
        # NOTE(review): justLoad is only honoured on the call that loads the
        # model; with the model already loaded the submission is fetched
        # anyway -- confirm this is intended.
        if justLoad:
            return -1

    reddit = praw.Reddit("     hidden credentials     ")
    print(1)
    submission = reddit.submission(url=url)
    # Attributes are read in the same order as before (selftext included,
    # although its value is not used afterwards).
    title, link, _selftext, flair = (
        submission.title,
        submission.url,
        submission.selftext,
        submission.link_flair_text,
    )
    print("\n", title, "\n")

    link = processURL(link)

    print("\n", link, "\n")

    # Index of the largest value in the second result returned by predict().
    best_index = np.argmax(model_.predict([title + ' ' + link])[1])

    print("\n Preds Loaded \n")

    # enc is looked up with an offset of one.
    return flair, enc[best_index + 1]
def load_model(args, task='mlc', name='roberta', from_path='roberta-base'):
    '''
        Build the model for the requested task.

        'mlm' gives a LanguageModelingModel, 'mlc' a 3-label
        MultiLabelClassificationModel; both are created from `name` and
        `from_path` with `args`. Any other task raises NotImplementedError.
    '''
    if task not in ('mlm', 'mlc'):
        raise NotImplementedError(
            "Choose 'mlm' for Masked Language Modeling or 'mlc' for Multilabel Classification!"
        )

    if task == 'mlm':
        return LanguageModelingModel(name, from_path, args=args)

    return MultiLabelClassificationModel(name,
                                         from_path,
                                         num_labels=3,
                                         args=args)
Beispiel #27
0
    def load_pretrained_model(self, name="roberta", from_path="roberta-base"):
        """
        Load a fine-tuned 3-label multi-label model together with its saved arguments.

        args :
            name      : model type passed to the model constructor.
            from_path : directory holding the model and its
                        ``model_args.json``; one trailing "/" is ignored.

        CUDA is used only when ``self.device`` equals ``torch.device("cuda")``.
        """
        model_dir = from_path[:-1] if from_path.endswith("/") else from_path

        with open(f"{model_dir}/model_args.json") as args_file:
            stored_args = json.load(args_file)

        on_gpu = self.device == torch.device("cuda")
        return MultiLabelClassificationModel(
            name,
            model_dir,
            num_labels=3,
            args=stored_args,
            use_cuda=on_gpu,
        )
Beispiel #28
0
# Script: score the manually labelled GSM data with a fine-tuned RoBERTa
# multi-label model and write the raw model outputs to a CSV file.
# The progress message is printed before the (slow) imports on purpose.
print("Importing packages...")
from simpletransformers.classification import MultiLabelClassificationModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Training hyper-parameters; none of them is referenced in the visible part
# of this script.
# NOTE(review): the checkpoint path and the output file name below say
# "epoch69" / "69epochs" while EPOCHS is 50 -- confirm which one is current.
EPOCHS = 50
LEARNING_RATE = 2e-4
BATCH_SIZE = 10
LR_STRING = "2e-4"

# Input data; the 'Input' column holds the texts to classify.
file = pd.read_csv("Data/manually_labelled_gsm_data.csv")

print("Importing model...")

# Load the fine-tuned checkpoint; use_cuda=True requires a GPU.
model = MultiLabelClassificationModel(
    'roberta',
    '../Experiment_2/BERT_OUTPUTS/Learning_rate_2e-4/Output_ENCODE/epoch69/',
    use_cuda=True)

print("Predicting for testset...")
# Only the second value returned by predict() (the raw outputs) is kept.
_, test_raw_outputs = model.predict(file['Input'])

# One row per input, written together with the default integer index.
test_outputs = pd.DataFrame(test_raw_outputs)
test_outputs.to_csv(
    f"Results/manually_labelled_gsm_results_RoBERTa_69epochs.csv")
Beispiel #29
0
        return (np.asarray(data) * self.stddev) + self.means


# Load the fitted scaler used to map model outputs back to coordinates.
# The context manager closes the file, which the bare open() call left open.
# NOTE: unpickling runs code embedded in the file -- only load a scaler you
# trust.
with open('bcms.scaler', 'rb') as scaler_file:
    scl = pickle.load(scaler_file)

# Setting optional model configuration
model_args = {
    "regression": True,
    "do_lower_case": True,
    "eval_batch_size": 64,
}

# Create a MultiLabelClassificationModel used as a two-output regressor
# (regression=True, MAE loss).
model = MultiLabelClassificationModel(
    "electra",
    "CLASSLA/bcms-bertic-geo",
    num_labels=2,
    loss_fct="MAELoss",
    args=model_args,
)

text = [
    'Kaj si rekel', 'Ne mogu to da uradim', 'Sjutra idemo na more',
    'Skuvaj kahvu, bona!'
]
# predict() returns a pair; the scaler is applied to the whole pair and the
# first element of the result is kept.
pred = model.predict(text)
pred_inv = scl.inverse_transform(pred)[0]
# Look up a place for every predicted value pair.
pred_rev = reverse_geocode.search(pred_inv)
for t, c, r in zip(text, pred_inv, pred_rev):
    print(t, c, r)
  print(len(l))

# Assemble the working set from fixed slices of the four lists in
# labeled_data (15000 + 30000 + 30000 + 30000 items at most).
big_data = labeled_data[0][:15000] + labeled_data[1][10000:40000] + labeled_data[2][30000:60000] + labeled_data[3][:30000]

# Shuffle in place so the split below mixes all four sources.
shuffle(big_data)

# 80 % of the items for training, the remaining 20 % for testing.
train_data = big_data[:int(0.8*len(big_data))]
test_data = big_data[int(0.8*len(big_data)):]

# The training frame gets named columns; the evaluation frame keeps the
# default integer column names.
train_df = pd.DataFrame(train_data, columns=["text", "labels"])
eval_df = pd.DataFrame(test_data)

# Load the model: RoBERTa base with four labels, five epochs, no fp16
model = MultiLabelClassificationModel(
    "roberta",
    "roberta-base",
    num_labels = 4,
    args={"reprocess_input_data": True, "overwrite_output_dir": True, "num_train_epochs": 5, 'fp16': False},
)

# Train model
model.train_model(train_df)

# The held-out items are processed further below under the name `sub`.
sub = test_data

# Accumulators filled by the loop that follows this section.
test_label = []
test_text = []
predictions = []

for i in range(len(sub)):
  test_text.append(sub[i][0])
  test_label.append(sub[i][1])