Example #1
def predict_camembert(df: pd.DataFrame) -> pd.DataFrame:
    """
    predict the sentiment of reviews
    :param df: dataframe with reviews
    :return: dataframe: dataframe with prediction of reviews
    """
    df['space'] = ' '
    df['comments'] = df[['titre', 'space', 'comment']].fillna('').sum(axis=1)
    df = df.dropna(subset=['comments'])
    comments = df['comments'].to_list()
    # camemBERT
    state_dict = torch.load("camemBERT_38000_state_dict.pt",
                            map_location=torch.device('cpu'))
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base', num_labels=2, state_dict=state_dict)
    model.to(device)
    model.eval()

    # Initialize CamemBERT tokenizer
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base',
                                                   do_lower_case=True)

    # Encode the comments
    tokenized_comments_ids = [
        tokenizer.encode(comment, add_special_tokens=True,
                         max_length=MAX_LEN, truncation=True)
        for comment in comments
    ]
    # Pad the resulted encoded comments
    tokenized_comments_ids = pad_sequences(tokenized_comments_ids,
                                           maxlen=MAX_LEN,
                                           dtype="long",
                                           truncating="post",
                                           padding="post")

    # Create attention masks
    attention_masks = []
    for seq in tokenized_comments_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    prediction_inputs = torch.tensor(tokenized_comments_ids)
    prediction_masks = torch.tensor(attention_masks)

    predictions = []
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(prediction_inputs.to(device),
                        token_type_ids=None,
                        attention_mask=prediction_masks.to(device))
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.extend(np.argmax(logits, axis=1).flatten())

    # Build the output frame; site, date and review are carried over from the
    # input dataframe.
    df = pd.DataFrame(
        data={
            "site": df["site"],
            "date": df["date"],
            "review": df["review"],
            "sentiment": predictions
        })
    return df
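Note that Example #1 pushes every encoded comment through the model in a single forward pass, which can exhaust memory on large review sets. A minimal batched variant (a sketch: the batch size is an assumption, and it reuses model, device, prediction_inputs and prediction_masks from the function above):

from torch.utils.data import DataLoader, TensorDataset

batch_size = 32  # assumed value; tune to the available memory
dataset = TensorDataset(prediction_inputs, prediction_masks)
predictions = []
with torch.no_grad():
    for input_ids, masks in DataLoader(dataset, batch_size=batch_size):
        logits = model(input_ids.to(device),
                       attention_mask=masks.to(device))[0]
        predictions.extend(np.argmax(logits.cpu().numpy(), axis=1))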
Example #2
    def __init__(self, DIR, filename):
        self.path = os.path.join(DIR, filename)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.tokenizer = CamembertTokenizer.from_pretrained(
            config["BERT_MODEL"])
        classifier = CamembertForSequenceClassification.from_pretrained(
            config["BERT_MODEL"], num_labels=len(config["CLASS_NAMES"]))
        classifier.load_state_dict(
            torch.load(self.path, map_location=self.device))
        self.classifier = classifier.eval().to(self.device)
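With the tokenizer and classifier stored on the instance, inference fits in a short method. A minimal sketch (the predict method, its max_length default, and the lookup into config["CLASS_NAMES"] are assumptions; the callable tokenizer API requires transformers 3.x or later):

    def predict(self, text, max_length=128):
        # Encode one text and classify it on the stored device (assumed helper).
        encoded = self.tokenizer(text,
                                 truncation=True,
                                 max_length=max_length,
                                 return_tensors="pt").to(self.device)
        with torch.no_grad():
            logits = self.classifier(**encoded)[0]
        return config["CLASS_NAMES"][logits.argmax(dim=1).item()]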
Example #3
    def init_nlp(self, model_path="model_nlp.pt"):
        # Load the French spaCy pipeline, downloading it on first use.
        try:
            nlp = spacy.load("fr_core_news_sm")
        except OSError:
            os.system("python -m spacy download fr_core_news_sm")
            nlp = spacy.load("fr_core_news_sm")

        # Load the fine-tuned camemBERT weights into a classification head.
        state_dict = torch.load(model_path, map_location=torch.device('cpu'))
        model = CamembertForSequenceClassification.from_pretrained(
            'camembert-base', num_labels=2, state_dict=state_dict)

        # Load the matching camemBERT tokenizer.
        tokenizer = CamembertTokenizer.from_pretrained('camembert-base',
                                                       do_lower_case=True)
        return model, tokenizer
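A short usage sketch for init_nlp (the analyzer instance, the French sample text, and the positive=1 label order are assumptions, not shown in the excerpt):

model, tokenizer = analyzer.init_nlp("model_nlp.pt")  # analyzer is a hypothetical instance
inputs = tokenizer("Ce produit est excellent !", return_tensors="pt",
                   truncation=True, max_length=128)
with torch.no_grad():
    logits = model(**inputs)[0]
print("positive" if logits.argmax(dim=1).item() == 1 else "negative")  # label order assumed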
Example #4
batch_size = 32

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size)

validation_dataloader = DataLoader(
    validation_dataset,
    sampler=SequentialSampler(validation_dataset),
    batch_size=batch_size)

#%%
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',
    num_labels=2)
#%%
optimizer = AdamW(model.parameters(),
                  lr=2e-5,   # learning rate, kept small to avoid catastrophic forgetting
                  eps=1e-8)  # epsilon
epochs = 3

# Store the tensors on the CPU: no GPU is available here
device = torch.device("cpu")

# Record the stats at each epoch
training_stats = []
#%%
for epoch in range(0, epochs):
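The snippet breaks off at the epoch loop. A minimal sketch of one possible body, assuming each batch from train_dataloader is an (input_ids, attention_mask, labels) tuple (the excerpt does not show how train_dataset is built):

for epoch in range(0, epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]  # with labels, the first output is the loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    training_stats.append({"epoch": epoch,
                           "train_loss": total_loss / len(train_dataloader)})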
Example #5
        input_ids, labels, attention_masks, token_type_ids, lineNumbers, file_names = \
            input_ids_tmp, labels_tmp, attention_masks_tmp, token_type_ids_tmp, lineNumbers_tmp, file_names_tmp

    else:

        input_ids = np.append(input_ids, input_ids_tmp, axis=0)
        labels = np.append(labels, labels_tmp, axis=0)
        attention_masks = np.append(attention_masks, attention_masks_tmp, axis=0)
        token_type_ids = np.append(token_type_ids, token_type_ids_tmp, axis=0)
        lineNumbers = np.append(lineNumbers, lineNumbers_tmp, axis=0)
        file_names = np.append(file_names, file_names_tmp, axis=0)

print('Loading BERT model...')

if curr_lang == 'French':
    model = CamembertForSequenceClassification.from_pretrained(output_dir)
else:
    model = BertForSequenceClassification.from_pretrained(output_dir)


if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
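The chosen device still has to receive the model before scoring. A short continuation sketch (it reuses the model and the concatenated input_ids / attention_masks arrays from above; the 32-row slice is an arbitrary assumption):

model.to(device)
model.eval()

batch_ids = torch.tensor(input_ids[:32]).to(device)
batch_masks = torch.tensor(attention_masks[:32]).to(device)
with torch.no_grad():
    logits = model(batch_ids, attention_mask=batch_masks)[0]
predicted_labels = logits.argmax(dim=1).cpu().numpy()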
Example #6
    # Create the DataLoader for our validation set.
    validation_data = TensorDataset(validation_inputs, validation_masks,
                                    validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data,
                                       sampler=validation_sampler,
                                       batch_size=batch_size)

    ##### Load the model and begin the training
    print('loading the model...')
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base',  # the pretrained French camemBERT checkpoint
        num_labels=len(index2canonical),  # number of output labels for multi-class classification
        output_attentions=False,  # whether the model returns attention weights
        output_hidden_states=False,  # whether the model returns all hidden states
    )
    # train the model on GPU
    model.cuda()

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon  - default is 1e-8.
    )

    # Number of training epochs
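    The excerpt ends at the epoch count. A typical next step with this optimizer (a sketch; the epoch value and the train_dataloader name are assumptions) is to add the linear warmup schedule from the same library:

    from transformers import get_linear_schedule_with_warmup

    epochs = 3  # assumed; the excerpt cuts off before setting it
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)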
Example #7
    def load_class(self):
        # Load the model class.
        if self.verbose:
            print('Loading {} class...'.format(self.model_name))
        if self.model_name == 'bert':
            # Load BertForSequenceClassification: the pretrained BERT model
            # with a single linear classification layer on top.
            self.model = BertForSequenceClassification.from_pretrained(
                self.model_type,  # e.g. the 12-layer BERT model with an uncased vocab
                num_labels=self.num_labels,  # increase this for multi-class tasks
                output_attentions=False,  # whether the model returns attention weights
                output_hidden_states=False,  # whether the model returns all hidden states
            )
        elif self.model_name == 'distilbert':
            self.model = DistilBertForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        elif self.model_name == 'albert':
            self.model = AlbertForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        elif self.model_name == 'bart':
            if self.task == 'classification':
                self.model = BartForSequenceClassification.from_pretrained(
                    self.model_type,
                    num_labels=self.num_labels,
                    output_attentions=False,
                    output_hidden_states=False,
                )
            elif self.task == 'summarize':
                self.model = BartForConditionalGeneration.from_pretrained(
                    self.model_type)

        elif self.model_name == 'xlnet':
            self.model = XLNetForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        elif self.model_name == 'roberta':
            self.model = RobertaForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        elif self.model_name == 'camenbert':
            self.model = CamembertForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        elif self.model_name == 'flaubert':
            self.model = FlaubertForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        elif self.model_name == 'gpt2':
            self.model = GPT2LMHeadModel.from_pretrained(self.model_type)
Example #8
    def __init__(self, bert_model, num_classes):
        super(Umberto, self).__init__()
        self.encoder = CamembertForSequenceClassification.from_pretrained(
            bert_model, num_labels=num_classes)
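The wrapper stores only the encoder; a forward pass would delegate to it. A minimal sketch (the forward signature is an assumption, not shown in the excerpt):

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Delegate to the wrapped camemBERT classifier: returns (loss, logits)
        # when labels are provided, otherwise just the logits.
        return self.encoder(input_ids,
                            attention_mask=attention_mask,
                            labels=labels)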