Example no. 1
    def __init__(self, device):
        super(RobertaTweetEmbedding, self).__init__(device=device)
        self.config = RobertaConfig.from_pretrained(
            '../data/models/BERTweet_base_transformers/config.json')
        self.model = RobertaModel.from_pretrained(
            '../data/models/BERTweet_base_transformers/model.bin',
            config=self.config)
        self.model.eval()  # disable dropout (or leave in train mode to finetune)
        self.model.to(self.device)
        self.pad_token_id = self.config.pad_token_id
        self.embedding_dim = self.model.config.hidden_size

        # Load BPE encoder
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '--bpe-codes',
            default="../data/models/BERTweet_base_transformers/bpe.codes",
            required=False,
            type=str,
            help='path to fastBPE BPE')
        args = parser.parse_args()
        self.bpe = fastBPE(args)

        # Load the dictionary
        self.vocab = Dictionary()
        self.vocab.add_from_file(
            "../data/models/BERTweet_base_transformers/dict.txt")
Example no. 2
    def __init__(self, config: Bunch) -> None:
        pl.LightningModule.__init__(self)
        self.config = config

        bpe_codes_path = os.path.join(
            config.pretrained_model_base_path,
            "BERTweet_base_transformers/bpe.codes",
        )
        bpe = fastBPE(Namespace(bpe_codes=bpe_codes_path))
        vocab = Dictionary()
        vocab.add_from_file(
            os.path.join(
                config.pretrained_model_base_path,
                "BERTweet_base_transformers/dict.txt",
            ))

        tokenizer = BertweetTokenizer(self.config.max_tokens_per_tweet, bpe,
                                      vocab)
        self.data_processor = BertweetDataProcessor(config, tokenizer)

        model_config = RobertaConfig.from_pretrained(
            os.path.join(
                config.pretrained_model_base_path,
                "BERTweet_base_transformers/config.json",
            ))
        self.model = RobertaForSequenceClassification.from_pretrained(
            os.path.join(
                config.pretrained_model_base_path,
                "BERTweet_base_transformers/model.bin",
            ),
            config=model_config,
        )
        self.loss = CrossEntropyLoss()
Example no. 3
def mark(line, masked_line):
    # Load PhoBERT-base in fairseq
    # from fairseq.models.roberta import RobertaModel
    # phobert_mask = RobertaModel.from_pretrained('PhoBERT_base_fairseq', checkpoint_file='model.pt')
    # phobert_mask.eval()  # disable dropout (or leave in train mode to finetune)

    # Incorporate the BPE encoder into PhoBERT-base
    args = parser_mask.parse_args()
    phobert_mask.bpe = fastBPE(args)  #Incorporate the BPE encoder into PhoBERT

    # INPUT TEXT IS WORD-SEGMENTED!
    # line = "Tôi là sinh_viên trường đại_học Công_nghệ ."

    # Extract the last layer's features
    subwords = phobert_mask.encode(line)
    last_layer_features = phobert_mask.extract_features(subwords)
    # assert last_layer_features.size() == torch.Size([1, 9, 768])

    # Extract all layers' features (layer 0 is the embedding layer)
    all_layers = phobert_mask.extract_features(subwords,
                                               return_all_hiddens=True)
    assert len(all_layers) == 13
    assert torch.all(all_layers[-1] == last_layer_features)

    # Fill in the masked token(s)
    # masked_line = 'Tôi là  <mask> trường đại_học Công_nghệ .'
    topk_filled_outputs = phobert_mask.fill_mask(masked_line, topk=1)
    return topk_filled_outputs
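
# Hedged usage sketch: assumes `parser_mask` and `phobert_mask` have been initialized at module level
# as the commented lines above suggest. fairseq's fill_mask returns a list of
# (filled_sentence, score, predicted_token) tuples.
predictions = mark("Tôi là sinh_viên trường đại_học Công_nghệ .",
                   "Tôi là <mask> trường đại_học Công_nghệ .")
print(predictions[0])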
Example no. 4
    def __init__(self, data_dir, max_length=150, remove_negative_pair=True):
        super(VNNewsDataset, self).__init__()
        self.data_dir = data_dir
        self.max_length = max_length

        def _read_lines(filename):
            # read a UTF-8 (BOM-tolerant) file and split it into lines, closing the handle afterwards
            with open(os.path.join(self.data_dir, filename),
                      mode='r', encoding='utf-8-sig') as f:
                return f.read().split('\n')

        self.sentence_1 = _read_lines('Sentences_1.txt')
        self.sentence_2 = _read_lines('Sentences_2.txt')
        self.labels = _read_lines('Labels.txt')

        self.bpe = fastBPE(BPEConfig)
        self.vocab = Dictionary()
        self.vocab.add_from_file(
            os.path.join(os.getcwd(), '../pretrained',
                         'PhoBERT_base_transformers', 'dict.txt'))
        self.rdr_segmenter = VnCoreNLP(os.path.join('../vncorenlp',
                                                    'VnCoreNLP-1.1.1.jar'),
                                       annotators='wseg',
                                       max_heap_size='-Xmx500m')

        if remove_negative_pair:
            self.remove_negative_pair()
Example no. 5
 def __init__(self, vncore=True):
     """
     Hacky way to run VnCoreNLP tokenizer with PhoBERT
     :param vncore: Set it to `False` if your sentences are already tokenized by VnCoreNLP
     """
     self.dictionary = Dictionary.load(open(DICT_PATH))
     self.annotator = None
     self.vncore = vncore
     self.bpe = fastBPE(args)
Example no. 6
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False)

    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="/content/PhoBERT_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args1, unknown = parser.parse_known_args()
    bpe = fastBPE(args1)

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file("/content/PhoBERT_base_transformers/dict.txt")

    tokenizer = bpe
    symbols = {
        'BOS': vocab.indices['[unused0]'],
        'EOS': vocab.indices['[unused1]'],
        'PAD': vocab.indices['[PAD]'],
        'EOQ': vocab.indices['[unused2]']
    }

    valid_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Example no. 7
    def __init__(self, pretrained_path='../input/bertweet-transformer-private/', parser=parser):
        self.bpe = fastBPE(args=parser.parse_args(args=[]))
        self.vocab = Dictionary()
        self.vocab.add_from_file(pretrained_path + "dict.txt")
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        self.cls_token = '<s> '
        self.sep_token = ' </s>'
    def fit(self, sentences):
        if self.model is None:
            from fairseq.models.roberta import RobertaModel
            from fairseq.data.encoders.fastbpe import fastBPE

            self.model = RobertaModel.from_pretrained(
                'PhoBERT_base_fairseq', checkpoint_file='model.pt')
            self.model.eval()

            args = BPE()
            self.model.bpe = fastBPE(args)
        return self
Example no. 9
    def __init__(
        self,
        model_path: str,
    ) -> None:

        self.bpe = fastBPE(Args(model_path + "/bpe.codes"))
        self.vocab = Dictionary()
        self.vocab.add_from_file(f"{model_path}/dict.txt")
        self._tokenizer_lowercases = False
        self.sequence_pair_start_tokens = [Token(text="<s>", text_id=0, type_id=0)]
        self.sequence_pair_mid_tokens = [Token(text="</s>", text_id=2, type_id=0), Token(text="</s>", text_id=2, type_id=0)]
        self.sequence_pair_end_tokens = [Token(text="</s>", text_id=2, type_id=0)]
Example no. 10
 def __init__(self, pretrain="auxiliary_data/PhoBERT_base_fairseq"):
     self.phoBERT = RobertaModel.from_pretrained(pretrain,
                                                 checkpoint_file='model.pt')
     self.phoBERT.eval()
     parser = options.get_preprocessing_parser()
     parser.add_argument('--bpe-codes',
                         type=str,
                         help='path to fastBPE BPE',
                         default=pretrain + "/bpe.codes")
     args, unknown = parser.parse_known_args()
     self.phoBERT.bpe = fastBPE(args)  # Incorporate the BPE encoder into PhoBERT
Example no. 11
    def __init__(self, pretrained_path='./bertweet/'):

        self.bpe = fastBPE(
            SimpleNamespace(bpe_codes=pretrained_path + "bpe.codes"))
        self.vocab = Dictionary()
        self.vocab.add_from_file(pretrained_path + "dict.txt")
        self.cls_token_id = 0
        self.pad_token_id = 1
        self.sep_token_id = 2
        self.pad_token = '<pad>'
        self.cls_token = '<s>'
        self.sep_token = '</s>'
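
    # Hedged sketch of the encode step such a wrapper usually exposes; the method name and max_length
    # are assumptions (not part of the original class), but the pattern mirrors the other examples in
    # this collection: BPE-encode, wrap with <s>/</s>, map through the fairseq Dictionary, then pad.
    def encode(self, line, max_length=128):
        subwords = self.cls_token + ' ' + self.bpe.encode(line) + ' ' + self.sep_token
        ids = self.vocab.encode_line(subwords, append_eos=False,
                                     add_if_not_exist=False).long().tolist()
        ids = ids[:max_length]
        return ids + [self.pad_token_id] * (max_length - len(ids))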
Example no. 12
 def __init__(self, max_length=512):
     self.bpe = fastBPE(BPEConfig)
     self.vocab = Dictionary()
     self.vocab.add_from_file(os.path.join(os.getcwd(),
                                           'pretrained',
                                           'PhoBERT_base_transformers',
                                           'dict.txt'))
     self.rdr_segmenter = VnCoreNLP(
         os.path.join('vncorenlp', 'VnCoreNLP-1.1.1.jar'),
         annotators='wseg',
         max_heap_size='-Xmx500m'
     )
     self.max_length = max_length
Example no. 13
def get_w2v_sent(arr_sent):
    # from transformers import RobertaModel
    # phobert_w2v = RobertaModel.from_pretrained(
    #     "PhoBERT_base_transformers/model.bin",
    #     config=config
    # )

    args = parser_w2v.parse_args()
    bpe = fastBPE(args)

    # Load the dictionary
    # vocab = Dictionary()
    # vocab.add_from_file("PhoBERT_base_transformers/dict.txt")
    # line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
    line = ' '.join(arr_sent[0])
    # Encode the line using fastBPE & Add prefix <s> and suffix </s>
    subwords = '<s> ' + bpe.encode(line) + ' </s>'

    # Map subword tokens to corresponding indices in the dictionary
    input_ids = vocab.encode_line(subwords,
                                  append_eos=False,
                                  add_if_not_exist=False).long().tolist()

    # Convert into torch tensor
    all_input_ids = torch.tensor([input_ids], dtype=torch.long)

    # Extract features
    with torch.no_grad():
        features = phobert_w2v(all_input_ids)

    # Represent each word by the contextualized embedding of its first subword token
    # i. Get indices of the first subword tokens of words in the input sentence
    listSWs = subwords.split()
    firstSWindices = []
    for ind in range(1, len(listSWs) - 1):
        if not listSWs[ind - 1].endswith("@@"):
            firstSWindices.append(ind)

    # ii. Extract the corresponding contextualized embeddings
    vector_sent = []
    words = line.split()
    assert len(firstSWindices) == len(words)
    for word, index in zip(words, firstSWindices):
        # contextualized embedding of the word's first subword token
        vector_sent.append(features[0][0, index, :].tolist())
    result = np.array(vector_sent)
    result_vec = np.sum(result, axis=0)
    return result_vec
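
# A minimal, self-contained illustration of the first-subword rule used above
# (hypothetical subword sequence; "@@" marks a piece that continues into the next token):
example_subwords = "<s> Tô@@ i là sinh_viên </s>".split()
first_indices = [ind for ind in range(1, len(example_subwords) - 1)
                 if not example_subwords[ind - 1].endswith("@@")]
print(first_indices)  # [1, 3, 4] -> first pieces of "Tôi", "là", "sinh_viên"
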
def get_input_ids_and_att_masks(
        lines: pd.core.series.Series) -> Tuple[List, List]:
    # Load BPE Tokenizer
    print('Load BPE Tokenizer')
    parser: argparse.ArgumentParser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="./BERTweet_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args: argparse.Namespace = parser.parse_args()
    bpe: fastBPE = fastBPE(args)

    vocab: Dictionary = Dictionary()
    vocab.add_from_file("./BERTweet_base_transformers/dict.txt")

    input_ids: List = []
    attention_masks: List = []
    for line in lines:
        # (1) Tokenize the sentence
        # (2) Add <CLS> token and <SEP> token (<s> and </s>)
        # (3) Map tokens to IDs
        # (4) Pad/Truncate the sentence to `max_length`
        # (5) Create attention masks for [PAD] tokens
        subwords: str = '<s> ' + \
            bpe.encode(line.lower()) + ' </s>'  # (1) + (2)
        line_ids: List = vocab.encode_line(
            subwords, append_eos=False,
            add_if_not_exist=False).long().tolist()  # (3)

        if len(line_ids) < MAX_LENGTH:
            paddings: torch.Tensor = torch.ones(
                (1, MAX_LENGTH - len(line_ids)), dtype=torch.long)
            # convert the line_ids to torch tensor
            tensor_line_ids: torch.Tensor = torch.cat(
                [torch.tensor([line_ids], dtype=torch.long), paddings], dim=1)
            line_attention_masks: torch.Tensor = torch.cat([
                torch.ones((1, len(line_ids)), dtype=torch.long),
                torch.zeros((1, MAX_LENGTH - len(line_ids)), dtype=torch.long)
            ], dim=1)
        else:
            # len(line_ids) >= MAX_LENGTH: truncate to MAX_LENGTH (this also covers the exact-length case)
            tensor_line_ids: torch.Tensor = torch.tensor(
                [line_ids[0:MAX_LENGTH]], dtype=torch.long)
            line_attention_masks: torch.Tensor = torch.ones((1, MAX_LENGTH),
                                                            dtype=torch.long)

        input_ids.append(tensor_line_ids)
        attention_masks.append(line_attention_masks)

    return input_ids, attention_masks
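
# Hedged usage sketch (assumes MAX_LENGTH is defined at module level and the BERTweet_base_transformers
# files referenced above are present; the example tweets are placeholders):
import pandas as pd

tweets = pd.Series(["SC has first two presumptive cases of coronavirus", "ok :)"])
ids, masks = get_input_ids_and_att_masks(tweets)
batch_ids = torch.cat(ids, dim=0)      # shape: (len(tweets), MAX_LENGTH)
batch_masks = torch.cat(masks, dim=0)  # shape: (len(tweets), MAX_LENGTH)
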
    def __init__(self, device, model):
        super(HaggingFaceEmbeddings, self).__init__(device=device)
        self.model_keys = self.get_model_keys()
        MODELS = {'bert-base-uncased': (BertModel, BertTokenizer, 'bert-base-uncased'),
                  'openai-gpt': (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
                  'transfo-xl-wt103': (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
                  'gpt2': (GPT2Model, GPT2Tokenizer, 'gpt2'),
                  'xlm-mlm-enfr-1024': (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
                  'xlnet-base-cased': (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
                  'roberta-base': (RobertaModel, RobertaTokenizer, 'roberta-base'),
                  'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
                  'ctrl': (CTRLModel, CTRLTokenizer, 'ctrl'),
                  'distilbert-base-cased': (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
                  'camembert': (CamembertModel, CamembertTokenizer, 'camembert-base'),
                  'albert-base-v2': (AlbertModel, AlbertTokenizer, 'albert-base-v2'),
                  'xlm-roberta-base': (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
                  'flaubert_base_cased': (FlaubertModel, FlaubertTokenizer, 'flaubert/flaubert_base_cased'),
                  'bart-large': (BartModel, BartTokenizer, 'facebook/bart-large'),
                  't5-small': (T5Model, T5Tokenizer, 't5-small'),
                  'electra-small-discriminator': (ElectraModel, ElectraTokenizer, 'google/electra-small-discriminator'),
                  # DialoGPT
                  'reformer-crime-and-punishment': (ReformerModel, ReformerTokenizer, 'google/reformer-crime-and-punishment'),
                  'opus-mt-en-ROMANCE': (MarianMTModel, MarianTokenizer, 'Helsinki-NLP/opus-mt-en-ROMANCE'),
                  'longformer-base-4096': (LongformerModel, LongformerTokenizer, 'allenai/longformer-base-4096'),
                  'retribert': (RetriBertModel, RetriBertTokenizer, 'distilbert-base-uncased'),
                  'mobilebert-uncased': (MobileBertModel, MobileBertTokenizer, 'google/mobilebert-uncased')
                  }

        if model not in self.model_keys:
            raise ValueError('{} is not in keys'.format(model))


        self.model_name = MODELS[model][2]
        self.tokenizer = MODELS[model][1].from_pretrained(self.model_name)
        self.model = MODELS[model][0].from_pretrained(self.model_name)
        self.model.eval()  # disable dropout (or leave in train mode to finetune)
        self.model.to(self.device)
        self.pad_token_id = self.tokenizer.pad_token_id
        self.embedding_dim = self.model.config.hidden_size

        parser = argparse.ArgumentParser()
        parser.add_argument('--bpe-codes',
                            default="../data/models/BERTweet_base_transformers/bpe.codes",
                            required=False,
                            type=str,
                            help='path to fastBPE BPE'
                            )
        args = parser.parse_args()
        self.bpe = fastBPE(args)
        self.max_seq_length = 256
Example no. 16
    def __init__(self):
        # Load the model in fairseq
        MODEL_DIR = '/usr/local/software/pretrained-models/PhoBERT_base_fairseq'
        __checkpoint_file = join(MODEL_DIR, 'model.pt')
        self.__phoBERT = RobertaModel.from_pretrained(
            MODEL_DIR, checkpoint_file=__checkpoint_file)
        self.__phoBERT.eval()  # disable dropout (or leave in train mode to finetune)

        # Initialize the Byte Pair Encoding for PhoBERT
        class BPE():
            bpe_codes = '/usr/local/software/pretrained-models/PhoBERT_base_fairseq/bpe.codes'

        args = BPE()
        self.__phoBERT.bpe = fastBPE(args)  # Incorporate the BPE encoder into PhoBERT
Example no. 17
def _init_model(pretrain_model):
    bpe_path = os.path.join(pretrain_model, "bpe.codes")

    BERTweet = RobertaModel.from_pretrained(pretrain_model,
                                            checkpoint_file='model.pt')
    BERTweet.eval()  # disable dropout (or leave in train mode to finetune)

    # Incorporate the BPE encoder into BERTweet-base

    parser = options.get_preprocessing_parser()
    parser.add_argument('--bpe-codes',
                        type=str,
                        help='path to fastBPE BPE',
                        default=bpe_path)
    args = parser.parse_args()
    BERTweet.bpe = fastBPE(args)  # Incorporate the BPE encoder into BERTweet
    return BERTweet
Example no. 18
def load_phobert_model():

    device = torch.device("cpu")

    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default=paths.bpe_codes_path,
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')

    args = parser.parse_args()
    bpe = fastBPE(args)

    vn_tokenizer = VnCoreNLP(paths.vncore_jar_path,
                             annotators="wseg",
                             max_heap_size='-Xmx500m')

    # config model
    config = RobertaConfig.from_pretrained(paths.config_path,
                                           output_hidden_states=True,
                                           num_labels=3)

    model_bert = RobertaForAIViVN.from_pretrained(paths.pretrained_path,
                                                  config=config)
    # model_bert.cuda()

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file(paths.dict_path)
    '''
    if torch.cuda.device_count():
        print(f"Testing using {torch.cuda.device_count()} gpus")
        model_bert = nn.DataParallel(model_bert)
        tsfm = model_bert.module.roberta
    else:
        tsfm = model_bert.roberta
    '''

    model_bert = nn.DataParallel(model_bert)
    tsfm = model_bert.module.roberta

    model_bert.load_state_dict(
        torch.load(paths.phobert_path, map_location=device))

    return bpe, vn_tokenizer, model_bert, vocab
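
# Hedged usage sketch: `paths`, `RobertaForAIViVN` and the checkpoint are project-specific, and the
# segment -> BPE -> ids pipeline below is an illustration of the standard PhoBERT preprocessing,
# not this project's exact inference code.
bpe, vn_tokenizer, model_bert, vocab = load_phobert_model()
segmented = vn_tokenizer.tokenize("Tôi là sinh viên trường đại học Công nghệ .")
line = " ".join(segmented[0])  # word-segmented sentence, e.g. "Tôi là sinh_viên ..."
subwords = "<s> " + bpe.encode(line) + " </s>"
input_ids = vocab.encode_line(subwords, append_eos=False,
                              add_if_not_exist=False).long().unsqueeze(0)
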
    def __init__(self,
                 pretrain_path=None,
                 n_class=config.MODEL['N_CLASS'],
                 device='cpu'):
        super(ClassifierModel, self).__init__()
        self.device = device

        from fairseq.models.roberta import RobertaModel
        self.bert_model = RobertaModel.from_pretrained(
            config.PATH['PHO_BERT'], checkpoint_file='model.pt')
        self.bert_model.bpe = fastBPE(BPE())
        self.bert_model.register_classification_head('new_task',
                                                     num_classes=n_class)
        self.bert_model.to(device=device)

        if pretrain_path is not None:
            self.load_model(pretrain_path)
        self.bert_model.eval()
Example no. 20
def load_bpe_and_vocab(args):
    if args.model.model_class == 'roberta':
        args.model.bpe_codes = os.path.join(utils.PROJ_DIR,
                                            args.model.bpe_codes)
        print(
            f"Loading BPE from pretrained checkpoint at {args.model.bpe_codes}"
        )
        bpe = fastBPE(args.model)

        args.model.vocab = os.path.join(utils.PROJ_DIR, args.model.vocab)
        print(f"Loading BPE from pretrained checkpoint at {args.model.vocab}")
        vocab = Dictionary()
        vocab.add_from_file(args.model.vocab)
        print()
    else:
        bpe, vocab = None, None

    return bpe, vocab
    def loadModel(self):
        parser = argparse.ArgumentParser()
        parser.add_argument('--bpe-codes',
                            type=str,
                            help='path to fastBPE BPE',
                            default=self.BPE_PATH)
        args = parser.parse_args("")

        phoBERT = RobertaModel.from_pretrained(self.MODEL_PATH,
                                               checkpoint_file='model.pt')
        phoBERT.eval()
        phoBERT.bpe = fastBPE(args)

        rdrsegmenter = VnCoreNLP(self.VNCORENLP_PATH,
                                 annotators="wseg",
                                 max_heap_size='-Xmx500m')

        return phoBERT, rdrsegmenter
def get_bert_embedding(lines: List[str]) -> List[torch.Tensor]:
    # Load model
    config = RobertaConfig.from_pretrained(
        "./BERTweet_base_transformers/config.json")
    BERTweet = RobertaModel.from_pretrained(
        "./BERTweet_base_transformers/model.bin", config=config)

    # Load BPE encoder
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="./BERTweet_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args = parser.parse_args()
    bpe = fastBPE(args)

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file("./BERTweet_base_transformers/dict.txt")

    result: List[torch.Tensor] = []
    for i in range(len(lines)):
        line: str = lines[i]

        # Encode the line using fastBPE & Add prefix <s> and suffix </s>
        subwords = '<s> ' + bpe.encode(line) + ' </s>'

        # Map subword tokens to corresponding indices in the dictionary
        input_ids = vocab.encode_line(subwords,
                                      append_eos=False,
                                      add_if_not_exist=False).long().tolist()

        # Convert into torch tensor
        all_input_ids = torch.tensor([input_ids], dtype=torch.long)

        features = None

        with torch.no_grad():
            features = BERTweet(all_input_ids)

        result.append(features[0][:, 0, :].squeeze(0))  # <s> (CLS) embedding, kept as a tensor to match the return type

    return result
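
# Hedged usage sketch (illustration only; assumes the BERTweet_base_transformers files are in the
# working directory and the lines are already normalised the way the model expects):
cls_vectors = get_bert_embedding(["SC has first two presumptive cases of coronavirus", "ok :)"])
print(len(cls_vectors), cls_vectors[0].shape)  # one 768-dimensional <s> (CLS) vector per input line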
Example no. 23
    def __init__(self, bpe_path: str, vncorenlp_path: str, do_lower_case: bool = False):
        bpe_codes_path = os.path.join(bpe_path, BPECODE_FILE)
        vocab_file_path = os.path.join(bpe_path, VOCAB_FILE)
        
        if not os.path.isfile(bpe_codes_path):
            raise EnvironmentError(f"{BPECODE_FILE} not found in {bpe_path}")
            
        if not os.path.isfile(vocab_file_path):
            raise EnvironmentError(f"{VOCAB_FILE} not found in {bpe_path}")

        self.do_lower_case = do_lower_case
        
        BPEConfig = namedtuple('BPEConfig', 'vncorenlp bpe_codes vocab')

        self.pho_config = BPEConfig(vncorenlp=vncorenlp_path, bpe_codes=bpe_codes_path, vocab=vocab_file_path)
        self.rdrsegmenter = VnCoreNLP(self.pho_config.vncorenlp, annotators="wseg", max_heap_size='-Xmx1g')
        self.bpe = fastBPE(self.pho_config)
        self.vocab = Dictionary()
        self.vocab.add_from_file(self.pho_config.vocab)
    def __init__(self, model_path):

        #Load the pretrained PhoBERT Model
        print("Loading Classification...")
        self.config = RobertaConfig.from_pretrained(
            model_path + 'PhoBERT/config.json',
            from_tf=False,
            num_labels=5,
            output_hidden_states=False,
        )
        self.phoBERT_cls = RobertaForSequenceClassification.from_pretrained(
            model_path + 'PhoBERT/model.bin', config=self.config)
        device = "cuda:0"
        self.phoBERT_cls = self.phoBERT_cls.to(device)
        self.phoBERT_cls.eval()
        print("Loading pre-trained model...")
        self.phoBERT_cls.load_state_dict(
            torch.load(
                model_path +
                'roberta_state_dict_9bfb8319-01b2-4301-aa5a-756d390a98e1.pth'))
        print("Finished loading PhoBERT Classification model.")

        #Load the BPE and Vocabulary Dictionary
        print("Loading BPE and vocab dict ...")

        class BPE():
            bpe_codes = model_path + 'PhoBERT/bpe.codes'

        args = BPE()
        self.bpe = fastBPE(args)
        self.vocab = Dictionary()
        self.vocab.add_from_file(model_path + "PhoBERT/dict.txt")
        print("Finished loading BPE and vocab dict.")

        #Load the Text Recognizer
        config = Cfg.load_config_from_name('vgg_transformer')
        config['weights'] = 'weights/transformerocr.pth'
        config['cnn']['pretrained'] = False
        config['device'] = 'cuda:0'
        config['predictor']['beamsearch'] = False
        self.text_recognizer = Predictor(config)
Example no. 25
                    default='./PhoBERT_large_transformers/model.bin')
parser.add_argument('--max_sequence_length', type=int, default=256)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--accumulation_steps', type=int, default=5)
parser.add_argument('--epochs', type=int, default=5)
parser.add_argument('--fold', type=int, default=0)
parser.add_argument('--seed', type=int, default=69)
parser.add_argument('--lr', type=float, default=3e-5)
parser.add_argument('--ckpt_path', type=str, default='./models_zalo')
parser.add_argument('--bpe-codes',
                    default="./PhoBERT_large_transformers/bpe.codes",
                    type=str,
                    help='path to fastBPE BPE')

args = parser.parse_args()
bpe = fastBPE(args)
rdrsegmenter = VnCoreNLP(args.rdrsegmenter_path,
                         annotators="wseg",
                         max_heap_size='-Xmx500m')

seed_everything(69)

# Load model
config = RobertaConfig.from_pretrained(args.config_path,
                                       output_hidden_states=True,
                                       num_labels=6)

model_bert = RobertaForAIViVN.from_pretrained(args.pretrained_path,
                                              config=config)
model_bert.cuda()
from fairseq.models.roberta import RobertaModel
phobert = RobertaModel.from_pretrained('PhoBERT_base_fairseq',
                                       checkpoint_file='model.pt')
phobert.eval()  # disable dropout (or leave in train mode to finetune)

# Incorporate the BPE encoder into PhoBERT-base
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq import options

parser = options.get_preprocessing_parser()
parser.add_argument('--bpe-codes',
                    type=str,
                    help='path to fastBPE BPE',
                    default="PhoBERT_base_fairseq/bpe.codes")
args = parser.parse_args()
phobert.bpe = fastBPE(args)


def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor,
                       other_tokens: List[str]):
    """
    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).
    Args:
        roberta (RobertaHubInterface): RoBERTa instance
        bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`
        other_tokens (List[str]): other tokens of shape `(T_words)`
    Returns:
        List[str]: mapping from *other_tokens* to corresponding *bpe_tokens*.
    """
    assert bpe_tokens.dim() == 1
    assert bpe_tokens[0] == 0
Example no. 27
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings("ignore")

LEARNING_RATE = 6e-5
MAX_LEN = 126
TRAIN_BATCH_SIZE = 35
VALID_BATCH_SIZE = 32
EPOCHS = 4
INPUT_PATH = "/content/drive/My Drive/Tweet Sentiment Extraction/input/"
OUTPUT_PATH = ""
TRAINING_FILE = f""
ROBERTA_PATH = f""

key = argparse.Namespace(bpe_codes= f"BERTweet_base_transformers/bpe.codes")
bpe = fastBPE(key)

# Load the dictionary  
vocab = Dictionary()
vocab.add_from_file("BERTweet_base_transformers/dict.txt")

# TOKENIZER = transformers.RobertaTokenizer(
#     vocab_file =  f'{ROBERTA_PATH}vocab.json',
#     merges_file = f'{ROBERTA_PATH}merges.txt',
#     lowercase = True,
#     add_prefix_space = True
# )

class AverageMeter:
    """
    Computes and stores the average and current value
Example no. 28
 def __init__(self, bpe_path, vocab_path):
     self._bpe = fastBPE(Config(bpe_codes=bpe_path))
     self._vocab = self._get_vocab(vocab_path)
Example no. 29
import torch
from fairseq.models.roberta import RobertaModel
from fairseq.data.encoders.fastbpe import fastBPE

from CONFIG import *
phoBERT = RobertaModel.from_pretrained('PhoBERT_base_fairseq',
                                       checkpoint_file='model.pt')
phoBERT.eval()  # disable dropout (or leave in train mode to finetune)


class BPE():
    bpe_codes = 'PhoBERT_base_fairseq/bpe.codes'


args = BPE()
phoBERT.bpe = fastBPE(args)  #Incorporate the BPE encoder into PhoBERT
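
# An equivalent, more compact alternative to the BPE helper class above, mirroring the SimpleNamespace
# pattern used elsewhere in this collection (shown commented out since it replaces the assignment above):
# from types import SimpleNamespace
# phoBERT.bpe = fastBPE(SimpleNamespace(bpe_codes='PhoBERT_base_fairseq/bpe.codes'))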


def embedding_document(document):
    doc = ViTokenizer.tokenize(document)
    tokens = phoBERT.encode(doc)
    if len(tokens) > 256:
        chunks = math.ceil(len(tokens) / 256)
        emb = []
        sum_tokens = len(tokens)
        chunks = min(chunks, 3)
        for i in range(chunks):
            sum_tokens = sum_tokens - 256
            if sum_tokens > 0:
                emb.append(
                    phoBERT.extract_features(tokens[i * 256:(i + 1) *
Example no. 30
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        choices=["vlsp_2018_single", \
                                "vlsp_2018_NLI_M", "vlsp_2018_QA_M", "vlsp_2018_NLI_B", "vlsp_2018_QA_B"],
                        help="The name of the task to train.")
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument('--bpe-codes', 
                        default=None,
                        required=True,
                        type=str,  
                        help='path to fastBPE BPE')
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        required=True,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    
    ## Other parameters
    parser.add_argument("--do_save_model",
                        default=False,
                        action='store_true',
                        help="Whether to save checkpoint.")
    parser.add_argument("--eval_test",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the test set.")                    
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--accumulate_gradients",
                        type=int,
                        default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', 
                        type=int, 
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")                       
    args = parser.parse_args()


    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
                            args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # prepare dataloaders
    processors = {
        "vlsp_2018_single":VLSP_2018_single_Processor,
        "vlsp_2018_NLI_M":VLSP_2018_NLI_M_Processor,
        "vlsp_2018_QA_M":VLSP_2018_QA_M_Processor,
        "vlsp_2018_NLI_B":VLSP_2018_NLI_B_Processor,
        "vlsp_2018_QA_B":VLSP_2018_QA_B_Processor,
    }

    processor = processors[args.task_name]()
    label_list = processor.get_labels()

    bert_config = RobertaConfig.from_pretrained(args.bert_config_file)
    bert_config.num_labels = len(label_list)
    
    
    label2id = {}
    id2label = {}
    for (i, label) in enumerate(label_list):
        label2id[label] = i
        id2label[str(i)] = label

    bert_config.label2id = label2id
    bert_config.id2label = id2label

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
            args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    bpe = fastBPE(args)
    vocab = Dictionary()
    vocab.add_from_file(args.vocab_file)

    # training set
    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

    train_features = convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, bpe, vocab)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)


    # dev set
    dev_examples = processor.get_dev_examples(args.data_dir)
    dev_features = convert_examples_to_features(
        dev_examples, label_list, args.max_seq_length, bpe, vocab)
    
    all_dev_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    all_dev_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    all_dev_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
    all_dev_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)

    dev_data = TensorDataset(all_dev_input_ids, all_dev_input_mask, all_dev_segment_ids, all_dev_label_ids)
    dev_dataloader = DataLoader(dev_data, batch_size=args.eval_batch_size, shuffle=False)

    # test set
    if args.eval_test:
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, bpe, vocab)

        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size, shuffle=False)


    # model and optimizer
    model = RobertaForSequenceClassification(bert_config)

    if args.init_checkpoint is not None:
        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
         {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
         ]
	
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)


    # train
    output_log_file = os.path.join(args.output_dir, "log.txt")
    print("output_log_file=",output_log_file)
    with open(output_log_file, "w") as writer:
        if args.eval_test:
            writer.write("epoch\tglobal_step\tloss\tdev_loss\tdev_accuracy\ttest_loss\ttest_accuracy\n")
        else:
            writer.write("epoch\tglobal_step\tloss\n")
    
    global_step = 0
    epoch=0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch+=1
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            # RoBERTa does not use token_type_ids
            loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()    # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1
        
        if(args.do_save_model):
            if(n_gpu > 1):
                torch.save(model.module.state_dict(), os.path.join(args.output_dir, 'model_ep' + str(epoch) + '.bin'))
            else:
                torch.save(model.state_dict(), os.path.join(args.output_dir, 'model_ep' + str(epoch) + '.bin'))

        #dev eval
        model.eval()
        dev_loss, dev_accuracy = 0, 0
        nb_dev_steps, nb_dev_examples = 0, 0
        with open(os.path.join(args.output_dir, "dev_ep_"+str(epoch)+".txt"),"w") as f_dev:
            for input_ids, input_mask, segment_ids, label_ids in dev_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_dev_test_loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)

                logits = F.softmax(logits, dim=-1)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                outputs = np.argmax(logits, axis=1)
                for output_i in range(len(outputs)):
                    f_dev.write(str(outputs[output_i]))
                    for ou in logits[output_i]:
                        f_dev.write(" "+str(ou))
                    f_dev.write("\n")
                tmp_dev_accuracy=np.sum(outputs == label_ids)

                dev_loss += tmp_dev_test_loss.mean().item()
                dev_accuracy += tmp_dev_accuracy

                nb_dev_examples += input_ids.size(0)
                nb_dev_steps += 1

        dev_loss = dev_loss / nb_dev_steps
        dev_accuracy = dev_accuracy / nb_dev_examples

        # eval_test
        if args.eval_test:
            model.eval()
            test_loss, test_accuracy = 0, 0
            nb_test_steps, nb_test_examples = 0, 0
            with open(os.path.join(args.output_dir, "test_ep_"+str(epoch)+".txt"),"w") as f_test:
                for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_test_loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)

                    logits = F.softmax(logits, dim=-1)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output_i in range(len(outputs)):
                        f_test.write(str(outputs[output_i]))
                        for ou in logits[output_i]:
                            f_test.write(" "+str(ou))
                        f_test.write("\n")
                    tmp_test_accuracy=np.sum(outputs == label_ids)

                    test_loss += tmp_test_loss.mean().item()
                    test_accuracy += tmp_test_accuracy

                    nb_test_examples += input_ids.size(0)
                    nb_test_steps += 1

            test_loss = test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples


        result = collections.OrderedDict()
        if args.eval_test:
            result = {'epoch': epoch,
                    'global_step': global_step,
                    'loss': tr_loss/nb_tr_steps,
                    'dev_loss': dev_loss,
                    'dev_accuracy': dev_accuracy,
                    'test_loss': test_loss,
                    'test_accuracy': test_accuracy}
        else:
            result = {'epoch': epoch,
                    'global_step': global_step,
                    'loss': tr_loss/nb_tr_steps}

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            for key in result.keys():
                logger.info("  %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")