Example 1
  def build_model(cls, args, task):

    model_fast = RobertaModel.build_model(args, task)
    model_slow = RobertaModel.build_model(args, task)

    if args.roberta_model_path != "":
      state = checkpoint_utils.load_checkpoint_to_cpu(args.roberta_model_path)
      model_fast.load_state_dict(state["model"], strict=True, args=args)
      model_slow.load_state_dict(state["model"], strict=True, args=args)
    else:
      model_slow.load_state_dict(model_fast.state_dict(), strict=True, args=args)

    proj = None
    if args.use_proj:
      # NOTE: the projection is always shared (share_proj)
      langs = ["share_lang"]
      proj = build_projection_dict(langs, args.encoder_embed_dim, args.activation_fn, args.fp16)

    if "xlco_queue_size" in args:
      xlco_queue_size = args.xlco_queue_size
    else:
      xlco_queue_size = 1
    print("xlco_queue_size is set as %d" % xlco_queue_size, flush=True)
    queue = torch.randn(xlco_queue_size, args.encoder_embed_dim)

    return cls(model_fast, model_slow, queue, proj=proj)
Example 2
 def add_args(parser):
     RobertaModel.add_args(parser)
     parser.add_argument(
         "--no-final-layer-norm",
         action="store_true",
         help=("don't add final layernorm (only applicable when "
               "--encoder-normalize-before=True)"),
     )
Example 3
    def __init__(self):
        if not os.path.exists(AGGREGATOR_DIR):
            os.makedirs(AGGREGATOR_DIR)
        if not os.path.isfile(AGGREGATOR_2015_2016):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2016_URL,
                          AGGREGATOR_2015_2016,
                          bar=self._download_progress_bar)
        if not os.path.isfile(AGGREGATOR_2015_2017):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2017_URL,
                          AGGREGATOR_2015_2017,
                          bar=self._download_progress_bar)
        if not os.path.isfile(AGGREGATOR_2015_2016_8_dim):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2016_8_dim_URL,
                          AGGREGATOR_2015_2016_8_dim,
                          bar=self._download_progress_bar)
        if not os.path.isfile(AGGREGATOR_2015_2017_8_dim):
            print("Downloading aggregators from s3...")
            wget.download(AGGREGATOR_2015_2017_8_dim_URL,
                          AGGREGATOR_2015_2017_8_dim,
                          bar=self._download_progress_bar)
        if not os.path.isfile(ROBERTA_STS_PATH + '/checkpoint_best.pt'):
            print("Downloading ROBERTA STS model from s3...")
            wget.download(ROBERTA_STS_URL,
                          ROBERTA_STS_PATH + '/checkpoint_best.pt',
                          bar=self._download_progress_bar)
        if not os.path.isfile(ROBERTA_MNLI_PATH + '/model_mnli.pt'):
            print("Downloading ROBERTA MNLI model from s3...")
            wget.download(ROBERTA_MNLI_URL,
                          ROBERTA_MNLI_PATH + '/model_mnli.pt',
                          bar=self._download_progress_bar)

        self.roberta_STS = RobertaModel.from_pretrained(
            checkpoint_file='checkpoint_best.pt',
            model_name_or_path=ROBERTA_STS_PATH)
        self.roberta_STS.eval()

        self.roberta_MNLI = RobertaModel.from_pretrained(
            checkpoint_file='model_mnli.pt',
            model_name_or_path=ROBERTA_MNLI_PATH)
        self.roberta_MNLI.eval()
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.agg_one = load(AGGREGATOR_2015_2016)
        self.agg_two = load(AGGREGATOR_2015_2017)
        self.agg_one_8_dim = load(AGGREGATOR_2015_2016_8_dim)
        self.agg_two_8_dim = load(AGGREGATOR_2015_2017_8_dim)
Example 4
def loadRobertaCheckpoint(pathBERTCheckpoint, pathData, from_pretrained=False):
    """
    Load Roberta model from checkpoint.
    If loading a pretrained model from fairseq, set from_pretrained=True.
    """
    if from_pretrained:  # Requires a network connection to download the BPE; may fail for trained checkpoints that contain a cfg
        roberta = RobertaModel.from_pretrained(dirname(pathBERTCheckpoint),
                                               basename(pathBERTCheckpoint),
                                               pathData)
    else:
        # Set up the args Namespace
        model_args = argparse.Namespace(task='masked_lm',
                                        seed=-1,
                                        output_dictionary_size=-1,
                                        data=pathData,
                                        path=pathBERTCheckpoint)

        # Setup task
        task = tasks.setup_task(model_args)

        # Load model
        models, _model_args = checkpoint_utils.load_model_ensemble(
            [model_args.path], task=task)
        model = models[0]

        # Wrap-up to RobertaHubInterface (to be consistent with RobertaModel.from_pretrained)
        roberta = RobertaHubInterface(_model_args, task, model)

    return roberta
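
A minimal usage sketch for the helper above; the paths below are hypothetical placeholders rather than values from the source.

# Load a locally trained checkpoint without network access
# (use from_pretrained=True instead for an official fairseq release).
roberta = loadRobertaCheckpoint(
    "/path/to/checkpoint_best.pt",   # hypothetical checkpoint path
    "/path/to/data-bin",             # hypothetical fairseq data directory
    from_pretrained=False,
)
roberta.eval()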
Example 5
 def __init__(self, device, model_path='res/roberta.large'):
     super().__init__()
     self.device = device
     with torch.no_grad():
         self.roberta = RobertaModel.from_pretrained(
             model_path, checkpoint_file='model.pt')
         self.roberta.eval()
Example 6
def Roberta_feature_extraction(ids,texts,feature_file_name):
    roberta = RobertaModel.from_pretrained('roberta.large',checkpoint_file = 'model.pt')
    roberta.eval()

    feature_dict={}
    for i in range(len(ids)):
        id = ids[i]
        print(id)
        title = texts[i]
        tokens = roberta.encode(title)
        #assert tokens.tolist() == [0, 31414, 232, 328, 2]
        print(tokens.tolist())
        roberta.decode(tokens)  # 'Hello world!'

        # Extract the last layer's features
        last_layer_features = roberta.extract_features(tokens)
        #assert last_layer_features.size() == torch.Size([1, 5, 1024])
        print(torch.mean(last_layer_features,1,True))
        #print(last_layer_features.detach().numpy().shape())
        print(len(torch.mean(last_layer_features,1,True).detach().numpy().tolist()[0][0]))
        #print(np.mean(last_layer_features.detach().numpy(), axis=0).tolist()[0])
        print(torch.mean(last_layer_features, 1, True).detach().numpy().tolist()[0][0])

        feature_dict[id] = torch.mean(last_layer_features, 1, True).detach().numpy().tolist()[0][0]
    np.save(feature_file_name, feature_dict)
Example 7
    def __init__(self, args, encoder):
        super().__init__(encoder)
        self.args = args

        # We follow BERT's random weight initialization
        self.apply(init_bert_params)

        self.classification_heads = nn.ModuleDict()

        # Add the pretrained SSL models to extract features

        if self.args.a_only or self.args.all_in:

            self.roberta_vqwav2vec = RobertaModel.from_pretrained(
                '/hpc/gsir059/INTERSPEECH/MOSI-SEMI/trained_ssl/wav2vec/vq-wav2vec-Kmeans-Roberta',
                checkpoint_file='bert_kmeans.pt')

            if self.args.frozen_ssl:
                for param in self.roberta_vqwav2vec.parameters():
                    param.requires_grad = False

        if self.args.t_only or self.args.all_in:
            roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')

            # Freeze the pretrained SSL parameters
            self.model_text2vec = roberta
            if self.args.frozen_ssl:

                for param in self.model_text2vec.parameters():
                    param.requires_grad = False
Example 8
 def __init__(self):
     self.model = RobertaModel.from_pretrained(
         "/data/models/roberta.large",
         checkpoint_file="model.pt",
     )
     self.model.to("cpu")
     self.model.eval()
Example 9
    def from_pretrained(cls,
                        hparams: HyperOptArgumentParser,
                        lm_head: bool = False):
        if not os.path.exists("pretrained/"):
            os.mkdir("pretrained/")

        pretrained_model = hparams.pretrained_model
        if pretrained_model == "roberta.base":
            download_file_maybe_extract(
                ROBERTA_BASE_URL,
                directory="pretrained",
                check_files=[ROBERTA_BASE_MODEL_NAME],
            )

        elif pretrained_model == "roberta.large":
            download_file_maybe_extract(
                ROBERTA_LARGE_URL,
                directory="pretrained",
                check_files=[ROBERTA_LARGE_MODEL_NAME],
            )
        else:
            raise Exception(f"{pretrained_model} is an invalid RoBERTa model.")

        roberta = RobertaModel.from_pretrained("pretrained/" +
                                               pretrained_model,
                                               checkpoint_file="model.pt")
        roberta.eval()
        tokenizer = RoBERTaTextEncoder(
            roberta.encode, roberta.task.source_dictionary.__dict__["indices"])
        return RoBERTa(roberta=roberta,
                       tokenizer=tokenizer,
                       hparams=hparams,
                       lm_head=lm_head)
Example 10
def predict():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="")
    parser.add_argument("--task",
                        default=None,
                        type=str,
                        required=True,
                        help="")
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="")
    args = parser.parse_args()
    # print(args)

    roberta = RobertaModel.from_pretrained(
        args.output_dir,
        # './outputs/RTE/7/',
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path=args.data_dir)

    label_fn = lambda label: roberta.task.label_dictionary.string(
        [label + roberta.task.target_dictionary.nspecial])
    # print(label_fn)
    ncorrect, nsamples = 0, 0
    roberta.cuda()
    roberta.eval()
    with open('../data-superglue-csv/' + args.task + '/test.tsv') as fin:
        fin.readline()
        preds = []
        for index, line in enumerate(fin):
            tokens = line.strip().split('\t')
            # print(tokens)
            sent1, sent2 = tokens[0], tokens[1]
            # print(sent1,"\n", sent2)
            tokens = roberta.encode(sent1, sent2)
            # print(tokens)
            if len(tokens) > 512:
                # print(len(tokens))
                # print(tokens)
                tokens = torch.cat((tokens[0].reshape(1), tokens[-511:]), 0)
                # print(tokens)
            logits = roberta.predict('sentence_classification_head', tokens)
            prediction = F.log_softmax(logits, dim=-1).argmax().item()
            # print(prediction)
            prediction_label = label_fn(prediction)
            # print(prediction_label)
            preds.append(prediction_label)
    print(preds)
    with open(args.output_dir + 'pred_results', "w") as writer:
        # print(label_list)
        for i in range(len(preds)):
            # json_i= "\"idx: %d, \"label\": \"label_i\""
            writer.write("{\"idx\": %d, \"label\": \"%s\"}\n" % (i, preds[i]))
Example 11
def sentence_predict(task, ckpdir, ckpname, savedir, datadir=None):
    if datadir is None:
        datadir = 'data/{}-bin/'.format(task)
        if task == "AX":
            datadir = 'data/MNLI-bin/'
    roberta = RobertaModel.from_pretrained(ckpdir, ckpname, datadir)
    roberta.cuda()
    roberta.eval()
    label_fn = lambda label: roberta.task.label_dictionary.string(
        [label + roberta.task.target_dictionary.nspecial])

    tasks = [task]
    testfiles = [
        os.path.join(datadir, '../glue_data/{}/test.tsv'.format(task))
    ]
    if task == "AX":
        testfiles = [
            os.path.join(datadir, '../glue_data/diagnostic/diagnostic.tsv')
        ]
    elif task == "MNLI":
        tasks = ["MNLI-m", "MNLI-mm"]
        testfiles = [
            os.path.join(datadir, '../glue_data/MNLI/test_matched.tsv'),
            os.path.join(datadir, '../glue_data/MNLI/test_mismatched.tsv')
        ]

    for task, testfile in zip(tasks, testfiles):
        with open(os.path.join(savedir, '{}.tsv'.format(task)),
                  'wt') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            tsv_writer.writerow(['index', 'prediction'])
            with open(testfile) as fin:
                fin.readline()
                for index, line in tqdm(enumerate(fin)):
                    tokens = line.strip().split('\t')
                    if task in ['CoLA', 'SST-2']:
                        tokens = roberta.encode(tokens[1])
                    elif task == "MRPC":
                        tokens = roberta.encode(tokens[3], tokens[4])
                    elif task == "STS-B":
                        tokens = roberta.encode(tokens[7], tokens[8])
                    elif task in ["MNLI-m", "MNLI-mm"]:
                        tokens = roberta.encode(tokens[8], tokens[9])
                    elif task in ["RTE", "QNLI", "QQP", "AX"]:
                        tokens = roberta.encode(tokens[1], tokens[2])
                    if task == "STS-B":
                        prediction_label = roberta.predict(
                            'sentence_classification_head',
                            tokens,
                            return_logits=True).item()
                        prediction_label = min(1.0, max(0.0, prediction_label))
                    else:
                        prediction = roberta.predict(
                            'sentence_classification_head',
                            tokens).argmax().item()
                        if 'MNLI' in task:
                            prediction = 2 - prediction
                        prediction_label = label_fn(prediction)
                    tsv_writer.writerow([index, prediction_label])
Example 12
def evaluate(words: List[str],
             path: Path = None,
             model: RobertaModel = None,
             print_step: int = 1000):
    if not model:
        model = RobertaModel.from_pretrained('../models/robbert',
                                             checkpoint_file='model.pt')
        model.eval()

    wordlistfiller = WordListFiller(words, model=model)

    dataset_path = path if path is not None else models_path / (
        "-".join(words) + ".tsv")

    correct = 0
    total = 0
    errors = 0

    with open(dataset_path) as input_file:
        for line in input_file:
            sentence, index = line.split('\t')
            expected = words[int(index.strip())]

            try:
                predicted = wordlistfiller.find_optimal_word(sentence)
                if predicted is None:
                    errors += 1
                elif predicted == expected:
                    correct += 1
                total += 1

                if total % print_step == 0:
                    print("{0:.2f}%".format(100 * correct / total),
                          correct,
                          total,
                          str(errors) + " errors",
                          expected,
                          predicted,
                          sentence,
                          sep=' / ')
            except Exception:
                print("Error with", line)
                errors += 1
                total += 1

    return correct, total, errors
Example 13
 def __init__(self,
              model_dir=MODEL_DIR,
              ckpt_file=CHECKPOINT_FILE,
              use_gpu=False):
     self.model = RobertaModel.from_pretrained(model_dir,
                                               checkpoint_file=ckpt_file)
     self.model.eval()  # disable dropout
     if use_gpu: self.model.cuda()
Example 14
    def __init__(self):
        super().__init__(embedding_dim=768)

        self.roberta = RobertaModel.from_pretrained(
            "/Users/mark/Documents/Datasets/Pretrained_models/RoBERTa/roberta.base",
            checkpoint_file="model.pt",
        )
        self.fitted: bool = False
Example 15
    def __init__(self, cfg: Wav2BertConfig, w2v_encoder: BaseFairseqModel):
        super().__init__()
        self.cfg = cfg
        self.w2v_encoder = w2v_encoder

        from fairseq.models.roberta import RobertaModel
        if os.path.isfile(os.path.join(cfg.bert_path, 'model.pt')):
            print('loading bert from cfg path')
            bert = RobertaModel.from_pretrained(cfg.bert_path, checkpoint_file='model.pt')
        else:
            print('loading bert from relative path')
            bert = RobertaModel.from_pretrained('models/roberta.base', checkpoint_file='model.pt')

        self.bert_layers = bert.model.encoder.sentence_encoder.layers

        
        self.proj = Linear(cfg.encoder_embed_dim, len(bert.task.target_dictionary))
Example 16
    def __init__(self, opt, bert_config=None):
        super(SANBertNetwork, self).__init__()
        self.dropout_list = nn.ModuleList()
        self.encoder_type = opt['encoder_type']
        if opt['encoder_type'] == EncoderModelType.ROBERTA:
            from fairseq.models.roberta import RobertaModel
            self.bert = RobertaModel.from_pretrained(opt['init_checkpoint'])
            hidden_size = self.bert.args.encoder_embed_dim
            self.pooler = LinearPooler(hidden_size)
        else:
            self.bert_config = BertConfig.from_dict(opt)
            self.bert = BertModel(self.bert_config)
            hidden_size = self.bert_config.hidden_size

        if opt.get('dump_feature', False):
            self.opt = opt
            return
        if opt['update_bert_opt'] > 0:
            for p in self.bert.parameters():
                p.requires_grad = False
        self.decoder_opt = opt['answer_opt']
        self.task_types = opt["task_types"]
        self.scoring_list = nn.ModuleList()
        labels = [int(ls) for ls in opt['label_size'].split(',')]
        task_dropout_p = opt['tasks_dropout_p']

        for task, lab in enumerate(labels):
            decoder_opt = self.decoder_opt[task]
            task_type = self.task_types[task]
            dropout = DropoutWrapper(task_dropout_p[task], opt['vb_dropout'])
            self.dropout_list.append(dropout)
            if task_type == TaskType.Span:
                assert decoder_opt != 1
                out_proj = nn.Linear(hidden_size, 2)
            elif task_type == TaskType.SeqenceLabeling:
                out_proj = nn.Linear(hidden_size, lab)
            elif task_type == TaskType.MaskLM:
                if opt['encoder_type'] == EncoderModelType.ROBERTA:
                    # TODO: xiaodl
                    out_proj = MaskLmHeader(
                        self.bert.embeddings.word_embeddings.weight)
                else:
                    out_proj = MaskLmHeader(
                        self.bert.embeddings.word_embeddings.weight)
            else:
                if decoder_opt == 1:
                    out_proj = SANClassifier(hidden_size,
                                             hidden_size,
                                             lab,
                                             opt,
                                             prefix='answer',
                                             dropout=dropout)
                else:
                    out_proj = nn.Linear(hidden_size, lab)
            self.scoring_list.append(out_proj)

        self.opt = opt
        self._my_init()
Example 17
    def __init__(self, configs):
        super(RobertaACSA, self).__init__()

        self.configs = configs
        self.roberta = RobertaModel.from_pretrained('pretrained/roberta.large', checkpoint_file='model.pt')

        self.linear_hidden = torch.nn.Linear(configs.ROBERTA_DIM, configs.LINEAR_HIDDEN_DIM)
        self.linear_output = torch.nn.Linear(configs.LINEAR_HIDDEN_DIM, 3)

        self.dropout_output = torch.nn.Dropout(0.1)
Example 18
    def add_args(parser):
        RobertaModel.add_args(parser)

        # add args for Linformer
        parser.add_argument('--compressed',
                            type=int,
                            help='compressed ratio of sequence length')
        parser.add_argument(
            '--shared-kv-compressed',
            type=int,
            help='share compressed matrix between k and v, in each layer')
        parser.add_argument(
            '--shared-layer-kv-compressed',
            type=int,
            help='share compressed matrix between k and v and across all layers'
        )
        parser.add_argument('--freeze-compress',
                            type=int,
                            help='freeze the parameters in compressed layer')
Example 19
 def __init__(self):
     self.model = RobertaModel.from_pretrained(
         "/data/models/icebert-base-36k",
         checkpoint_file="model.pt",
         bpe="gpt2",
         gpt2_encoder_json="/data/models/icebert-base-36k/icebert-bpe-vocab.json",
         gpt2_vocab_bpe="/data/models/icebert-base-36k/icebert-bpe-merges.txt",
     )
     self.model.to("cpu")
     self.model.eval()
Example 20
def load_roberta(name=None, roberta_cache_path=None, roberta_use_gpu=False):
    if not roberta_cache_path:
        # Load the Roberta Model from torch hub
        roberta = torch.hub.load('pytorch/fairseq', name)
    else:
        roberta = RobertaModel.from_pretrained(roberta_cache_path,
                                               checkpoint_file='model.pt')
    roberta.eval()
    if roberta_use_gpu:
        roberta.cuda()
    return roberta
Example 21
def predict():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="")
    parser.add_argument("--task",
                        default=None,
                        type=str,
                        required=True,
                        help="")
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="")
    args = parser.parse_args()
    # print(args)

    roberta = RobertaModel.from_pretrained(
        args.output_dir,
        # './outputs/RTE/7/',
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path=args.data_dir)

    label_fn = lambda label: roberta.task.label_dictionary.string(
        [label + roberta.task.target_dictionary.nspecial])
    # print(label_fn)
    ncorrect, nsamples = 0, 0
    roberta.cuda()
    roberta.eval()
    with open('../data-superglue-csv/' + args.task + '/val.tsv') as fin:
        fin.readline()
        logits = np.array([])
        num_classes = 2
        for index, line in enumerate(fin):
            tokens = line.strip().split('\t')
            sent1, sent2 = tokens[0], tokens[1]
            tokens = roberta.encode(sent1, sent2)
            logit = roberta.predict('sentence_classification_head',
                                    tokens).detach().cpu().numpy().flatten()  # per-class log-probs
            logits = np.append(logits, logit)
        print(logit)
        logits = logits.reshape((-1, num_classes))
        preds = np.argmax(logits, -1)

    print(preds)

    with open(args.output_dir + 'eval_results2', "w") as writer:
        # print(label_list)
        for i in range(len(preds)):
            # json_i= "\"idx: %d, \"label\": \"label_i\""
            writer.write("{\"idx\": %d, \"label\": \"%s\"}\n" % (i, preds[i]))
Example 22
 def __init__(self, model_dir, model_name, device):
     self.model = RobertaModel.from_pretrained(model_dir, checkpoint_file=model_name)
     self.model.to(device=device)
     self.device = device
     self.bpe = self.model.bpe
     self.task = self.model.task
     self.max_sentence_length = 256
     self.cosine_similarity = torch.nn.CosineSimilarity(dim=0)
     self.mask = "<mask>" 
     self.start_sentence = "<s>"
     self.period = '.'
Example 23
    def load_roberta(name=None, roberta_cache_path=None):
        if not roberta_cache_path:
            roberta = torch.hub.load('pytorch/fairseq', name)
        else:
            roberta = RobertaModel.from_pretrained(roberta_cache_path,
                                                   checkpoint_file='model.pt')

        roberta.eval()
        if torch.cuda.is_available():
            roberta.cuda()
        return roberta
Example 24
def mnli_dev(ckpdir, ckpname, datadir=None):
    task = 'MNLI'
    if datadir is None:
        datadir = 'data/{}-bin/'.format(task)
    roberta = RobertaModel.from_pretrained(ckpdir, ckpname, datadir)
    roberta.cuda()
    roberta.eval()
    label_fn = lambda label: roberta.task.label_dictionary.string(
        [label + roberta.task.target_dictionary.nspecial])
    str2label = lambda str: roberta.task.label_dictionary.encode_line(str)[
        0].item() - roberta.task.target_dictionary.nspecial

    tasks = ["MNLI-m", "MNLI-mm"]
    testfiles = [
        os.path.join(datadir, '../glue_data/MNLI/dev_matched.tsv'),
        os.path.join(datadir, '../glue_data/MNLI/dev_mismatched.tsv')
    ]

    for task, testfile in zip(tasks, testfiles):
        tv_loss = 0
        tv_low = 0
        tv_high = 0
        accuracy = 0
        print("Task: {}".format(task))
        with open(testfile) as fin:
            fin.readline()
            pbar = tqdm(enumerate(fin))
            for index, line in pbar:
                tokens = line.strip().split('\t')
                input_token = roberta.encode(tokens[8], tokens[9])
                log_softmax_out = roberta.predict(
                    'sentence_classification_head', input_token)
                prediction = 2 - log_softmax_out.argmax().item()

                labels = np.array(
                    [str2label(t.lower()) for t in tokens[10:15]])
                labels = np.array([sum(labels == l) for l in range(3)]) / 5
                assert sum(labels) == 1
                tv_loss += sum(
                    abs(l1 - math.exp(l2)) for l1, l2 in zip(
                        labels,
                        log_softmax_out.detach().cpu().numpy()[0]))
                tv_high += 2 - max(labels)
                tv_low += sum(abs(labels - 1 / 3))
                accuracy += prediction == labels.argmax()
                pbar.set_description(
                    "tv: {:.4f}/ {:.4f}-{:.4f}, accu: {:.4f} ".format(
                        tv_loss / (index + 1), tv_low / (index + 1),
                        tv_high / (index + 1), accuracy / (index + 1)))
Example 25
    def load_model(self, model_name):
        full_model_name = 'models/' + model_name + '.pt'
        if not os.path.exists(full_model_name):
            print(
                f"{model_name} model not found on models/ directory. Downloading from torch.hub ...."
            )
            pretrained = torch.hub.load('pytorch/fairseq', model_name)
            torch.save(pretrained.model, full_model_name)

        pretrained = RobertaModel.from_pretrained(model_name)
        pretrained.eval()
        self.model = pretrained
Example 26
    def fit(self, sentences):
        if self.model is None:
            from fairseq.models.roberta import RobertaModel
            from fairseq.data.encoders.fastbpe import fastBPE

            self.model = RobertaModel.from_pretrained(
                'PhoBERT_base_fairseq', checkpoint_file='model.pt')
            self.model.eval()

            args = BPE()
            self.model.bpe = fastBPE(args)
        return self
Example 27
 def __init__(self, pretrain="auxiliary_data/PhoBERT_base_fairseq"):
     self.phoBERT = RobertaModel.from_pretrained(pretrain,
                                                 checkpoint_file='model.pt')
     self.phoBERT.eval()
     parser = options.get_preprocessing_parser()
     parser.add_argument('--bpe-codes',
                         type=str,
                         help='path to fastBPE BPE',
                         default=pretrain + "/bpe.codes")
     args, unknown = parser.parse_known_args()
     self.phoBERT.bpe = fastBPE(
         args)  #Incorporate the BPE encoder into PhoBERT
Example 28
 def __init__(self, args):
     super().__init__()
     roberta_model_dir = args.roberta_model_dir
     roberta_model_name = args.roberta_model_name
     roberta_vocab_name = args.roberta_vocab_name
     self.dict_file = "{}/{}".format(roberta_model_dir, roberta_vocab_name)
     self.model = RobertaModel.from_pretrained(
         roberta_model_dir, checkpoint_file=roberta_model_name)
     self.bpe = self.model.bpe
     self.task = self.model.task
     self._build_vocab()
     self._init_inverse_vocab()
Example 29
    def load_bert(self, bertpath):
        if not bertpath:
            return None

        print("LOADING BERT....")

        roberta = RobertaModel.from_pretrained(
            bertpath, checkpoint_file='bert_kmeans.pt')
        roberta = roberta.eval()
        if torch.cuda.is_available():
            print('moving ROBERTA to CUDA')
            roberta.cuda()
        return roberta
Example 30
def get_clsemb(model, dataset):
    ckp = sys.argv[3]
    datapath = sys.argv[4]
    p = sys.argv[5]
    head_name = sys.argv[6]
    roberta = RobertaModel.from_pretrained(
        ckp, checkpoint_file='checkpoint_best.pt', data_name_or_path=datapath)
    roberta.cuda()
    roberta.eval()
    label_fn = lambda label: roberta.task.label_dictionary.string(
        [label + roberta.task.label_dictionary.nspecial])
    #test_examples, test_ys = get_test_examples('ruletaker/test.input0.bpe', 'rawrule/d0/test.label')

    #for i in tqdm(batches):
    #    xs = test_examples[i:i+bs]
    #    ys = test_ys[i:i+bs]
    #    xbatch = collate_tokens([roberta.encode(test_examples[j]) for j in range(i, min(len(test_ys), i+bs))], pad_idx=1)
    #    pred = label_fn(roberta.predict('ruletaker_head', xbatch)).argmax(dim=1).cpu().data.numpy()
    #    correct_cnt += np.sum(np.array(ys) == np.array(pred))
    for split in ['train', 'dev', 'test']:
        examples, ys = get_examples(os.path.join(p, '%s.input0' % split),
                                    os.path.join(p, '%s.label' % split))
        print("loaded data from", os.path.join(p, '%s.input0' % split), "and",
              os.path.join(p, '%s.label' % split))
        correct_cnt = 0
        #bs = 8
        #batches = range(0, len(test_examples), bs)
        out = torch.zeros((len(ys), 25, 1024))
        preds = []
        with torch.no_grad():
            for i in tqdm(range(len(examples)), desc=split):
                x = examples[i]
                y = ys[i]
                tokens = roberta.encode(x)
                pred = label_fn(
                    roberta.predict(head_name, tokens).argmax().item())
                preds.append(pred)
                if pred == y: correct_cnt += 1
                features = roberta.extract_features(tokens,
                                                    return_all_hiddens=True)
                cls_embs = torch.zeros((len(features), 1024))
                for k in range(len(features)):
                    cls_embs[k] = features[k][0, 0, :]
                out[i] = cls_embs.cpu()
        print("acc is", correct_cnt / len(ys))
        torch.save(out,
                   'out/{0}_{1}_{2}_embs.pt'.format(model, dataset, split))
        torch.save(preds,
                   'out/{0}_{1}_{2}_preds.pt'.format(model, dataset, split))
        torch.save([int(xx) for xx in ys],
                   'out/{0}_{1}_{2}_labels.pt'.format(model, dataset, split))