Example #1
import configparser
from pathlib import Path

import torch
from transformers import LongformerConfig, LongformerModel, LongformerTokenizer

from conll_dataset import ConllDataset  # local module; import path is an assumption


class Evaluator:  # class name is hypothetical; only this __init__ appeared in the source
    def __init__(self, config_path):
        config = configparser.ConfigParser()
        config.read(config_path)

        self.save_dir = Path(config.get("general", "save_dir"))
        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.clf_th = config.getfloat("general", "clf_th")  # classifier decision threshold

        self.mlp_model_path = config.get("model", "mlp")  # trained MLP classifier checkpoint
        assert Path(self.mlp_model_path).exists()

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        bert_config_path = config.get("bert", "config_path")
        assert Path(bert_config_path).exists()
        self.bert_config = LongformerConfig.from_json_file(bert_config_path)
        self.max_seq_length = self.bert_config.max_position_embeddings - 2  # leave room for special tokens
        self.bert_tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        # Alternative: load the tokenizer from a local path instead:
        # bert_tokenizer_path = config.get("bert", "tokenizer_path")
        # assert Path(bert_tokenizer_path).exists()
        # self.bert_tokenizer = LongformerTokenizer.from_pretrained(bert_tokenizer_path)
        bert_model_path = config.get("bert", "model_path")
        assert Path(bert_model_path).exists()
        self.bert_model = LongformerModel.from_pretrained(
            bert_model_path, config=self.bert_config)
        self.bert_model.to(self.device)
        self.bert_model.eval()  # inference only; no gradient updates

        gold_dir = Path(config.get("data", "gold_dir"))
        assert gold_dir.exists()
        self.gold_dataset = ConllDataset(gold_dir)
        target_dir = Path(config.get("data", "target_dir"))
        assert target_dir.exists()
        self.target_dataset = ConllDataset(target_dir)
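For reference, the config file this constructor expects would look roughly like the sketch below. The section and option names come straight from the config.get calls above; the values are placeholders.

[general]
save_dir = ./output
clf_th = 0.5

[model]
mlp = ./models/mlp.pt

[bert]
config_path = ./longformer/config.json
model_path = ./longformer/

[data]
gold_dir = ./data/gold
target_dir = ./data/target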
Example #2
            '1',  # fits a 32 GB GPU at fp32
            '--gradient_accumulation_steps',
            '4',
            # '--evaluate_during_training',  # disabled to reduce training time
            '--do_train',
        ])
    # Full training data:
    # train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
    # val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'
    # Small sample files for testing:
    train_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
    val_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
    training_args.train_datapath = train_fn
    training_args.val_datapath = val_fn

    ##################### Use pretrained Longformer from the transformers library
    init_config = LongformerConfig.from_json_file(
        'config_files/longformer_base_4096/config.json')
    mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
    # Pretrained word-embedding matrix (one row per vocabulary entry).
    word_embeddings = np.loadtxt(
        join('/scratch/xl3119/capstone/wd_emb', "word_embedding_matrix.txt"))
    longformer_model = LongformerForMaskedLM(init_config)
    # Seed the freshly initialized model with the pretrained embedding matrix.
    longformer_model = use_embeddings_fasttext(longformer_model,
                                               word_embeddings)
    # longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

    logger.info('Train and eval with Longformer pretrained ...')
    pretrain_and_evaluate(training_args, longformer_model, mimic_tokenizer,
                          train_only=True, eval_only=False, model_path=None)
    # Pass model_path=training_args.output_dir instead if the model to train
    # was instantiated from a local path.
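The helpers use_embeddings_fasttext and pretrain_and_evaluate are called but not shown in the example. A minimal sketch of what the first might do, assuming the matrix rows align with the tokenizer vocabulary and match the model's hidden size (the actual implementation may differ):

import torch

def use_embeddings_fasttext(model, word_embeddings):
    # Hypothetical reconstruction: copy a pretrained (fastText-style)
    # word-embedding matrix into the model's input embedding layer.
    emb = torch.as_tensor(word_embeddings, dtype=torch.float32)
    assert emb.shape == model.get_input_embeddings().weight.shape
    with torch.no_grad():
        model.get_input_embeddings().weight.copy_(emb)
    return model

Note that for masked-LM models with tied input/output embeddings, this also initializes the LM head. Likewise, a rough sketch of pretrain_and_evaluate, loosely following the AllenAI Longformer pretraining recipe and assuming an older transformers API (TextDataset is deprecated in recent releases, and exact Trainer arguments vary by version):

import logging

from transformers import (DataCollatorForLanguageModeling, TextDataset,
                          Trainer)

logger = logging.getLogger(__name__)

def pretrain_and_evaluate(args, model, tokenizer, train_only, eval_only,
                          model_path):
    # Masked-LM datasets built from the raw text files named above.
    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=args.train_datapath,
                                block_size=4096)  # assumed Longformer window
    val_dataset = TextDataset(tokenizer=tokenizer,
                              file_path=args.val_datapath,
                              block_size=4096)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    trainer = Trainer(model=model, args=args, data_collator=data_collator,
                      train_dataset=train_dataset, eval_dataset=val_dataset)
    if not eval_only:
        trainer.train(model_path=model_path)  # resume_from_checkpoint in newer versions
    if not train_only:
        logger.info('Eval metrics: %s', trainer.evaluate())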