def setup_class(self):
     self.processor = Sst2Processor()
     # Note: these tests do not automatically download the test datasets. Please download them
     # manually and update your environment variables accordingly
     self.examples = self.processor.get_train_examples(
         os.environ["SST2_PATH"])
     self.test_dir = Path(tempfile.mkdtemp())
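The snippet above expects an SST2_PATH environment variable pointing at a manually downloaded copy of the GLUE SST-2 data. A minimal sketch of that setup, assuming a hypothetical /data/glue/SST-2 directory (containing train.tsv and dev.tsv) and the transformers glue processor used throughout these snippets:

import os
from transformers.data.processors.glue import Sst2Processor

# Hypothetical location of the manually extracted GLUE SST-2 folder.
os.environ["SST2_PATH"] = "/data/glue/SST-2"

processor = Sst2Processor()
examples = processor.get_train_examples(os.environ["SST2_PATH"])
print(len(examples), examples[0].text_a, examples[0].label)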
 def setup_class(self):
     self.processor = Sst2Processor()
     self.test_dir = Path(tempfile.mkdtemp())
     sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
     contents = requests.get(sst2_url)
     (self.test_dir / 'SST-2.zip').open('wb').write(contents.content)
     with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
         zipObj.extractall(self.test_dir)
     self.examples = self.processor.get_train_examples(self.test_dir / 'SST-2')
     sentence_piece_url = 'https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model'
     contents = requests.get(sentence_piece_url)
     (self.test_dir / 'spiece.model').open('wb').write(contents.content)
 def setup_class(self):
     self.processor = Sst2Processor()
     self.test_dir = Path(tempfile.mkdtemp())
     sst2_url = 'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip'
     contents = requests.get(sst2_url)
     (self.test_dir / 'SST-2.zip').open('wb').write(contents.content)
     with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
         zipObj.extractall(self.test_dir)
     self.examples = self.processor.get_train_examples(self.test_dir /
                                                       'SST-2')
     sentence_piece_url = 'https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model'
     contents = requests.get(sentence_piece_url)
     (self.test_dir / 'spiece.model').open('wb').write(contents.content)
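The two snippets above download the xlnet-base-cased SentencePiece model but never load it. Following the pattern the Albert example at the end of this page uses, the file could be passed to a tokenizer directly. A hedged sketch of lines that could be appended inside setup_class (requires from transformers import XLNetTokenizer at module level; the assumption is that XLNetTokenizer.from_pretrained accepts a local spiece.model path, as AlbertTokenizer.from_pretrained does below):

     # Assumption: a single SentencePiece file path works with from_pretrained,
     # mirroring the AlbertTokenizer call in the last example on this page.
     self.base_tokenizer = XLNetTokenizer.from_pretrained(
         str(self.test_dir / 'spiece.model'))
     self.tokens = self.base_tokenizer.tokenize(self.examples[0].text_a)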
Example #4
def get_sst2_data(tokenizer, data_dir, mode='train'):
    # Load SST-2 examples for the given split and turn them into
    # ([CLS]-prefixed token lists, segment ids, labels).
    p = Sst2Processor()
    if mode == 'train':
        data = p.get_train_examples(data_dir=os.path.join(data_dir, 'sst-2'))
    elif mode == 'dev':
        data = p.get_dev_examples(data_dir=os.path.join(data_dir, 'sst-2'))
    elif mode == 'test':
        data = p.get_test_examples(data_dir=os.path.join(data_dir, 'sst-2'))
    else:
        raise ValueError("mode must be one of 'train', 'dev' or 'test'")
    input_sent, type_id, label = [], [], []
    for line in data:
        sent = ['[CLS]'] + tokenizer.tokenize(line.text_a)
        input_sent.append(sent)
        type_id.append([0] * len(sent))  # single-sentence task: all segment ids are 0
        label.append(line.label)
    return input_sent, type_id, label
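A minimal usage sketch for get_sst2_data, assuming the Hugging Face BertTokenizer and a hypothetical data_dir that contains an extracted 'sst-2' folder:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# 'sst-2' must exist under data_dir (hypothetical path below).
sentences, segment_ids, labels = get_sst2_data(tokenizer, data_dir='/data/glue', mode='dev')

# Tokens are returned as strings; convert them to ids before padding/batching.
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in sentences]
print(sentences[0][:8], labels[0])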
 def setup_class(self):
     self.processor = Sst2Processor()
     self.test_dir = Path(tempfile.mkdtemp())
     sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
     contents = requests.get(sst2_url)
     (self.test_dir / 'SST-2.zip').open('wb').write(contents.content)
     with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
         zipObj.extractall(self.test_dir)
     self.examples = self.processor.get_train_examples(self.test_dir /
                                                       'SST-2')
     self.base_tokenizer = BertTokenizer.from_pretrained(
         'bert-base-uncased', do_lower_case=True, cache_dir=self.test_dir)
     self.rust_tokenizer = PyBertTokenizer(get_from_cache(
         self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
         ['bert-base-uncased']),
                                           do_lower_case=True)
Example #6
 def setup_class(self):
     self.processor = Sst2Processor()
     self.test_dir = Path(tempfile.mkdtemp())
     sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
     contents = requests.get(sst2_url)
     (self.test_dir / 'SST-2.zip').open('wb').write(contents.content)
     with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
         zipObj.extractall(self.test_dir)
     self.examples = self.processor.get_train_examples(self.test_dir /
                                                       'SST-2')
     sentence_piece_url = 'https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model'
     contents = requests.get(sentence_piece_url)
     (self.test_dir / 'albert-base-v2-spiece.model').open('wb').write(
         contents.content)
     self.base_tokenizer = AlbertTokenizer.from_pretrained(
         str(self.test_dir / 'albert-base-v2-spiece.model'))
     self.rust_tokenizer = PyAlbertTokenizer(str(
         self.test_dir / 'albert-base-v2-spiece.model'),
                                             do_lower_case=True,
                                             keep_accents=False)
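None of the setup_class snippets above remove the temporary directory they create. A minimal teardown sketch, assuming the same self.test_dir attribute that tempfile.mkdtemp() sets in each snippet:

import shutil

def teardown_class(self):
    # Delete the temporary directory holding the downloaded SST-2 data and
    # tokenizer files created in setup_class.
    shutil.rmtree(self.test_dir, ignore_errors=True)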