def _test_word_feature(
    self,
    feature_summarizer: FeatureCollator,
    window_function: Type[WindowFunction] = BagWindowFunction,
    feature_extractor_obj: Type[FeatureExtractor] = WordFeatureExtractor,
    context_window: int = 2,
    threshold: Optional[float] = None,
):
    dataset = WindowFunctionTest.create_fake_data()
    dataset_reader = BIODatasetReader(dataset)
    instances = dataset_reader.read('fake.txt')
    vocab = Vocabulary.from_instances(instances)
    feature_extractor = feature_extractor_obj(vocab=vocab)

    batch_func = window_function(
        positive_label='Tag',
        context_window=context_window,
        feature_extractor=feature_extractor,
        feature_summarizer=feature_summarizer,
        use_batch=True,
        threshold=threshold,
    )
    # The sparse variant is only supported by BagWindowFunction.
    sparse_batch_func = None
    if window_function == BagWindowFunction:
        sparse_batch_func = window_function(
            positive_label='Tag',
            context_window=context_window,
            feature_extractor=feature_extractor,
            feature_summarizer=feature_summarizer,
            use_batch=True,
            use_sparse=True,
            threshold=threshold,
        )
    single_func = window_function(
        positive_label='Tag',
        context_window=context_window,
        feature_extractor=feature_extractor,
        feature_summarizer=feature_summarizer,
        use_batch=False,
        threshold=threshold,
    )

    batch_func.train(dataset.data)
    single_func.train(dataset.data)
    if sparse_batch_func:
        sparse_batch_func.train(dataset.data)

    assert batch_func.dictionary.shape[0] == batch_func.labels.shape[0]

    # Batched, sparse, and per-sentence evaluation should all agree.
    batch_eval = batch_func.evaluate(dataset)
    single_eval = single_func.evaluate(dataset)
    if sparse_batch_func:
        sparse_eval = sparse_batch_func.evaluate(dataset)
        assert batch_eval == sparse_eval
    assert batch_eval == single_eval
    return batch_eval
def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    train_reader = BIODatasetReader(
        ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class),
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )
    train_dataset = train_reader.read('tmp.txt')
    valid_dataset = valid_reader.read('tmp.txt')

    if device == 'cuda':
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    # Note: optimizer_type is currently not consulted; SGD is always used.
    optimizer = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )

    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        validation_metric='f1-measure-overall',
    )
    metrics = trainer.train()
    return model, metrics
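# A minimal usage sketch for train(); the values below are hypothetical and it is
# assumed that `model`, `vocab`, `train_data`, and `valid_reader` have already been
# built elsewhere (e.g. a BIODatasetReader over the validation set):
#
#     model, metrics = train(
#         model=model,
#         binary_class='Tag',
#         train_data=train_data,
#         valid_reader=valid_reader,
#         vocab=vocab,
#         optimizer_type='sgd',
#         optimizer_learning_rate=0.01,
#         optimizer_weight_decay=1e-4,
#         batch_size=32,
#         patience=5,
#         num_epochs=10,
#         device='cpu',
#     )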
def test_glove_feature_extractor(self):
    if not GLOVE_ENABLED:
        return
    dataset = FeatureExtractorTest.create_fake_data()
    dataset_reader = BIODatasetReader(dataset)
    instances = dataset_reader.read('fake.txt')
    vocab = Vocabulary.from_instances(instances)
    feature_extractor = GloVeFeatureExtractor()
    for entry in dataset.data:
        sentence = entry['input']
        feats = feature_extractor.get_features(sentence_id=None, dataset_id=None, sentence=sentence)
        for word, feat in zip(sentence, feats):
            assert feat.shape == (1, 300)
def test_word_feature_extractor(self):
    dataset = FeatureExtractorTest.create_fake_data()
    dataset_reader = BIODatasetReader(dataset)
    instances = dataset_reader.read('fake.txt')
    vocab = Vocabulary.from_instances(instances)
    feature_extractor = WordFeatureExtractor(vocab=vocab)
    for entry in dataset.data:
        sentence = entry['input']
        feats = feature_extractor.get_features(sentence_id=None, dataset_id=None, sentence=sentence)
        for word, feat in zip(sentence, feats):
            # Each word feature should be a one-hot vector over the vocabulary,
            # with the single non-zero entry at the word's vocabulary index.
            assert feat.shape == (1, vocab.get_vocab_size())
            word_i = vocab.get_token_index(word)
            assert feat.sum() == 1
            assert feat.argmax() == word_i
def test_cache_structure(self):
    bio_dataset = CachecTextFieldEmbedderTest.create_fake_data()
    reader = BIODatasetReader(bio_dataset=bio_dataset)
    instances = reader.read('fake_file.txt')
    vocab = Vocabulary.from_instances(instances)
    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=CachecTextFieldEmbedderTest.EMBEDDING_DIM,
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    cached_embedder = CachedTextFieldEmbedder(
        text_field_embedder=word_embeddings,
    )
    cached_embedder.cache(
        dataset_id=bio_dataset.dataset_id,
        dataset=instances,
        vocab=vocab,
    )

    def get_num_words(instances: Iterator[Instance]) -> int:
        num_words: int = 0
        for inst in instances:
            num_words += len(inst['sentence'])
        return num_words

    num_words: int = get_num_words(instances)

    # only one dataset is cached
    assert len(cached_embedder.cached_datasets) == 1
    # make sure the dataset id is cached
    assert 0 in cached_embedder.cached_datasets
    # make sure every entry is cached
    cd = cached_embedder.cached_datasets[0]
    assert len(cd.embedded_dataset) == num_words
    assert cd.embedded_dataset.shape == (num_words, CachecTextFieldEmbedderTest.EMBEDDING_DIM)
    for inst in instances:
        s_id = inst['entry_id'].as_tensor(None).item()
        sent = inst['sentence']
        assert s_id in cd.sid_to_start
        et = cd.get_embedding(s_id)
        assert et.shape == (len(sent), CachecTextFieldEmbedderTest.EMBEDDING_DIM)
def test_spacy_extractor(self):
    dataset = FeatureExtractorTest.create_fake_data()
    dataset_reader = BIODatasetReader(dataset)
    instances = dataset_reader.read('fake.txt')
    vocab = Vocabulary.from_instances(instances)
    feature_extractor = SpaCyFeatureExtractor()
    feature_extractor.cache(
        dataset_id=0,
        dataset=instances,
        vocab=vocab,
    )
    features = feature_extractor.get_features(dataset_id=0, sentence_id=1)
    computed_features = SPACY_NLP(dataset.data[1]['input'])
    assert len(features) == len(dataset.data[1]['input'])
    assert len(features) == len(computed_features)
    for i, (f, c) in enumerate(zip(features, computed_features)):
        assert FeatureExtractorTest.features_eq(f, c)
def test_cache_forward(self, batch_size: int = 1):
    bio_dataset = CachecTextFieldEmbedderTest.create_fake_data()
    reader = BIODatasetReader(bio_dataset=bio_dataset)
    instances = reader.read('fake_file.txt')
    vocab = Vocabulary.from_instances(instances)
    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=CachecTextFieldEmbedderTest.EMBEDDING_DIM,
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    cached_embedder = CachedTextFieldEmbedder(
        text_field_embedder=word_embeddings,
    )
    cached_embedder.cache(
        dataset_id=bio_dataset.dataset_id,
        dataset=instances,
        vocab=vocab,
    )
    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )
    iterator.index_with(vocab)
    train_generator = iterator(
        instances,
        num_epochs=CachecTextFieldEmbedderTest.NUM_EPOCHS,
        shuffle=False,
    )
    for inst in train_generator:
        cached_result = cached_embedder.forward(
            sentence=inst['sentence'],
            sentence_ids=inst['entry_id'],
            dataset_ids=inst['dataset_id'],
            use_cache=True,
        )
        non_cached_result = cached_embedder.forward(
            sentence=inst['sentence'],
            sentence_ids=inst['entry_id'],
            dataset_ids=inst['dataset_id'],
            use_cache=False,
        )
        assert cached_result.shape == non_cached_result.shape
def build_cache_cwr(cls):
    bio_dataset = cls.create_fake_data()
    reader = BIODatasetReader(bio_dataset=bio_dataset)
    instances = reader.read('fake_file.txt')
    vocab = Vocabulary.from_instances(instances)
    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=cls.EMBEDDING_DIM,
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    cached_embedder = CachedTextFieldEmbedder(
        text_field_embedder=word_embeddings,
    )
    cached_embedder.cache(
        dataset_id=bio_dataset.dataset_id,
        dataset=instances,
        vocab=vocab,
    )
    return cached_embedder
def test_pos_feature_extractor(self):
    dataset = FeatureExtractorTest.create_fake_data()
    dataset_reader = BIODatasetReader(dataset)
    instances = dataset_reader.read('fake.txt')
    vocab = Vocabulary.from_instances(instances)
    spacy_features = SpaCyFeatureExtractor()
    spacy_features.cache(
        dataset_id=0,
        dataset=instances,
        vocab=vocab,
    )
    feature_extractor = POSFeatureExtractor(spacy_module=spacy_features)
    for entry in dataset.data:
        sentence = entry['input']
        feats = feature_extractor.get_features(
            sentence_id=None,
            dataset_id=None,
            sentence=sentence,
        )
        assert len(feats) == len(sentence)
        for word, feat in zip(sentence, feats):
            assert feat.shape == (1, len(SPACY_POS))
def test_multiclass(self):
    reader = BIODatasetReader(
        bio_dataset=BIODatasetTest.create_fake_data(),
    )
    instances = reader.read('fake_file.txt')
    assert type(instances) == list
    expected_labels = [['B-Tag'], ['B-Tag', 'I-Tag']]

    fields = instances[0].fields
    tokens = [t.text for t in fields['sentence'].tokens]
    assert tokens == ['single']
    assert fields['labels'].labels == expected_labels[0]
    assert fields['weight'] == 1.0
    assert fields['entry_id'] == 0

    fields = instances[1].fields
    tokens = [t.text for t in fields['sentence'].tokens]
    assert tokens == ['single', 'double']
    assert fields['labels'].labels == expected_labels[1]
    assert fields['weight'] == 1.0
    assert fields['entry_id'] == 1
def setup_embedder(cls, cache: bool = True) -> CachedTextFieldEmbedder:
    token_embedder, token_indexer = CWRFuncTest.get_embedder_info()
    train_bio = CWRFuncTest.create_fake_data('Tag')
    train_reader = BIODatasetReader(
        bio_dataset=train_bio,
        token_indexers={
            'tokens': token_indexer,
        },
    )
    train_data = train_reader.read('temp.txt')
    vocab = Vocabulary.from_instances(train_data)
    text_field_embedder = BasicTextFieldEmbedder({"tokens": token_embedder})
    cached_embedder = CachedTextFieldEmbedder(
        text_field_embedder=text_field_embedder,
    )
    cached_embedder.cache(
        dataset_id=train_bio.dataset_id,
        dataset=train_data,
        vocab=vocab,
    )
    return cached_embedder
def construct_vocab(datasets: List[BIODataset]) -> Vocabulary:
    readers = [
        BIODatasetReader(
            bio_dataset=bio_dataset,
            token_indexers={
                'tokens': ELMoTokenCharactersIndexer(),
                'single_tokens': SingleIdTokenIndexer(),  # including for future pipelines to use, one hot
            },
        )
        for bio_dataset in datasets
    ]
    allennlp_datasets = [r.read('tmp.txt') for r in readers]
    # Merge all instance lists before building a single shared vocabulary.
    result = allennlp_datasets[0]
    for i in range(1, len(allennlp_datasets)):
        result += allennlp_datasets[i]
    vocab = Vocabulary.from_instances(result)
    return vocab
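# Hypothetical usage sketch for construct_vocab (file paths are placeholders),
# mirroring the BIODataset setup used in the scripts below:
#
#     train_bio = BIODataset(dataset_id=0, file_name='train.txt')
#     train_bio.parse_file()
#     valid_bio = BIODataset(dataset_id=1, file_name='valid.txt')
#     valid_bio.parse_file()
#     vocab = construct_vocab([train_bio, valid_bio])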
def main():
    args = get_args().parse_args()
    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)

    train_bio = BIODataset(
        dataset_id=0,
        file_name=train_file,
    )
    train_bio.parse_file()
    train_reader = BIODatasetReader(bio_dataset=train_bio)
    train_data: Iterator[Instance] = train_reader.read('temp.txt')

    valid_bio = BIODataset(
        dataset_id=1,
        file_name=valid_file,
    )
    valid_bio.parse_file()
    valid_reader = BIODatasetReader(bio_dataset=valid_bio)
    valid_data: Iterator[Instance] = valid_reader.read('temp.txt')

    vocab = Vocabulary.from_instances(train_data + valid_data)

    if args.cuda:
        cuda_device = 0
    else:
        cuda_device = -1

    # Cache spaCy features for both the train and validation splits.
    feature_extractor = SpaCyFeatureExtractor()
    feature_extractor.cache(
        dataset_id=0,
        dataset=train_data,
        vocab=vocab,
    )
    feature_extractor.cache(
        dataset_id=1,
        dataset=valid_data,
        vocab=vocab,
    )

    save_file_name = get_save_file(
        feature_extractor_type=args.feature_extractor,
        dataset_type=args.dataset,
    )
    save_file = PickleSaveFile(file_name=save_file_name)
    feature_extractor.save(save_file=save_file)
    save_file.close()
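# Hedged sketch of how the saved features can be re-loaded in a later run,
# mirroring the load call used in active_train below (save_file_name is whatever
# get_save_file returned above):
#
#     spacy_feature_extractor = SpaCyFeatureExtractor.setup(dataset_ids=[0, 1])
#     spacy_feature_extractor.load(save_file=PickleSaveFile(save_file_name))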
def main():
    args = get_args().parse_args()
    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)
    token_embedder, token_indexer, text_field_embedder_kwargs = get_embedder_info(args.embedder)

    train_bio = BIODataset(
        dataset_id=0,
        file_name=train_file,
    )
    train_bio.parse_file()
    train_reader = BIODatasetReader(
        bio_dataset=train_bio,
        token_indexers={
            'tokens': token_indexer,
        },
    )
    train_data = train_reader.read('temp.txt')

    valid_bio = BIODataset(
        dataset_id=1,
        file_name=valid_file,
    )
    valid_bio.parse_file()
    valid_reader = BIODatasetReader(
        bio_dataset=valid_bio,
        token_indexers={
            'tokens': token_indexer,
        },
    )
    valid_data = valid_reader.read('temp.txt')

    vocab = Vocabulary.from_instances(train_data + valid_data)
    embedder = BasicTextFieldEmbedder({"tokens": token_embedder}, **text_field_embedder_kwargs)
    cached_embedder = CachedTextFieldEmbedder(text_field_embedder=embedder)

    if args.cuda:
        cuda_device = 0
        cached_embedder = cached_embedder.cuda(cuda_device)
    else:
        cuda_device = -1

    cached_embedder.cache(
        dataset_id=train_bio.dataset_id,
        dataset=train_data,
        vocab=vocab,
        cuda_device=cuda_device,
    )
    cached_embedder.cache(
        dataset_id=valid_bio.dataset_id,
        dataset=valid_data,
        vocab=vocab,
        cuda_device=cuda_device,
    )

    save_file_name = get_save_file(embedder_type=args.embedder, dataset_type=args.dataset)
    save_file = H5SaveFile(file_name=save_file_name)
    cached_embedder.save(save_file=save_file)
    save_file.close()
def active_train(
    model: Model,
    unlabeled_dataset: UnlabeledBIODataset,
    valid_dataset: BIODataset,
    vocab: Vocabulary,
    oracle: Oracle,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    use_weak: bool,
    weak_fine_tune: bool,
    weak_weight: float,
    weak_function: List[str],
    weak_collator: str,
    sample_strategy: str,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
    log_dir: str,
    model_name: str,
) -> Model:
    heuristic = ClusteringHeuristic(model.word_embeddings, unlabeled_dataset)  # alternatively: RandomHeuristic()
    log_dir = os.path.join(log_dir, model_name)
    logger = Logger(logdir=log_dir)

    # keep track of all the ids that have been labeled
    labeled_indexes: List[int] = []
    # the current training data that is being built up
    train_data: DatasetType = []

    valid_reader = BIODatasetReader(
        bio_dataset=valid_dataset,
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )

    cached_text_field_embedders: List[CachedTextFieldEmbedder] = get_all_embedders()
    spacy_feature_extractor: SpaCyFeatureExtractor = SpaCyFeatureExtractor.setup(dataset_ids=[0, 1])
    spacy_feature_extractor.load(save_file=PickleSaveFile(CADEC_SPACY))

    for i, sample_size in enumerate(ORACLE_SAMPLES):
        active_iteration_kwargs = dict(
            heuristic=heuristic,
            unlabeled_dataset=unlabeled_dataset,
            sample_size=sample_size,
            labeled_indexes=labeled_indexes,
            oracle=oracle,
            train_data=train_data,
            valid_reader=valid_reader,
            model=model,
            cached_text_field_embedders=cached_text_field_embedders,
            spacy_feature_extractor=spacy_feature_extractor,
            vocab=vocab,
            optimizer_type=optimizer_type,
            optimizer_learning_rate=optimizer_learning_rate,
            optimizer_weight_decay=optimizer_weight_decay,
            use_weak=use_weak,
            weak_weight=weak_weight,
            weak_function=weak_function,
            weak_collator=weak_collator,
            sample_strategy=sample_strategy,
            batch_size=batch_size,
            patience=patience,
            num_epochs=num_epochs,
            device=device,
        )
        if weak_fine_tune:
            model, metrics = active_train_fine_tune_iteration(**active_iteration_kwargs)
        else:
            model, metrics = active_train_iteration(**active_iteration_kwargs)

        log_train_metrics(logger, metrics, step=len(train_data))
        print(f'Finished experiment on training set size: {len(train_data)}')
        logger.flush()

    return model