Example #1
    def _test_word_feature(
        self,
        feature_summarizer: FeatureCollator,
        window_function: Type[WindowFunction] = BagWindowFunction,
        feature_extractor_obj: Type[FeatureExtractor] = WordFeatureExtractor,
        context_window: int = 2,
        threshold: Optional[float] = None,
    ):
        dataset = WindowFunctionTest.create_fake_data()
        dataset_reader = BIODatasetReader(dataset)
        instances = dataset_reader.read('fake.txt')
        vocab = Vocabulary.from_instances(instances)
        feature_extractor = feature_extractor_obj(vocab=vocab)
        batch_func = window_function(
            positive_label='Tag',
            context_window=context_window,
            feature_extractor=feature_extractor,
            feature_summarizer=feature_summarizer,
            use_batch=True,
            threshold=threshold,
        )

        sparse_batch_func = None
        if window_function == BagWindowFunction:
            sparse_batch_func = window_function(
                positive_label='Tag',
                context_window=context_window,
                feature_extractor=feature_extractor,
                feature_summarizer=feature_summarizer,
                use_batch=True,
                use_sparse=True,
                threshold=threshold,
            )

        single_func = window_function(
            positive_label='Tag',
            context_window=context_window,
            feature_extractor=feature_extractor,
            feature_summarizer=feature_summarizer,
            use_batch=False,
            threshold=threshold,
        )

        batch_func.train(dataset.data)
        single_func.train(dataset.data)
        if sparse_batch_func:
            sparse_batch_func.train(dataset.data)

        assert batch_func.dictionary.shape[0] == batch_func.labels.shape[0]

        batch_eval = batch_func.evaluate(dataset)
        single_eval = single_func.evaluate(dataset)
        if sparse_batch_func:
            sparse_eval = sparse_batch_func.evaluate(dataset)
            assert batch_eval == sparse_eval

        assert batch_eval == single_eval

        return batch_eval
Example #2
def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    train_reader = BIODatasetReader(
        ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class),
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )

    train_dataset = train_reader.read('tmp.txt')
    valid_dataset = valid_reader.read('tmp.txt')

    cuda_device = -1
    if device == 'cuda':
        cuda_device = 0
        model = model.cuda(cuda_device)

    # NOTE: `optimizer_type` is accepted but not consulted here; SGD is constructed unconditionally.
    optimizer = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )

    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )

    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        validation_metric='f1-measure-overall',
    )
    metrics = trainer.train()

    return model, metrics
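Usage note (added, not from the source): a minimal sketch of how this train helper might be called. build_tagger and the literal argument values are hypothetical placeholders; only the keyword names come from the signature above.

# Hypothetical usage sketch -- build_tagger and the values below are placeholders.
model = build_tagger(vocab=vocab)  # any AllenNLP Model reporting 'f1-measure-overall'
model, metrics = train(
    model=model,
    binary_class='Tag',
    train_data=train_data,
    valid_reader=valid_reader,
    vocab=vocab,
    optimizer_type='sgd',            # the body above hardcodes SGD regardless of this value
    optimizer_learning_rate=0.01,
    optimizer_weight_decay=1e-4,
    batch_size=32,
    patience=5,
    num_epochs=30,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
print(metrics.get('best_validation_f1-measure-overall'))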
Example #3
    def test_glove_feature_extractor(self):
        if not GLOVE_ENABLED:
            return
        dataset = FeatureExtractorTest.create_fake_data()
        dataset_reader = BIODatasetReader(dataset)
        instances = dataset_reader.read('fake.txt')
        vocab = Vocabulary.from_instances(instances)
        feature_extractor = GloVeFeatureExtractor()
        for entry in dataset.data:
            sentence = entry['input']
            feats = feature_extractor.get_features(sentence_id=None, dataset_id=None, sentence=sentence)
            for word, feat in zip(sentence, feats):
                assert feat.shape == (1, 300)
Example #4
    def test_word_feature_extractor(self):
        dataset = FeatureExtractorTest.create_fake_data()
        dataset_reader = BIODatasetReader(dataset)
        instances = dataset_reader.read('fake.txt')
        vocab = Vocabulary.from_instances(instances)
        feature_extractor = WordFeatureExtractor(vocab=vocab)
        for entry in dataset.data:
            sentence = entry['input']
            feats = feature_extractor.get_features(sentence_id=None, dataset_id=None, sentence=sentence)
            for word, feat in zip(sentence, feats):
                assert feat.shape == (1, vocab.get_vocab_size())
                word_i = vocab.get_token_index(word)
                assert feat.sum() == 1
                assert feat.argmax() == word_i
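For reference (added, not from the source): the one-hot behaviour these assertions check can be reproduced in a few lines of numpy; this is an illustration only, not the WordFeatureExtractor implementation.

# Illustration only: a one-hot vector with the same properties the test asserts.
import numpy as np

def one_hot(word_index: int, vocab_size: int) -> np.ndarray:
    feat = np.zeros((1, vocab_size))
    feat[0, word_index] = 1.0
    return feat

feat = one_hot(word_index=3, vocab_size=10)
assert feat.shape == (1, 10)
assert feat.sum() == 1
assert feat.argmax() == 3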
Example #5
    def test_cache_structure(self):
        bio_dataset = CachecTextFieldEmbedderTest.create_fake_data()
        reader = BIODatasetReader(bio_dataset=bio_dataset, )

        instances = reader.read('fake_file.txt')
        vocab = Vocabulary.from_instances(instances)
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=CachecTextFieldEmbedderTest.EMBEDDING_DIM,
        )
        word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
        cached_embedder = CachedTextFieldEmbedder(
            text_field_embedder=word_embeddings, )

        cached_embedder.cache(
            dataset_id=bio_dataset.dataset_id,
            dataset=instances,
            vocab=vocab,
        )

        def get_num_words(instances: Iterator[Instance]) -> int:
            num_words: int = 0
            for inst in instances:
                num_words += len(inst['sentence'])
            return num_words

        num_words: int = get_num_words(instances)

        # only one dataset is cached
        assert len(cached_embedder.cached_datasets) == 1
        # make sure the dataset id is cached
        assert 0 in cached_embedder.cached_datasets
        # make sure every entry is cached
        cd = cached_embedder.cached_datasets[0]
        assert len(cd.embedded_dataset) == num_words
        assert cd.embedded_dataset.shape == (
            num_words, CachecTextFieldEmbedderTest.EMBEDDING_DIM)

        for inst in instances:
            s_id = inst['entry_id'].as_tensor(None).item()
            sent = inst['sentence']
            assert s_id in cd.sid_to_start
            et = cd.get_embedding(s_id)
            assert et.shape == (len(sent),
                                CachecTextFieldEmbedderTest.EMBEDDING_DIM)
Example #6
    def test_spacy_extractor(self):
        dataset = FeatureExtractorTest.create_fake_data()
        dataset_reader = BIODatasetReader(dataset)
        instances = dataset_reader.read('fake.txt')
        vocab = Vocabulary.from_instances(instances)
        feature_extractor = SpaCyFeatureExtractor()
        feature_extractor.cache(
            dataset_id=0,
            dataset=instances,
            vocab=vocab,
        )

        features = feature_extractor.get_features(dataset_id=0, sentence_id=1)
        computed_features = SPACY_NLP(dataset.data[1]['input'])
        assert len(features) == len(dataset.data[1]['input'])
        assert len(features) == len(computed_features)
        for i, (f, c) in enumerate(zip(features, computed_features)):
            assert FeatureExtractorTest.features_eq(f, c)
Example #7
    def test_cache_forward(self, batch_size: int = 1):
        bio_dataset = CachecTextFieldEmbedderTest.create_fake_data()
        reader = BIODatasetReader(bio_dataset=bio_dataset, )

        instances = reader.read('fake_file.txt')
        vocab = Vocabulary.from_instances(instances)
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=CachecTextFieldEmbedderTest.EMBEDDING_DIM,
        )
        word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
        cached_embedder = CachedTextFieldEmbedder(
            text_field_embedder=word_embeddings, )

        cached_embedder.cache(
            dataset_id=bio_dataset.dataset_id,
            dataset=instances,
            vocab=vocab,
        )

        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("sentence", "num_tokens")])
        iterator.index_with(vocab)
        train_generator = iterator(
            instances,
            num_epochs=CachecTextFieldEmbedderTest.NUM_EPOCHS,
            shuffle=False)
        for inst in train_generator:
            cached_result = cached_embedder.forward(
                sentence=inst['sentence'],
                sentence_ids=inst['entry_id'],
                dataset_ids=inst['dataset_id'],
                use_cache=True,
            )

            non_cached_result = cached_embedder.forward(
                sentence=inst['sentence'],
                sentence_ids=inst['entry_id'],
                dataset_ids=inst['dataset_id'],
                use_cache=False,
            )

            assert cached_result.shape == non_cached_result.shape
Example #8
    @classmethod
    def build_cache_cwr(cls):
        bio_dataset = cls.create_fake_data()
        reader = BIODatasetReader(bio_dataset=bio_dataset, )

        instances = reader.read('fake_file.txt')
        vocab = Vocabulary.from_instances(instances)
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=cls.EMBEDDING_DIM,
        )
        word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
        cached_embedder = CachedTextFieldEmbedder(
            text_field_embedder=word_embeddings, )

        cached_embedder.cache(
            dataset_id=bio_dataset.dataset_id,
            dataset=instances,
            vocab=vocab,
        )

        return cached_embedder
Example #9
    def test_pos_feature_extractor(self):
        dataset = FeatureExtractorTest.create_fake_data()
        dataset_reader = BIODatasetReader(dataset)
        instances = dataset_reader.read('fake.txt')
        vocab = Vocabulary.from_instances(instances)
        spacy_features = SpaCyFeatureExtractor()
        spacy_features.cache(
            dataset_id=0,
            dataset=instances,
            vocab=vocab,
        )
        feature_extractor = POSFeatureExtractor(spacy_module=spacy_features)
        for entry in dataset.data:
            sentence = entry['input']
            feats = feature_extractor.get_features(
                sentence_id=None,
                dataset_id=None,
                sentence=sentence,
            )
            assert len(feats) == len(sentence)
            for word, feat in zip(sentence, feats):
                assert feat.shape == (1, len(SPACY_POS))
Example #10
    def test_multiclass(self):
        reader = BIODatasetReader(
            bio_dataset=BIODatasetTest.create_fake_data(), )

        instances = reader.read('fake_file.txt')

        assert type(instances) == list

        expected_labels = [['B-Tag'], ['B-Tag', 'I-Tag']]

        fields = instances[0].fields
        tokens = [t.text for t in fields['sentence'].tokens]
        assert tokens == ['single']
        assert fields['labels'].labels == expected_labels[0]
        assert fields['weight'] == 1.0
        assert fields['entry_id'] == 0

        fields = instances[1].fields
        tokens = [t.text for t in fields['sentence'].tokens]
        assert tokens == ['single', 'double']
        assert fields['labels'].labels == expected_labels[1]
        assert fields['weight'] == 1.0
        assert fields['entry_id'] == 1
Example #11
    @classmethod
    def setup_embedder(cls, cache: bool = True) -> CachedTextFieldEmbedder:
        token_embedder, token_indexer = CWRFuncTest.get_embedder_info()
        train_bio = CWRFuncTest.create_fake_data('Tag')
        train_reader = BIODatasetReader(
            bio_dataset=train_bio,
            token_indexers={
                'tokens': token_indexer,
            },
        )

        train_data = train_reader.read('temp.txt')
        vocab = Vocabulary.from_instances(train_data)
        text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": token_embedder})
        cached_embedder = CachedTextFieldEmbedder(
            text_field_embedder=text_field_embedder, )

        cached_embedder.cache(
            dataset_id=train_bio.dataset_id,
            dataset=train_data,
            vocab=vocab,
        )

        return cached_embedder
Example #12
def construct_vocab(datasets: List[BIODataset]) -> Vocabulary:
    readers = [BIODatasetReader(
        bio_dataset=bio_dataset,
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
            'single_tokens': SingleIdTokenIndexer(),  # included so later pipelines can build one-hot features
        },
    ) for bio_dataset in datasets]

    allennlp_datasets = [r.read('tmp.txt') for r in readers]

    result = allennlp_datasets[0]
    for i in range(1, len(allennlp_datasets)):
        result += allennlp_datasets[i]

    vocab = Vocabulary.from_instances(result)

    return vocab
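Usage sketch (added, not from the source): construct_vocab is meant to be fed parsed BIODataset objects, mirroring the setup used in the later examples; the file paths here are placeholders.

# Sketch only -- file paths are placeholders.
train_bio = BIODataset(dataset_id=0, file_name='data/train.bio')
train_bio.parse_file()
valid_bio = BIODataset(dataset_id=1, file_name='data/valid.bio')
valid_bio.parse_file()
vocab = construct_vocab([train_bio, valid_bio])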
Example #13
def main():
    args = get_args().parse_args()
    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)

    train_bio = BIODataset(
        dataset_id=0,
        file_name=train_file,
    )
    train_bio.parse_file()

    train_reader = BIODatasetReader(bio_dataset=train_bio, )

    train_data: Iterator[Instance] = train_reader.read('temp.txt')

    valid_bio = BIODataset(
        dataset_id=1,
        file_name=valid_file,
    )
    valid_bio.parse_file()

    valid_reader = BIODatasetReader(bio_dataset=valid_bio, )

    valid_data: Iterator[Instance] = valid_reader.read('temp.txt')
    vocab = Vocabulary.from_instances(train_data + valid_data)

    feature_extractor = SpaCyFeatureExtractor()
    feature_extractor.cache(
        dataset_id=0,
        dataset=train_data,
        vocab=vocab,
    )
    feature_extractor.cache(
        dataset_id=1,
        dataset=valid_data,
        vocab=vocab,
    )

    save_file_name = get_save_file(
        feature_extractor_type=args.feature_extractor,
        dataset_type=args.dataset)

    save_file = PickleSaveFile(file_name=save_file_name)

    feature_extractor.save(save_file=save_file)
    save_file.close()
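Follow-up sketch (added): the pickled SpaCy features written above can later be restored with the setup/load pattern that appears in the final example; treat the exact call as an assumption to verify against the project.

# Reload sketch, mirroring the load pattern used in the active_train example below.
spacy_features = SpaCyFeatureExtractor.setup(dataset_ids=[0, 1])
spacy_features.load(save_file=PickleSaveFile(file_name=save_file_name))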
Example #14
def main():
    args = get_args().parse_args()
    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)
    token_embedder, token_indexer, text_field_embedder_kwargs = get_embedder_info(
        args.embedder)

    train_bio = BIODataset(
        dataset_id=0,
        file_name=train_file,
    )
    train_bio.parse_file()

    train_reader = BIODatasetReader(
        bio_dataset=train_bio,
        token_indexers={
            'tokens': token_indexer,
        },
    )

    train_data = train_reader.read('temp.txt')

    valid_bio = BIODataset(
        dataset_id=1,
        file_name=valid_file,
    )
    valid_bio.parse_file()

    valid_reader = BIODatasetReader(
        bio_dataset=valid_bio,
        token_indexers={
            'tokens': token_indexer,
        },
    )

    valid_data = valid_reader.read('temp.txt')

    vocab = Vocabulary.from_instances(train_data + valid_data)
    embedder = BasicTextFieldEmbedder({"tokens": token_embedder},
                                      **text_field_embedder_kwargs)
    cached_embedder = CachedTextFieldEmbedder(text_field_embedder=embedder, )

    if args.cuda:
        cuda_device = 0
        cached_embedder = cached_embedder.cuda(cuda_device)
    else:
        cuda_device = -1

    cached_embedder.cache(
        dataset_id=train_bio.dataset_id,
        dataset=train_data,
        vocab=vocab,
        cuda_device=cuda_device,
    )

    cached_embedder.cache(
        dataset_id=valid_bio.dataset_id,
        dataset=valid_data,
        vocab=vocab,
        cuda_device=cuda_device,
    )

    save_file_name = get_save_file(embedder_type=args.embedder,
                                   dataset_type=args.dataset)

    save_file = H5SaveFile(file_name=save_file_name)

    cached_embedder.save(save_file=save_file)
    save_file.close()
Example #15
def active_train(
    model: Model,
    unlabeled_dataset: UnlabeledBIODataset,
    valid_dataset: BIODataset,
    vocab: Vocabulary,
    oracle: Oracle,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    use_weak: bool,
    weak_fine_tune: bool,
    weak_weight: float,
    weak_function: List[str],
    weak_collator: str,
    sample_strategy: str,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
    log_dir: str,
    model_name: str,
) -> Model:
    heuristic = ClusteringHeuristic(model.word_embeddings, unlabeled_dataset)  # alternative: RandomHeuristic()

    log_dir = os.path.join(log_dir, model_name)
    logger = Logger(logdir=log_dir)

    # keep track of all the ids that have been
    # labeled
    labeled_indexes: List[int] = []

    # the current training data that is being built up
    train_data: DatasetType = []

    valid_reader = BIODatasetReader(
        bio_dataset=valid_dataset,
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )

    cached_text_field_embedders: List[CachedTextFieldEmbedder] = get_all_embedders()
    spacy_feature_extractor: SpaCyFeatureExtractor = SpaCyFeatureExtractor.setup(dataset_ids=[0, 1])
    spacy_feature_extractor.load(save_file=PickleSaveFile(CADEC_SPACY))

    for i, sample_size in enumerate(ORACLE_SAMPLES):
        active_iteration_kwargs = dict(
            heuristic=heuristic,
            unlabeled_dataset=unlabeled_dataset,
            sample_size=sample_size,
            labeled_indexes=labeled_indexes,
            oracle=oracle,
            train_data=train_data,
            valid_reader=valid_reader,
            model=model,
            cached_text_field_embedders=cached_text_field_embedders,
            spacy_feature_extractor=spacy_feature_extractor,
            vocab=vocab,
            optimizer_type=optimizer_type,
            optimizer_learning_rate=optimizer_learning_rate,
            optimizer_weight_decay=optimizer_weight_decay,
            use_weak=use_weak,
            weak_weight=weak_weight,
            weak_function=weak_function,
            weak_collator=weak_collator,
            sample_strategy=sample_strategy,
            batch_size=batch_size,
            patience=patience,
            num_epochs=num_epochs,
            device=device,
        )

        if weak_fine_tune:
            model, metrics = active_train_fine_tune_iteration(**active_iteration_kwargs)
        else:
            model, metrics = active_train_iteration(**active_iteration_kwargs)

        log_train_metrics(logger, metrics, step=len(train_data))

        print(f'Finished experiment on training set size: {len(train_data)}')
    logger.flush()

    return model