def create_fake_data(cls, binary_class: Optional[str] = None) -> BIODataset:
    data = [
        cls.create_entry(['single'], ['B-Tag'], 0, 1.0),
        cls.create_entry(['single', 'double'], ['B-Tag', 'I-Tag'], 1, 1.0),
        cls.create_entry(['single', 'double', 'triple'], ['B-Tag', 'I-Tag', 'O'], 2, 1.0),
        cls.create_entry(['no_label'], ['O'], 3, 1.0),
    ]

    dataset = BIODataset(0, 'fake_file.txt', binary_class)
    # hack around reading a file
    dataset.data = data
    return dataset

def create_fake_data(cls, binary_class: Optional[str] = None) -> BIODataset:
    data = [
        create_entry(['this', 'is', 'an', 'reaction'], ['O', 'O', 'O', 'ADR'], 0),
        create_entry(['this', 'is', 'an', 'reaction', 'an', 'reaction'], ['O', 'O', 'O', 'O', 'O', 'ADR'], 1),
    ]

    dataset = BIODataset(0, 'fake_file.txt', binary_class)
    # hack around reading a file
    dataset.data = data
    return dataset

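
# A minimal sketch of the entry helper the fixtures above assume. The field
# names ('id', 'input', 'output', 'weight') are hypothetical stand-ins; the
# repo's real create_entry may use a different shape.
def create_entry_sketch(words, labels, entry_id, weight=1.0):
    assert len(words) == len(labels), 'tokens and BIO tags must align'
    return {
        'id': entry_id,    # position of the sentence in the dataset
        'input': words,    # token sequence
        'output': labels,  # BIO tag sequence, aligned with 'input'
        'weight': weight,  # per-entry loss weight (the first fixture passes 1.0)
    }
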
def main():
    args = get_args().parse_args()
    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)

    train_bio = BIODataset(
        dataset_id=0,
        file_name=train_file,
    )
    train_bio.parse_file()

    train_reader = BIODatasetReader(bio_dataset=train_bio)
    # the reader wraps bio_dataset directly, so the path argument is a dummy
    train_data: Iterator[Instance] = train_reader.read('temp.txt')

    valid_bio = BIODataset(
        dataset_id=1,
        file_name=valid_file,
    )
    valid_bio.parse_file()

    valid_reader = BIODatasetReader(bio_dataset=valid_bio)
    valid_data: Iterator[Instance] = valid_reader.read('temp.txt')

    vocab = Vocabulary.from_instances(train_data + valid_data)

    # note: unlike the embedding-caching script, there is no cached_embedder
    # here, so the leftover CUDA setup block (which referenced an undefined
    # cached_embedder) has been removed
    feature_extractor = SpaCyFeatureExtractor()
    feature_extractor.cache(
        dataset_id=0,
        dataset=train_data,
        vocab=vocab,
    )
    feature_extractor.cache(
        dataset_id=1,
        dataset=valid_data,
        vocab=vocab,
    )

    save_file_name = get_save_file(
        feature_extractor_type=args.feature_extractor,
        dataset_type=args.dataset,
    )
    save_file = PickleSaveFile(file_name=save_file_name)
    feature_extractor.save(save_file=save_file)
    save_file.close()

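
# For context, a minimal sketch of what a spaCy-based feature cache like the
# one above might do internally: run the pipeline over each pre-tokenized
# sentence and store per-token features keyed by (dataset_id, sentence index).
# The class name, storage layout, and feature choice are assumptions, not the
# repo's actual SpaCyFeatureExtractor.
import spacy
from spacy.tokens import Doc

class TinyFeatureCacheSketch:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
        self._cache = {}

    def cache(self, dataset_id, sentences):
        for i, words in enumerate(sentences):
            # build a Doc from the existing tokens so spaCy does not re-tokenize
            doc = Doc(self.nlp.vocab, words=words)
            for _name, component in self.nlp.pipeline:
                doc = component(doc)
            self._cache[(dataset_id, i)] = [(t.pos_, t.dep_) for t in doc]
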
def main():
    args = get_args().parse_args()
    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)

    token_embedder, token_indexer, text_field_embedder_kwargs = get_embedder_info(args.embedder)

    train_bio = BIODataset(
        dataset_id=0,
        file_name=train_file,
    )
    train_bio.parse_file()
    train_reader = BIODatasetReader(
        bio_dataset=train_bio,
        token_indexers={
            'tokens': token_indexer,
        },
    )
    train_data = train_reader.read('temp.txt')

    valid_bio = BIODataset(
        dataset_id=1,
        file_name=valid_file,
    )
    valid_bio.parse_file()
    valid_reader = BIODatasetReader(
        bio_dataset=valid_bio,
        token_indexers={
            'tokens': token_indexer,
        },
    )
    valid_data = valid_reader.read('temp.txt')

    vocab = Vocabulary.from_instances(train_data + valid_data)

    embedder = BasicTextFieldEmbedder({'tokens': token_embedder}, **text_field_embedder_kwargs)
    cached_embedder = CachedTextFieldEmbedder(text_field_embedder=embedder)

    if args.cuda:
        cuda_device = 0
        cached_embedder = cached_embedder.cuda(cuda_device)
    else:
        cuda_device = -1

    cached_embedder.cache(
        dataset_id=train_bio.dataset_id,
        dataset=train_data,
        vocab=vocab,
        cuda_device=cuda_device,
    )
    cached_embedder.cache(
        dataset_id=valid_bio.dataset_id,
        dataset=valid_data,
        vocab=vocab,
        cuda_device=cuda_device,
    )

    save_file_name = get_save_file(
        embedder_type=args.embedder,
        dataset_type=args.dataset,
    )
    save_file = H5SaveFile(file_name=save_file_name)
    cached_embedder.save(save_file=save_file)
    save_file.close()

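
# To sanity-check the saved embeddings, the HDF5 file can be opened directly
# with h5py. H5SaveFile implies the container format, but the group/dataset
# layout inside the file is an assumption here.
import h5py

def inspect_h5(path):
    with h5py.File(path, 'r') as f:
        # print every object's name and shape (groups have no shape)
        f.visititems(lambda name, obj: print(name, getattr(obj, 'shape', None)))
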
def main():
    args = get_active_args().parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)
    class_labels: List[str] = construct_f1_class_labels(args.binary_class)

    train_bio = BIODataset(
        dataset_id=0,
        file_name=train_file,
        binary_class=args.binary_class,
    )
    train_bio.parse_file()

    if args.test:
        print('using test set')

    valid_bio = BIODataset(
        dataset_id=1,
        file_name=valid_file if not args.test else test_file,
        binary_class=args.binary_class,
    )
    valid_bio.parse_file()

    vocab = construct_vocab([train_bio, valid_bio])

    unlabeled_corpus = UnlabeledBIODataset(
        dataset_id=train_bio.dataset_id,
        bio_data=train_bio,
    )

    model = build_model(
        model_type=args.model_type,
        vocab=vocab,
        hidden_dim=args.hidden_dim,
        class_labels=class_labels,
        cached=args.cached,
    )

    oracle = GoldOracle(train_bio)

    active_train(
        model=model,
        unlabeled_dataset=unlabeled_corpus,
        valid_dataset=valid_bio,
        vocab=vocab,
        oracle=oracle,
        optimizer_type=args.opt_type,
        optimizer_learning_rate=args.opt_lr,
        optimizer_weight_decay=args.opt_weight_decay,
        use_weak=args.use_weak,
        weak_fine_tune=args.use_weak_fine_tune,
        weak_weight=args.weak_weight,
        weak_function=args.weak_function,
        weak_collator=args.weak_collator,
        sample_strategy=args.sample_strategy,
        batch_size=args.batch_size,
        patience=args.patience,
        num_epochs=args.num_epochs,
        device=device,
        log_dir=args.log_dir,
        model_name=args.model_name,
    )

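
# For context: GoldOracle answers active-learning label queries from the
# already-annotated training data. A minimal sketch, assuming entries expose
# 'input'/'output' fields (hypothetical names, as in the fixture sketch above):
class GoldOracleSketch:
    def __init__(self, labeled_entries):
        self._answers = {tuple(e['input']): e['output'] for e in labeled_entries}

    def query(self, words):
        # return the gold BIO tags for the queried sentence
        return self._answers[tuple(words)]
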
def main():
    dataset = BIODataset(dataset_id=0, file_name=CADEC_VALID_ORIGINAL)
    dataset.parse_file()

    # split the original CADEC validation set 50/50 into valid and test
    valid_data, test_data = create_split(dataset, 0.5)
    serialize_split(valid_data, CADEC_VALID)
    serialize_split(test_data, CADEC_TEST)
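
# A minimal sketch of what a helper like create_split could do, assuming the
# dataset's entries live in a list and the split should be reproducible. The
# fixed seed and shuffle are assumptions, not the repo's implementation.
import random

def create_split_sketch(entries, ratio, seed=0):
    shuffled = list(entries)
    random.Random(seed).shuffle(shuffled)
    cut = int(len(shuffled) * ratio)
    return shuffled[:cut], shuffled[cut:]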