def get_dataloaders(
    data_dir,
    task_name="MultiRC",
    splits=("train", "val", "test"),
    max_data_samples=None,
    max_sequence_length=256,
    tokenizer_name="xlnet-base-cased",
    batch_size=16,
    augment=False,
    uid="uids",
):
    """Load the requested splits of a task and return their dataloaders.

    For every split the data file is located via
    ``SuperGLUE_TASK_SPLIT_MAPPING[task_name][split]`` under
    ``data_dir/task_name``, parsed with ``parsers.parser[task_name]`` and
    wrapped in an ``EmmentalDataLoader``.

    Args:
        data_dir: Root directory that contains one sub-directory per task.
        task_name: Key used for the split mapping, the parser table, the
            augmentation table and the ``task_to_label_dict``.
        splits: Iterable of split names to load. The default is an immutable
            tuple so it cannot be shared-and-mutated across calls.
        max_data_samples: Forwarded unchanged to the task parser.
        max_sequence_length: Forwarded unchanged to the task parser.
        tokenizer_name: Name passed to ``get_tokenizer``.
        batch_size: Batch size used for every dataloader.
        augment: If true, and the task has an entry in
            ``augmentation.augmentation_funcs``, extra dataloaders are added
            for the "train" split.
        uid: Forwarded unchanged to the task parser.

    Returns:
        A list of ``EmmentalDataLoader`` objects in split order; augmented
        train dataloaders directly follow the plain train dataloader.
    """

    def _make_dataloader(dataset, split):
        # Only the training split is shuffled.
        return EmmentalDataLoader(
            task_to_label_dict={task_name: "labels"},
            dataset=dataset,
            split=split,
            batch_size=batch_size,
            shuffle=(split == "train"),
        )

    dataloaders = []
    tokenizer = get_tokenizer(tokenizer_name)

    for split in splits:
        jsonl_path = os.path.join(
            data_dir, task_name, SuperGLUE_TASK_SPLIT_MAPPING[task_name][split]
        )
        dataset = parsers.parser[task_name](
            jsonl_path, tokenizer, uid, max_data_samples, max_sequence_length
        )
        dataloaders.append(_make_dataloader(dataset, split))

        if (
            augment
            and split == "train"
            and task_name in augmentation.augmentation_funcs
        ):
            # Augmentations are chained: each function receives the output of
            # the previous one, and every intermediate result gets a loader.
            for af in augmentation.augmentation_funcs[task_name]:
                dataset = af(dataset)
                dataloaders.append(_make_dataloader(dataset, split))

        # NOTE: when augmentation ran, `dataset` is the last augmented
        # dataset, so the reported size is that of the augmented data.
        logger.info(
            f"Loaded {split} for {task_name} with {len(dataset)} samples.")

    return dataloaders
def create_dataloaders(task_name, dataset, batch_size, word2id, oov="~#OoV#~"):
    """Encode the train/valid/test splits and wrap each one in a dataloader.

    Args:
        task_name: Name used for the dataset and as the task key.
        dataset: Mapping from split name to an ``(x, y)`` pair, where ``x`` is
            an iterable of token sequences.
        batch_size: Batch size used for every split.
        word2id: Mapping from token to integer id; must contain ``oov``.
        oov: Token whose id replaces tokens missing from ``word2id``.

    Returns:
        A list with one ``EmmentalDataLoader`` per split, in the order
        train, valid, test. Only the train loader shuffles.
    """
    fallback_id = word2id[oov]
    dataloaders = []

    for split in ("train", "valid", "test"):
        raw_sequences, split_y = dataset[split]

        # Map every token to its id, falling back to the OOV id.
        split_x = []
        for seq in raw_sequences:
            token_ids = [word2id.get(token, fallback_id) for token in seq]
            split_x.append(torch.LongTensor(token_ids))

        encoded_dataset = EmmentalDataset(
            name=task_name,
            X_dict={"feature": split_x},
            Y_dict={"label": split_y},
        )
        loader = EmmentalDataLoader(
            task_to_label_dict={task_name: "label"},
            dataset=encoded_dataset,
            split=split,
            batch_size=batch_size,
            shuffle=split == "train",
        )
        dataloaders.append(loader)

        logger.info(
            f"Loaded {split} for {task_name} containing {len(split_x)} samples."
        )

    return dataloaders
def _init_dataloaders(self, _log, dataloader_configs, sampler_configs, task_to_label_dict):
    """Build the train and valid dataloaders, attaching a sampler to train.

    Args:
        _log: Logger used to report each dataloader that is built.
        dataloader_configs: Per-split keyword arguments for
            ``EmmentalDataLoader``.
        sampler_configs: Per-split sampler description with the keys
            ``'class_name'`` (looked up on ``torch_data``) and ``'args'``.
            Only the ``'train'`` entry is read.
        task_to_label_dict: Forwarded unchanged to every dataloader.

    Returns:
        ``[train_dataloader, valid_dataloader]``.
    """
    dataloaders = []

    for split in ('train', 'valid'):
        loader_kwargs = dataloader_configs[split]

        if split == 'train':
            sampler_name = sampler_configs[split]['class_name']
            sampler_kwargs = sampler_configs[split]['args']
            sampler_cls = getattr(torch_data, sampler_name)

            if sampler_name != 'WeightedRandomSampler':
                # Any other sampler gets the dataset plus its raw arguments.
                sampler = sampler_cls(data_source=self.datasets[split],
                                      **sampler_kwargs)
            else:
                # Weighted sampling needs per-sample weights computed from
                # the dataset first.
                weights = get_sample_weights(self.datasets[split],
                                             sampler_kwargs['weight_task'],
                                             sampler_kwargs['class_probs'])
                sampler = sampler_cls(
                    weights=weights,
                    num_samples=sampler_kwargs['num_samples'],
                    replacement=sampler_kwargs['replacement'])

            # Entries from the split's config are applied last, so a
            # 'sampler' key configured there takes precedence.
            merged_kwargs = {'sampler': sampler}
            merged_kwargs.update(loader_kwargs)
            loader_kwargs = merged_kwargs

        dataloaders.append(
            EmmentalDataLoader(
                task_to_label_dict=task_to_label_dict,
                dataset=self.datasets[split],
                split=split,
                **loader_kwargs,
            ))
        _log.info(f'Built dataloader for {split} set.')

    return dataloaders
def get_dataloaders(args):
    """Build the train and test dataloaders for ``args.task``.

    The dataset class is looked up in ``ALL_DATASETS`` by ``args.task``. The
    train dataset additionally receives ``index=None`` and
    ``k=args.augment_k``.

    Args:
        args: Namespace providing ``task``, ``augment_k``, ``model``,
            ``batch_size``, ``valid_batch_size`` and ``train_split``.

    Returns:
        ``[train_dataloader, test_dataloader]``.
    """
    dataset_cls = ALL_DATASETS[args.task]

    # Both datasets are created before any dataloader is built.
    datasets = {
        "train": dataset_cls(args.task, args, "train", index=None,
                             k=args.augment_k, model=args.model),
        "test": dataset_cls(args.task, args, "test", model=args.model),
    }

    dataloaders = []
    for split, dataset in datasets.items():
        # Splits listed in args.train_split use the training batch size; the
        # others use valid_batch_size when one is configured.
        if split in args.train_split or args.valid_batch_size is None:
            batch_size = args.batch_size
        else:
            batch_size = args.valid_batch_size

        loader = EmmentalDataLoader(
            task_to_label_dict={args.task: "labels"},
            dataset=dataset,
            split=split,
            shuffle=split in ["train"],
            batch_size=batch_size,
            num_workers=1,
        )
        dataloaders.append(loader)

        logger.info(
            f"Built dataloader for {args.task} {split} set with {len(dataset)} "
            f"samples (Shuffle={split in args.train_split}, "
            f"Batch size={dataloaders[-1].batch_size}).")

    return dataloaders
def test_mixed_scheduler(caplog):
    """Unit test of mixed scheduler"""
    caplog.set_level(logging.INFO)

    emmental.Meta.init()

    # Two toy tasks with random features and labels (20 and 30 samples).
    task1 = "task1"
    x1 = np.random.rand(20, 2)
    y1 = torch.from_numpy(np.random.rand(20))

    task2 = "task2"
    x2 = np.random.rand(30, 3)
    y2 = torch.from_numpy(np.random.rand(30))

    dataloaders = []
    for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]:
        toy_dataset = EmmentalDataset(name=task_name,
                                      X_dict={"feature": x},
                                      Y_dict={"label": y})
        dataloaders.append(
            EmmentalDataLoader(
                task_to_label_dict={task_name: "label"},
                dataset=toy_dataset,
                split="train",
                batch_size=10,
                shuffle=True,
            ))

    def task_names_at(sched, position):
        # Each scheduler step yields one entry per dataloader; the field at
        # index -2 of an entry is compared against the task name below.
        names = []
        for batch_data in sched.get_batches(dataloaders):
            names.append(batch_data[position][-2])
        return names

    # Without fill-up the scheduler yields 2 steps.
    scheduler = MixedScheduler()
    assert scheduler.get_num_batches(dataloaders) == 2

    batch_task_names_1 = task_names_at(scheduler, 0)
    batch_task_names_2 = task_names_at(scheduler, 1)

    assert batch_task_names_1 == [task1, task1]
    assert batch_task_names_2 == [task2, task2]

    # With fill-up the scheduler yields 3 steps.
    scheduler = MixedScheduler(fillup=True)
    assert scheduler.get_num_batches(dataloaders) == 3

    batch_task_names_1 = task_names_at(scheduler, 0)
    batch_task_names_2 = task_names_at(scheduler, 1)

    assert batch_task_names_1 == [task1, task1, task1]
    assert batch_task_names_2 == [task2, task2, task2]
def _init_dataloaders(self, _log, dataloader_configs, task_to_label_dict):
    """Build one dataloader each for the train and valid datasets.

    Args:
        _log: Logger used to report each dataloader that is built.
        dataloader_configs: Per-split keyword arguments that are expanded
            into the ``EmmentalDataLoader`` constructor.
        task_to_label_dict: Forwarded unchanged to every dataloader.

    Returns:
        ``[train_dataloader, valid_dataloader]``.
    """
    dataloaders = []
    for split in ('train', 'valid'):
        loader = EmmentalDataLoader(
            task_to_label_dict=task_to_label_dict,
            dataset=self.datasets[split],
            split=split,
            **dataloader_configs[split],
        )
        dataloaders.append(loader)
        _log.info(f'Built dataloader for {split} set.')
    return dataloaders
def test_round_robin_scheduler(caplog):
    """Unit test of round robin scheduler."""
    caplog.set_level(logging.INFO)

    emmental.Meta.init()

    # Fix the random seed before any data is generated; the expected batch
    # orders asserted below are tied to this seed.
    set_random_seed(2)

    task1 = "task1"
    x1 = np.random.rand(20, 2)
    y1 = torch.from_numpy(np.random.rand(20))

    task2 = "task2"
    x2 = np.random.rand(30, 3)
    y2 = torch.from_numpy(np.random.rand(30))

    dataloaders = []
    for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]:
        toy_dataset = EmmentalDataset(
            name=task_name, X_dict={"feature": x}, Y_dict={"label": y}
        )
        dataloaders.append(
            EmmentalDataLoader(
                task_to_label_dict={task_name: "label"},
                dataset=toy_dataset,
                split="train",
                batch_size=10,
                shuffle=True,
            )
        )

    def scheduled_task_names(sched):
        # The field at index -2 of every yielded batch is compared against
        # the task names below.
        names = []
        for batch_data in sched.get_batches(dataloaders):
            names.append(batch_data[-2])
        return names

    # Without fill-up: 5 batches in total.
    scheduler = RoundRobinScheduler()
    assert scheduler.get_num_batches(dataloaders) == 5

    batch_task_names = scheduled_task_names(scheduler)
    assert batch_task_names == [task2, task1, task2, task2, task1]

    # With fill-up: 6 batches in total.
    scheduler = RoundRobinScheduler(fillup=True)
    assert scheduler.get_num_batches(dataloaders) == 6

    batch_task_names = scheduled_task_names(scheduler)
    assert batch_task_names == [task2, task1, task2, task2, task1, task1]
def _init_dataloaders(self, _log, dataloader_configs, task_to_label_dict):
    """Build one dataloader each for the train and test datasets.

    Only the ``'shuffle'``, ``'batch_size'`` and ``'num_workers'`` entries of
    each split's config are used.

    Args:
        _log: Logger used to report each dataloader that is built.
        dataloader_configs: Per-split dict with the three keys named above.
        task_to_label_dict: Forwarded unchanged to every dataloader.

    Returns:
        ``[train_dataloader, test_dataloader]``.
    """
    dataloaders = []
    for split in ('train', 'test'):
        split_dataset = self.datasets[split]
        split_config = dataloader_configs[split]

        loader = EmmentalDataLoader(
            task_to_label_dict=task_to_label_dict,
            dataset=split_dataset,
            split=split,
            shuffle=split_config['shuffle'],
            batch_size=split_config['batch_size'],
            num_workers=split_config['num_workers'],
        )
        dataloaders.append(loader)

        _log.info(f'Built dataloader for {self.datasets[split].name} {split} set.')
    return dataloaders
def _classify(self, doc: Document) -> DataFrame:
    """Classify the candidates of one document and tabulate the positives.

    With ``self.model_type == "emmental"`` the document is featurized, the
    Emmental model is run on it and candidates whose ``TRUE``-class
    probability exceeds 0.6 are kept. Otherwise the labeling functions are
    applied, the first label model assigns a probability to every candidate
    (stored on ``cand.prob``) and all candidates are kept, sorted by
    descending probability.

    Args:
        doc: The parsed document whose candidates are classified.

    Returns:
        A DataFrame with one row per entity relation returned by
        ``get_unique_entity_relations`` and one column per mention class of
        the candidate class. An empty, column-less DataFrame is returned when
        there are no relations. Rows are indexed 0..n-1.
    """
    # Only one candidate class is defined.
    candidate_class = self.candidate_extractor.candidate_classes[0]
    test_cands = getattr(doc, candidate_class.__tablename__ + "s")

    if self.model_type == "emmental":
        # Featurization
        features_list = self.featurizer.apply(doc)
        # Convert features into a sparse matrix
        F_test = _F_matrix(features_list[0], self.key_names)

        # Dataloader for test
        ATTRIBUTE = "wiki"
        test_dataloader = EmmentalDataLoader(
            task_to_label_dict={ATTRIBUTE: "labels"},
            dataset=FonduerDataset(ATTRIBUTE, test_cands, F_test,
                                   self.word2id, 2),
            split="test",
            batch_size=100,
            shuffle=False,
        )

        test_preds = self.emmental_model.predict(test_dataloader,
                                                 return_preds=True)
        positive = np.where(
            np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6)
        true_preds = [test_cands[idx] for idx in positive[0]]
    else:
        labels_list = self.labeler.apply(doc, lfs=self.lfs)
        L_test = _L_matrix(labels_list[0], self.key_names)

        marginals = self.label_models[0].predict_proba(L_test)
        for cand, prob in zip(test_cands, marginals[:, 1]):
            cand.prob = prob
        true_preds = sorted(test_cands, key=lambda cand: cand.prob,
                            reverse=True)

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = list(get_unique_entity_relations(true_preds))
    if not rows:
        return DataFrame()
    columns = [m.__name__ for m in candidate_class.mentions]
    return DataFrame(rows, columns=columns)
def get_dataloaders(args):
    """Load the TACRED train/dev/test splits and build their dataloaders.

    Every split is read from ``<args.data_dir>/<split>_ent.json`` and wrapped
    in a ``TACREDDataset`` that shares one ``BertTokenizer``.

    Args:
        args: Namespace providing ``bert_model``, ``data_dir``,
            ``feature_mode``, ``max_seq_length``, ``encode_first``,
            ``batch_size``, ``valid_batch_size`` and ``train_split``.

    Returns:
        ``[train_dataloader, dev_dataloader, test_dataloader]``.
    """
    task = "TACRED"

    # Lower-casing is enabled exactly when the model name contains "uncased".
    lower_case = "uncased" in args.bert_model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=lower_case)

    # All datasets are loaded before any dataloader is built.
    datasets = {}
    for split in ("train", "dev", "test"):
        logger.info(f"Loading {split} from "
                    f"{os.path.join(args.data_dir, f'{split}_ent.json')}.")
        data = load_json(os.path.join(args.data_dir, f"{split}_ent.json"))

        datasets[split] = TACREDDataset(
            task,
            data,
            tokenizer=tokenizer,
            split=split,
            mode=args.feature_mode,
            max_seq_length=args.max_seq_length,
            encode_first=args.encode_first,
        )

    dataloaders = []
    for split, dataset in datasets.items():
        # Splits listed in args.train_split use the training batch size; the
        # others use valid_batch_size when one is configured.
        if split in args.train_split or args.valid_batch_size is None:
            batch_size = args.batch_size
        else:
            batch_size = args.valid_batch_size

        loader = EmmentalDataLoader(
            task_to_label_dict={task: "labels"},
            dataset=dataset,
            split=split,
            shuffle=split in ["train"],
            batch_size=batch_size,
            num_workers=4,
        )
        dataloaders.append(loader)

        logger.info(f"Built dataloader for {split} set with {len(dataset)} "
                    f"samples (Shuffle={split in args.train_split}, "
                    f"Batch size={dataloaders[-1].batch_size}).")

    return dataloaders
def _init_dataloaders(self, _log, dataloader_configs, sampler_configs, task_to_label_dict):
    """Build the train and valid dataloaders; train uses a RandomSampler.

    Args:
        _log: Logger used to report each dataloader that is built.
        dataloader_configs: Per-split keyword arguments for
            ``EmmentalDataLoader``.
        sampler_configs: Mapping whose ``'train'`` entry holds the keyword
            arguments for ``RandomSampler``.
        task_to_label_dict: Forwarded unchanged to every dataloader.

    Returns:
        ``[train_dataloader, valid_dataloader]``.
    """
    dataloaders = []

    for split in ('train', 'valid'):
        loader_kwargs = dataloader_configs[split]

        if split == 'train':
            train_sampler = RandomSampler(data_source=self.datasets[split],
                                          **sampler_configs['train'])
            # Entries from the split's config are applied last, so a
            # 'sampler' key configured there takes precedence.
            merged_kwargs = {'sampler': train_sampler}
            merged_kwargs.update(loader_kwargs)
            loader_kwargs = merged_kwargs

        dataloaders.append(
            EmmentalDataLoader(
                task_to_label_dict=task_to_label_dict,
                dataset=self.datasets[split],
                split=split,
                **loader_kwargs,
            ))
        _log.info(f'Built dataloader for {split} set.')

    return dataloaders
def get_dataloaders(args):
    """Download a torchvision dataset and build train/test dataloaders.

    The torchvision dataset class is looked up by ``args.task.upper()`` and
    the wrapping dataset class by ``ALL_DATASETS[args.task]``. The train
    wrapper additionally receives ``index=None``, ``prob_label=True`` and
    ``k=args.augment_k``.

    Args:
        args: Namespace providing ``task``, ``data``, ``augment_k``,
            ``batch_size``, ``valid_batch_size`` and ``train_split``.

    Returns:
        ``[train_dataloader, test_dataloader]``.
    """
    torchvision_cls = torchvision.datasets.__dict__[args.task.upper()]
    train_dataset = torchvision_cls(root=args.data, train=True, download=True)
    test_dataset = torchvision_cls(root=args.data, train=False, download=True)

    wrapper_cls = ALL_DATASETS[args.task]

    # Both wrapped datasets are created before any dataloader is built.
    datasets = {
        "train": wrapper_cls(
            args.task,
            train_dataset,
            "train",
            index=None,
            prob_label=True,
            k=args.augment_k,
        ),
        "test": wrapper_cls(args.task, test_dataset, "test"),
    }

    dataloaders = []
    for split, dataset in datasets.items():
        # Splits listed in args.train_split use the training batch size; the
        # others use valid_batch_size when one is configured.
        if split in args.train_split or args.valid_batch_size is None:
            batch_size = args.batch_size
        else:
            batch_size = args.valid_batch_size

        loader = EmmentalDataLoader(
            task_to_label_dict={args.task: "labels"},
            dataset=dataset,
            split=split,
            shuffle=split in ["train"],
            batch_size=batch_size,
            num_workers=4,
        )
        dataloaders.append(loader)

        logger.info(
            f"Built dataloader for {args.task} {split} set with {len(dataset)} "
            f"samples (Shuffle={split in args.train_split}, "
            f"Batch size={dataloaders[-1].batch_size}).")

    return dataloaders
def _classify(self, doc: Document) -> DataFrame:
    """Classify the candidates of one document and tabulate the positives.

    The document is featurized, the Emmental model is run on it and
    candidates whose ``TRUE``-class probability exceeds 0.7 are kept. Every
    kept candidate is expanded through ``get_implied_parts`` into
    ``(doc, part, val)`` rows.

    Args:
        doc: The parsed document whose candidates are classified.

    Returns:
        A DataFrame with the columns ``doc``, ``part`` and ``val``, one row
        per implied part of every positive candidate. An empty, column-less
        DataFrame is returned when there are no rows. Rows are indexed
        0..n-1.
    """
    # Only one candidate class is used.
    candidate_class = self.candidate_extractor.candidate_classes[0]
    test_cands = getattr(doc, candidate_class.__tablename__ + "s")

    features_list = self.featurizer.apply(doc)
    # Convert features into a sparse matrix
    F_test = FonduerModel.convert_features_to_matrix(
        features_list[0], self.key_names)

    test_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(ATTRIBUTE, test_cands, F_test, self.word2id,
                               2),
        split="test",
        batch_size=100,
        shuffle=False,
    )

    test_preds = self.emmental_model.predict(test_dataloader,
                                             return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_preds = [test_cands[idx] for idx in positive[0]]

    # NOTE: pickle.load can execute arbitrary code; this is only acceptable
    # because the path is a fixed fixture file, never external input.
    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for c in true_preds:
        part = c[0].context.get_span()
        # Kept in its own local so the `doc` parameter is not rebound.
        doc_name = c[0].context.sentence.document.name.upper()
        val = c[1].context.get_span()
        for p in get_implied_parts(part, doc_name, parts_by_doc):
            rows.append((doc_name, p, val))

    if not rows:
        return DataFrame()
    return DataFrame(rows, columns=["doc", "part", "val"])
def test_emmental_dataloader(caplog):
    """Unit test of emmental dataloader."""
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_data"

    Meta.reset()
    emmental.init(dirpath)

    # x1 holds sequences of growing length [1], [1, 2], ..., [1..5];
    # x2 holds the same sequences in the opposite order.
    x1 = [torch.Tensor(list(range(1, n + 1))) for n in range(1, 6)]
    y1 = torch.Tensor([0] * 5)

    x2 = [torch.Tensor(list(range(1, n + 1))) for n in range(5, 0, -1)]
    y2 = torch.Tensor([1] * 5)

    dataset = EmmentalDataset(
        X_dict={"data1": x1, "data2": x2},
        Y_dict={"label1": y1, "label2": y2},
        name="new_data",
    )

    dataloader1 = EmmentalDataLoader(
        task_to_label_dict={"task1": "label1"},
        dataset=dataset,
        split="train",
        batch_size=2,
        num_workers=2,
    )

    x_batch, y_batch = next(iter(dataloader1))

    # Check if the dataloader is correctly constructed
    assert dataloader1.task_to_label_dict == {"task1": "label1"}
    assert dataloader1.split == "train"
    expected_data1 = torch.Tensor([[1, 0], [1, 2]])
    expected_data2 = torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
    assert torch.equal(x_batch["data1"], expected_data1)
    assert torch.equal(x_batch["data2"], expected_data2)
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1]))

    dataloader2 = EmmentalDataLoader(
        task_to_label_dict={"task2": "label2"},
        dataset=dataset,
        split="test",
        batch_size=3,
        collate_fn=partial(emmental_collate_fn, min_data_len=0,
                           max_data_len=0),
    )

    x_batch, y_batch = next(iter(dataloader2))

    # Check if the dataloader with different batch size is correctly
    # constructed
    assert dataloader2.task_to_label_dict == {"task2": "label2"}
    assert dataloader2.split == "test"
    expected_data1 = torch.Tensor([[1, 0, 0], [1, 2, 0], [1, 2, 3]])
    expected_data2 = torch.Tensor(
        [[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]])
    assert torch.equal(x_batch["data1"], expected_data1)
    assert torch.equal(x_batch["data2"], expected_data2)
    assert torch.equal(y_batch["label1"], torch.Tensor([0, 0, 0]))
    assert torch.equal(y_batch["label2"], torch.Tensor([1, 1, 1]))

    # Swap the labels on the dataset object that both dataloaders share.
    y3 = [torch.Tensor([2]) for _ in range(5)]
    dataset.Y_dict["label2"] = y3

    x_batch, y_batch = next(iter(dataloader1))
    # Check dataloader is correctly updated with update dataset
    assert torch.equal(x_batch["data2"],
                       torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]]))
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2]]))

    x_batch, y_batch = next(iter(dataloader2))
    assert torch.equal(
        x_batch["data2"],
        torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]),
    )
    assert torch.equal(y_batch["label2"], torch.Tensor([[2], [2], [2]]))

    # A dataset without any labels yields feature batches only.
    dataset = EmmentalDataset(X_dict={"data1": x1}, name="new_data")

    dataloader3 = EmmentalDataLoader(
        task_to_label_dict={"task1": None},
        dataset=dataset,
        split="train",
        batch_size=2,
    )

    x_batch = next(iter(dataloader3))

    # Check if the dataloader is correctly constructed
    assert dataloader3.task_to_label_dict == {"task1": None}
    assert dataloader3.split == "train"
    assert torch.equal(x_batch["data1"], torch.Tensor([[1, 0], [1, 2]]))

    # Check there is an error if task_to_label_dict has task to label mapping
    # while no y_dict in dataset
    with pytest.raises(ValueError):
        EmmentalDataLoader(
            task_to_label_dict={"task1": "label1"},
            dataset=dataset,
            split="train",
            batch_size=2,
        )

    shutil.rmtree(dirpath)
def test_e2e():
    """Run an end-to-end test on documents of the hardware domain.

    The stages run strictly in sequence against one database session:
    parsing, mention extraction, candidate extraction, featurization,
    labeling, label-model fitting, and training/evaluating Emmental models
    (logistic regression twice, then LSTM). Every stage asserts exact row
    counts or matrix shapes, so statements must not be reordered.
    """
    # GitHub Actions gives 2 cores
    # help.github.com/en/actions/reference/virtual-environments-for-github-hosted-runners
    PARALLEL = 2

    max_docs = 12

    fonduer.init_logging(
        format="[%(asctime)s][%(levelname)s] %(name)s:%(lineno)s - %(message)s",
        level=logging.INFO,
    )

    session = fonduer.Meta.init(CONN_STRING).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"

    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    # Parse the corpus and check that max_docs documents were stored.
    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())

    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    # Check sentence numbers
    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    # Split the documents, ordered by name, into train (first 50%),
    # dev (next 25%) and test (rest).
    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.

    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )

    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 138
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    # Mentions can also be fetched for a single document.
    assert (len(
        mention_extractor.get_mentions(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 70)

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler])

    # Split index: 0 = train, 1 = dev, 2 = test.
    # NOTE: this loop rebinds `docs`; the full document list is not used
    # again below (later stages use `last_docs` or the per-split sets).
    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3493
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 61
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 416
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3493
    assert (len(
        candidate_extractor.get_candidates(docs=[
            session.query(Document).filter(Document.name == "112823").first()
        ])[0]) == 1432)

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 214
    assert session.query(FeatureKey).count() == 1260

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NNP NN IN]"])
    assert session.query(FeatureKey).count() == 1259

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name ==
        "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1259

    # Inserting the removed key
    featurizer.upsert_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                           candidate_classes=[PartTemp, PartVolt])
    assert set(
        session.query(FeatureKey).filter(
            FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]").one().candidate_classes) == {
                "part_temp", "part_volt"
            }
    assert session.query(FeatureKey).count() == 1259

    # Removing the key again
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartVolt])

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"],
                         candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1258

    # Start from empty feature tables before featurizing the train split.
    session.query(Feature).delete(synchronize_session="fetch")
    session.query(FeatureKey).delete(synchronize_session="fetch")

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6478
    assert session.query(FeatureKey).count() == 4538
    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3493, 4538)
    assert F_train[1].shape == (2985, 4538)
    assert len(featurizer.get_keys()) == 4538

    # Dev and test are featurized without train=True; the FeatureKey count
    # is asserted to stay at 4538.
    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6692
    assert session.query(FeatureKey).count() == 4538
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (61, 4538)
    assert F_dev[1].shape == (153, 4538)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8252
    assert session.query(FeatureKey).count() == 4538
    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (416, 4538)
    assert F_test[1].shape == (1144, 4538)

    gold_file = "tests/data/hardware_tutorial_gold.csv"

    # Store gold labels for all documents in the GoldLabel table.
    labeler = Labeler(session, [PartTemp, PartVolt])

    labeler.apply(
        docs=last_docs,
        lfs=[[gold], [gold]],
        table=GoldLabel,
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(GoldLabel).count() == 8252

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]

    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    # Here `lfs` is a flat list rather than one list per candidate class (as
    # in the call that follows); the labeler is expected to reject it.
    with pytest.raises(ValueError):
        labeler.apply(split=0,
                      lfs=stg_temp_lfs,
                      train=True,
                      parallelism=PARALLEL)

    labeler.apply(
        docs=train_docs,
        lfs=[stg_temp_lfs, ce_v_max_lfs],
        train=True,
        parallelism=PARALLEL,
    )
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 9
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    # Test Dropping LabelerKey
    labeler.drop_keys(["LF_storage_row"])
    assert len(labeler.get_keys()) == 8

    # Test Upserting LabelerKey
    labeler.upsert_keys(["LF_storage_row"])
    assert "LF_storage_row" in [label.name for label in labeler.get_keys()]

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3493, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3493, 1)

    # Fit a label model on the first relation's label matrix.
    label_model = LabelModel()
    label_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = label_model.predict_proba(L_train[0])

    # Collect word counter
    word_counter = collect_word_counter(train_cands)

    emmental.init(fonduer.Meta.log_path)

    # Training config
    config = {
        "meta_config": {
            "verbose": False
        },
        "model_config": {
            "model_path": None,
            "device": 0,
            "dataparallel": False
        },
        "learner_config": {
            "n_epochs": 5,
            "optimizer_config": {
                "lr": 0.001,
                "l2": 0.0
            },
            "task_scheduler": "round_robin",
        },
        "logging_config": {
            "evaluation_freq": 1,
            "counter_unit": "epoch",
            "checkpointing": False,
            "checkpointer_config": {
                "checkpoint_metric": {
                    f"{ATTRIBUTE}/{ATTRIBUTE}/train/loss": "min"
                },
                "checkpoint_freq": 1,
                "checkpoint_runway": 2,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.Meta.update_config(config=config)

    # Generate word embedding module
    arity = 2
    # Generate special tokens
    specials = []
    for i in range(arity):
        specials += [f"~~[[{i}", f"{i}]]~~"]

    emb_layer = EmbeddingModule(word_counter=word_counter,
                                word_dim=300,
                                specials=specials)

    # Keep only training rows whose marginals differ across classes (max and
    # min differ by more than 1e-6).
    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    # First model: logistic regression trained on the train loader only.
    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(ATTRIBUTE, test_cands[0], F_test[0],
                               emb_layer.word2id, 2),
        split="test",
        batch_size=100,
        shuffle=False,
    )

    # Candidates with a TRUE-class probability above 0.6 count as positive.
    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    # Precision, recall and F1 are NaN when their denominator is zero.
    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 < 0.7 and f1 > 0.3

    # Replace the temperature labeling functions with a second set and
    # update the stored labels for the train split.
    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0,
                   lfs=[stg_temp_lfs_2, ce_v_max_lfs],
                   parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    label_model = LabelModel()
    label_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = label_model.predict_proba(L_train[0])

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        split="train",
        batch_size=100,
        shuffle=True,
    )

    # The validation loader reuses the training candidates, labelled with
    # the argmax of the marginals instead of the marginals themselves.
    valid_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            np.argmax(train_marginals, axis=1),
            train_idxs,
        ),
        split="valid",
        batch_size=100,
        shuffle=False,
    )

    # Second model: logistic regression again, with a validation loader.
    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LogisticRegression")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader, valid_dataloader])

    # From here on the positive threshold is 0.7.
    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7

    # Testing LSTM
    emmental.Meta.reset()
    emmental.init(fonduer.Meta.log_path)
    emmental.Meta.update_config(config=config)

    tasks = create_task(ATTRIBUTE,
                        2,
                        F_train[0].shape[1],
                        2,
                        emb_layer,
                        model="LSTM")

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, [train_dataloader])

    test_preds = model.predict(test_dataloader, return_preds=True)
    positive = np.where(
        np.array(test_preds["probs"][ATTRIBUTE])[:, TRUE] > 0.7)
    true_pred = [test_cands[0][_] for _ in positive[0]]

    (TP, FP, FN) = entity_level_f1(true_pred,
                                   gold_file,
                                   ATTRIBUTE,
                                   test_docs,
                                   parts_by_doc=parts_by_doc)

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")

    assert f1 > 0.7
datasets[task_name][split] = EmmentalDataset( name="GLUE", X_dict=X_dict, Y_dict=Y_dict ) logger.info(f"Loaded {split} for {task_name}.") dataloaders = [] for task_name in args.task: for split in ["train", "dev", "test"]: dataloaders.append( EmmentalDataLoader( task_to_label_dict={task_name: "labels"}, dataset=datasets[task_name][split], split=split, batch_size=args.batch_size, shuffle=True if split == "train" else False, ) ) logger.info(f"Built dataloader for {task_name} {split} set.") tasks = get_gule_task(args.task, args.bert_model) mtl_model = EmmentalModel(name="GLUE_multi_task") if Meta.config["model_config"]["model_path"]: mtl_model.load(Meta.config["model_config"]["model_path"]) else: for task_name, task in tasks.items(): mtl_model.add_task(task)
# Getting size of char dict -- assume all chars appear in 1st 1000 examples! # HACK: HARD CODE THIS/SAVE IT! #char_dict_size = max([max(datasets['test'].X_dict['emb'][ii]) for ii in range(1000)])+1 char_dict_size = char_dict.len() # Creating dataloaders splits = ["test"] dataloaders = [] for split in splits: dataloaders.append( EmmentalDataLoader( task_to_label_dict={"ht_page": "label"}, dataset=datasets[split], split=split, batch_size=16, shuffle=False, )) print(f"Built dataloader for {split} set.") # Getting tasks tasks = get_task(task_names, config['embed_dim'], char_dict_size) # Build Emmental model model = EmmentalModel(name="HT", tasks=tasks) if Meta.config["model_config"]["model_path"]: print('Loading model...') model.load(Meta.config["model_config"]["model_path"])
def eval_model(model, emb_layer, cands, F, align_type="row"):
    """Score ``model`` on the train, dev and test candidates of one alignment.

    Args:
        model: Trained model exposing ``predict(dataloader, return_preds=True)``.
        emb_layer: Embedding module; only its ``word2id`` mapping is read here.
        cands: Candidates indexed as ``cands[align][split]``, where ``align``
            is 0 for ``"row"`` and 1 for anything else, and ``split`` is
            0/1/2 for train/dev/test.
        F: Feature matrices indexed the same way as ``cands``.
        align_type: ``"row"`` or ``"col"``. Note that any value other than
            ``"row"`` selects index 1, but column matching is only switched
            on for the exact value ``"col"``.

    Returns:
        ``[train_results, dev_results, test_results]`` where each entry is the
        value returned by ``entity_level_f1`` for that split.
    """
    # Extract candidates and features based on the align type (row/column)
    align_val = 0 if align_type == "row" else 1
    train_cands = cands[align_val][0]
    dev_cands = cands[align_val][1]
    test_cands = cands[align_val][2]

    F_train = F[align_val][0]
    F_dev = F[align_val][1]
    F_test = F[align_val][2]

    row_on = align_type == "row"
    col_on = align_type == "col"

    def score_split(split_cands, split_features, docs):
        """Predict on one split and return its entity-level scores."""
        # Every split is wrapped as split="test" because the loader is only
        # used for prediction here. The trailing 2 fills the slot where the
        # training code passes marginals -- presumably a placeholder label;
        # confirm against FonduerDataset.
        dataloader = EmmentalDataLoader(
            task_to_label_dict={ATTRIBUTE: "labels"},
            dataset=FonduerDataset(
                ATTRIBUTE,
                split_cands[0],
                split_features[0],
                emb_layer.word2id,
                2,
            ),
            split="test",
            batch_size=100,
            shuffle=False,
        )
        preds = model.predict(dataloader, return_preds=True)
        # Keep candidates whose probability for the TRUE class exceeds 0.6.
        positive = np.where(
            np.array(preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6)
        accepted = [split_cands[0][i] for i in positive[0]]
        return entity_level_f1(accepted,
                               gold_file,
                               ATTRIBUTE,
                               docs,
                               row_on=row_on,
                               col_on=col_on)

    test_results = score_split(test_cands, F_test, test_docs)

    # Run on dev and train set for validation
    # We run the predictions also on our training and dev set, to validate
    # that everything seems to work smoothly
    dev_results = score_split(dev_cands, F_dev, dev_docs)
    train_results = score_split(train_cands, F_train, train_docs)

    return [train_results, dev_results, test_results]
def train_model(cands, F, align_type, model_type="LogisticRegression"):
    """Train a single-task Emmental model on the gold-labelled train split.

    Args:
        cands: Candidates indexed as ``cands[align][split]``; only the train
            split (index 0) of the selected alignment is used.
        F: Feature matrices indexed the same way as ``cands``.
        align_type: ``"row"`` selects index 0, any other value index 1.
        model_type: Forwarded to ``create_task`` as ``model``.

    Returns:
        Tuple ``(model, emb_layer)`` with the trained ``EmmentalModel`` and
        the ``EmbeddingModule`` built from the training candidates.
    """
    # Extract candidates and features based on the align type (row/column)
    align_val = 0 if align_type == "row" else 1
    train_cands = cands[align_val][0]
    F_train = F[align_val][0]

    # One-hot marginals derived from the gold function of this alignment:
    # [0, 1] for a gold candidate, [1, 0] otherwise.
    marginal_rows = []
    for cand in train_cands[0]:
        marginal_rows.append([0, 1] if gold[align_val](cand) else [1, 0])
    train_marginals = np.array(marginal_rows)

    # 1.) Setup training config
    checkpointer_config = {
        "checkpoint_metric": {
            f"{ATTRIBUTE}/{ATTRIBUTE}/train/loss": "min"
        },
        "checkpoint_freq": 1,
        "checkpoint_runway": 2,
        "clear_intermediate_checkpoints": True,
        "clear_all_checkpoints": True,
    }
    config = {
        "meta_config": {"verbose": True},
        "model_config": {
            "model_path": None,
            "device": 0,
            "dataparallel": False,
        },
        "learner_config": {
            "n_epochs": 50,
            "optimizer_config": {"lr": 0.001, "l2": 0.0},
            "task_scheduler": "round_robin",
        },
        "logging_config": {
            "evaluation_freq": 1,
            "counter_unit": "epoch",
            "checkpointing": False,
            "checkpointer_config": checkpointer_config,
        },
    }
    emmental.init(Meta.log_path)
    emmental.Meta.update_config(config=config)

    # 2.) Collect word counter from training data
    word_counter = collect_word_counter(train_cands)

    # 3.) Generate word embedding module for LSTM model
    # (it is also built for Logistic Regression, since the Fonduer dataset
    # requires the word2id dict)
    # Generate special tokens: an opening and a closing marker per argument.
    arity = 2
    specials = [
        marker
        for position in range(arity)
        for marker in (f"~~[[{position}", f"{position}]]~~")
    ]
    emb_layer = EmbeddingModule(word_counter=word_counter,
                                word_dim=300,
                                specials=specials)

    # 4.) Generate dataloader for training set
    # No noise in Gold labels
    train_dataset = FonduerDataset(
        ATTRIBUTE,
        train_cands[0],
        F_train[0],
        emb_layer.word2id,
        train_marginals,
    )
    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=train_dataset,
        split="train",
        batch_size=100,
        shuffle=True,
    )

    # 5.) Training
    num_features = F_train[0].shape[1]
    tasks = create_task(ATTRIBUTE,
                        2,
                        num_features,
                        2,
                        emb_layer,
                        model=model_type)

    model = EmmentalModel(name=f"{ATTRIBUTE}_task")
    for single_task in tasks:
        model.add_task(single_task)

    EmmentalLearner().learn(model, [train_dataloader])

    return (model, emb_layer)
def eval_model(model, emb_layer, cands, F, schema_filter=False):
    """Score ``model`` on the train, dev and test candidates.

    Args:
        model: Trained model exposing ``predict(dataloader, return_preds=True)``.
        emb_layer: Embedding module; only its ``word2id`` mapping is read here.
        cands: Candidates indexed by split: 0/1/2 for train/dev/test.
        F: Feature matrices indexed the same way as ``cands``.
        schema_filter: When true, predicted positives are passed through
            ``schema_match_filter`` before being scored.

    Returns:
        ``[train_results, dev_results, test_results]`` where each entry is the
        value returned by ``entity_level_f1`` for that split.
    """
    # Extract candidates and features
    train_cands = cands[0]
    dev_cands = cands[1]
    test_cands = cands[2]

    F_train = F[0]
    F_dev = F[1]
    F_test = F[2]

    def apply(predicted):
        """Apply the station/price schema filter to predicted candidates."""
        return schema_match_filter(
            predicted,
            "station",
            "price",
            price_col_keywords,
            stations_mapping_dict,
            0.05,
            DEBUG,
        )

    def score_split(split_cands, split_features, docs):
        """Predict on one split and return its entity-level scores."""
        # Every split is wrapped as split="test" because the loader is only
        # used for prediction here. The trailing 2 is presumably a
        # placeholder label; confirm against FonduerDataset.
        dataloader = EmmentalDataLoader(
            task_to_label_dict={ATTRIBUTE: "labels"},
            dataset=FonduerDataset(
                ATTRIBUTE,
                split_cands[0],
                split_features[0],
                emb_layer.word2id,
                2,
            ),
            split="test",
            batch_size=100,
            shuffle=False,
        )
        preds = model.predict(dataloader, return_preds=True)
        # Keep candidates whose probability for the TRUE class exceeds 0.6.
        positive = np.where(
            np.array(preds["probs"][ATTRIBUTE])[:, TRUE] > 0.6)
        accepted = [split_cands[0][i] for i in positive[0]]
        if schema_filter:
            accepted = apply(accepted)
        return entity_level_f1(accepted,
                               gold_file,
                               ATTRIBUTE,
                               docs,
                               stations_mapping_dict=stations_mapping_dict)

    test_results = score_split(test_cands, F_test, test_docs)

    # Run on dev and train set for validation
    # We run the predictions also on our training and dev set, to validate
    # that everything seems to work smoothly
    dev_results = score_split(dev_cands, F_dev, dev_docs)
    train_results = score_split(train_cands, F_train, train_docs)

    return [train_results, dev_results, test_results]
def get_dataloaders(
    args,
    tasks,
    splits,
    entity_symbols,
    batch_on_the_fly_kg_adj,
):
    """Gets the dataloaders.

    One ``BootlegDataset`` is built per entry of ``splits`` and wrapped in an
    ``EmmentalDataLoader``. Splits listed in ``args.learner_config.train_split``
    get a random (or, when ``local_rank`` is set, distributed) sampler and the
    training batch size; all other splits get no sampler and the eval batch
    size when one is configured.

    Args:
        args: main args
        tasks: task names
        splits: data splits to generate dataloaders for
        entity_symbols: entity symbols
        batch_on_the_fly_kg_adj: kg embeddings metadata for
                                 the __get_item__ method (see get_dataloader_embeddings)

    Returns: list of dataloaders
    """
    task_to_label_dict = {t: NED_TASK_TO_LABEL[t] for t in tasks}
    bert_model = args.data_config.word_embedding.bert_model
    is_bert = len(bert_model) > 0
    tokenizer = BertTokenizer.from_pretrained(
        bert_model,
        # Lower-casing is keyed off the model name.
        do_lower_case="uncased" in bert_model,
        cache_dir=args.data_config.word_embedding.cache_dir,
    )

    datasets = {}
    for split in splits:
        split_config = args.data_config[f"{split}_dataset"]
        dataset_path = os.path.join(args.data_config.data_dir,
                                    split_config.file)
        datasets[split] = BootlegDataset(
            main_args=args,
            name="Bootleg",
            dataset=dataset_path,
            use_weak_label=split_config.use_weak_label,
            tokenizer=tokenizer,
            entity_symbols=entity_symbols,
            dataset_threads=args.run_config.dataset_threads,
            split=split,
            is_bert=is_bert,
            batch_on_the_fly_kg_adj=batch_on_the_fly_kg_adj,
        )

    dataloaders = []
    for split, dataset in datasets.items():
        is_train_split = split in args.learner_config.train_split
        is_distributed = Meta.config["learner_config"]["local_rank"] != -1

        if is_train_split:
            dataset_sampler = (DistributedSampler(dataset)
                               if is_distributed else RandomSampler(dataset))
        else:
            dataset_sampler = None
            if is_distributed:
                log_rank_0_info(
                    logger,
                    "You are using distributed computing for eval. We are not using a distributed sampler. "
                    "Please use DataParallel and not DDP.",
                )

        # Eval splits fall back to the training batch size when no dedicated
        # eval batch size is configured.
        if is_train_split or args.run_config.eval_batch_size is None:
            batch_size = args.train_config.batch_size
        else:
            batch_size = args.run_config.eval_batch_size

        dataloaders.append(
            EmmentalDataLoader(
                task_to_label_dict=task_to_label_dict,
                dataset=dataset,
                sampler=dataset_sampler,
                split=split,
                collate_fn=bootleg_collate_fn,
                batch_size=batch_size,
                num_workers=args.run_config.dataloader_threads,
                pin_memory=False,
            ))
        # The original message interleaved the sample and thread counts
        # ("with N and T threads samples"); the wording is fixed here.
        log_rank_0_info(
            logger,
            f"Built dataloader for {split} set with {len(dataset)} samples and "
            f"{args.run_config.dataloader_threads} threads "
            f"(Shuffle={is_train_split}, "
            f"Batch size={dataloaders[-1].batch_size}).",
        )

    return dataloaders
def test_e2e(caplog):
    """Run an end-to-end test.

    Trains a two-task model on synthetic 2-D points and checks that each task
    reaches a minimum accuracy and ROC-AUC on its own held-out test split.

    Fixes relative to the previous version of this test:
      * ``label2`` of the train split was built from ``Y1`` instead of ``Y2``.
      * ``label1`` of the test split was built from ``Y2`` instead of ``Y1``.
      * The train boundary used ``<=`` and therefore took 41 of the 50 samples
        (leaving 4 for dev) instead of the intended 40/5/5 split.
    """
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_e2e"

    Meta.reset()
    emmental.init(dirpath)

    # Generate synthetic data. Labels are 1/2 rather than 0/1; ce_loss below
    # shifts them back to 0/1 before computing the cross entropy.
    N = 50
    X = np.random.random((N, 2)) * 2 - 1
    Y1 = (X[:, 0] > X[:, 1] + 0.25).astype(int) + 1
    Y2 = (-X[:, 0] > X[:, 1] + 0.25).astype(int) + 1

    # Create dataset and dataloader
    splits = [0.8, 0.1, 0.1]
    train_end = int(round(N * splits[0]))
    dev_end = int(round(N * (splits[0] + splits[1])))
    bounds = {
        "train": slice(0, train_end),
        "valid": slice(train_end, dev_end),
        "test": slice(dev_end, N),
    }

    def make_dataloader(task_name, label_name, labels, split):
        """Build the dataloader of one task for one split."""
        part = bounds[split]
        dataset = EmmentalDataset(
            name="synthetic",
            X_dict={"data": [torch.Tensor(x) for x in X[part]]},
            Y_dict={label_name: torch.from_numpy(np.array(labels[part]))},
        )
        return EmmentalDataLoader(
            task_to_label_dict={task_name: label_name},
            dataset=dataset,
            split=split,
            batch_size=10,
        )

    train_dataloader1 = make_dataloader("task1", "label1", Y1, "train")
    dev_dataloader1 = make_dataloader("task1", "label1", Y1, "valid")
    test_dataloader1 = make_dataloader("task1", "label1", Y1, "test")

    train_dataloader2 = make_dataloader("task2", "label2", Y2, "train")
    dev_dataloader2 = make_dataloader("task2", "label2", Y2, "valid")
    test_dataloader2 = make_dataloader("task2", "label2", Y2, "test")

    # Create task
    def ce_loss(task_name, immediate_ouput_dict, Y, active):
        module_name = f"{task_name}_pred_head"
        return F.cross_entropy(
            immediate_ouput_dict[module_name][0][active], (Y.view(-1) - 1)[active]
        )

    def output(task_name, immediate_ouput_dict):
        module_name = f"{task_name}_pred_head"
        return F.softmax(immediate_ouput_dict[module_name][0], dim=1)

    def make_task(task_name):
        """Build a linear-encoder + linear-head classification task."""
        return EmmentalTask(
            name=task_name,
            module_pool=nn.ModuleDict(
                {
                    "input_module": nn.Linear(2, 8),
                    f"{task_name}_pred_head": nn.Linear(8, 2),
                }
            ),
            task_flow=[
                {
                    "name": "input",
                    "module": "input_module",
                    "inputs": [("_input_", "data")],
                },
                {
                    "name": f"{task_name}_pred_head",
                    "module": f"{task_name}_pred_head",
                    "inputs": [("input", 0)],
                },
            ],
            loss_func=partial(ce_loss, task_name),
            output_func=partial(output, task_name),
            scorer=Scorer(metrics=["accuracy", "roc_auc"]),
        )

    task1 = make_task("task1")
    task2 = make_task("task2")

    # Build model
    mtl_model = EmmentalModel(name="all", tasks=[task1, task2])

    # Create learner
    emmental_learner = EmmentalLearner()

    # Update learning config
    Meta.update_config(
        config={"learner_config": {"n_epochs": 10, "optimizer_config": {"lr": 0.01}}}
    )

    # Learning
    emmental_learner.learn(
        mtl_model,
        [train_dataloader1, train_dataloader2, dev_dataloader1, dev_dataloader2],
    )

    test1_score = mtl_model.score(test_dataloader1)
    test2_score = mtl_model.score(test_dataloader2)

    assert test1_score["task1/synthetic/test/accuracy"] >= 0.5
    assert test1_score["task1/synthetic/test/roc_auc"] >= 0.6
    assert test2_score["task2/synthetic/test/accuracy"] >= 0.5
    assert test2_score["task2/synthetic/test/roc_auc"] >= 0.6

    shutil.rmtree(dirpath)
def test_e2e(caplog):
    """Run an end-to-end test.

    Configures Emmental with tensorboard logging and checkpointing, trains a
    two-task model on synthetic 2-D points for three epochs and asserts the
    scores reported for each task's test dataloader.
    """
    caplog.set_level(logging.INFO)

    dirpath = "temp_test_e2e"
    use_exact_log_path = False
    Meta.reset()
    emmental.init(dirpath, use_exact_log_path=use_exact_log_path)

    checkpointer_config = {
        "checkpoint_path": None,
        "checkpoint_freq": 1,
        "checkpoint_metric": {"model/all/train/loss": "min"},
        "checkpoint_task_metrics": None,
        "checkpoint_runway": 1,
        "checkpoint_all": False,
        "clear_intermediate_checkpoints": True,
        "clear_all_checkpoints": True,
    }
    config = {
        "meta_config": {"seed": 0},
        "learner_config": {
            "n_epochs": 3,
            "optimizer_config": {"lr": 0.01, "grad_clip": 100},
        },
        "logging_config": {
            "counter_unit": "epoch",
            "evaluation_freq": 1,
            "writer_config": {"writer": "tensorboard", "verbose": True},
            "checkpointing": True,
            "checkpointer_config": checkpointer_config,
        },
    }
    emmental.Meta.update_config(config)

    # Generate synthetic data
    N = 500
    X = np.random.random((N, 2)) * 2 - 1
    Y1 = (X[:, 0] > X[:, 1] + 0.25).astype(int)
    Y2 = (X[:, 0] > X[:, 1] + 0.2).astype(int)
    X = [torch.Tensor(X[row]) for row in range(N)]

    # 80% train / 10% dev / 10% test, cut at fixed positions.
    train_end = int(0.8 * N)
    dev_end = int(0.9 * N)

    X_train = X[:train_end]
    X_dev = X[train_end:dev_end]
    X_test = X[dev_end:]

    Y1_train = torch.tensor(Y1[:train_end])
    Y1_dev = torch.tensor(Y1[train_end:dev_end])
    Y1_test = torch.tensor(Y1[dev_end:])

    Y2_train = torch.tensor(Y2[:train_end])
    Y2_dev = torch.tensor(Y2[train_end:dev_end])
    Y2_test = torch.tensor(Y2[dev_end:])

    # Create dataset and dataloader
    def build_dataset(features, label_name, labels):
        return EmmentalDataset(name="synthetic",
                               X_dict={"data": features},
                               Y_dict={label_name: labels})

    train_dataset1 = build_dataset(X_train, "label1", Y1_train)
    train_dataset2 = build_dataset(X_train, "label2", Y2_train)
    dev_dataset1 = build_dataset(X_dev, "label1", Y1_dev)
    dev_dataset2 = build_dataset(X_dev, "label2", Y2_dev)
    test_dataset1 = build_dataset(X_test, "label1", Y1_test)
    test_dataset2 = build_dataset(X_test, "label2", Y2_test)

    def build_loader(label_map, dataset, split):
        return EmmentalDataLoader(
            task_to_label_dict=label_map,
            dataset=dataset,
            split=split,
            batch_size=10,
        )

    task_to_label_dict = {"task1": "label1"}
    train_dataloader1 = build_loader(task_to_label_dict, train_dataset1, "train")
    dev_dataloader1 = build_loader(task_to_label_dict, dev_dataset1, "valid")
    test_dataloader1 = build_loader(task_to_label_dict, test_dataset1, "test")

    task_to_label_dict = {"task2": "label2"}
    train_dataloader2 = build_loader(task_to_label_dict, train_dataset2, "train")
    dev_dataloader2 = build_loader(task_to_label_dict, dev_dataset2, "valid")
    test_dataloader2 = build_loader(task_to_label_dict, test_dataset2, "test")

    # Create task
    def ce_loss(task_name, immediate_ouput_dict, Y, active):
        head = f"{task_name}_pred_head"
        logits = immediate_ouput_dict[head][0][active]
        targets = (Y.view(-1))[active]
        return F.cross_entropy(logits, targets)

    def output(task_name, immediate_ouput_dict):
        head = f"{task_name}_pred_head"
        return F.softmax(immediate_ouput_dict[head][0], dim=1)

    task_metrics = {"task1": ["accuracy"], "task2": ["accuracy", "roc_auc"]}

    tasks = []
    for task_name in ["task1", "task2"]:
        head_name = f"{task_name}_pred_head"
        tasks.append(
            EmmentalTask(
                name=task_name,
                module_pool=nn.ModuleDict({
                    "input_module": nn.Linear(2, 8),
                    head_name: nn.Linear(8, 2),
                }),
                task_flow=[
                    {
                        "name": "input",
                        "module": "input_module",
                        "inputs": [("_input_", "data")],
                    },
                    {
                        "name": head_name,
                        "module": head_name,
                        "inputs": [("input", 0)],
                    },
                ],
                loss_func=partial(ce_loss, task_name),
                output_func=partial(output, task_name),
                scorer=Scorer(metrics=task_metrics[task_name]),
            ))

    # Build model
    mtl_model = EmmentalModel(name="all", tasks=tasks)

    # Create learner
    emmental_learner = EmmentalLearner()

    # Learning
    learn_loaders = [
        train_dataloader1,
        train_dataloader2,
        dev_dataloader1,
        dev_dataloader2,
    ]
    emmental_learner.learn(mtl_model, learn_loaders)

    test1_score = mtl_model.score(test_dataloader1)
    test2_score = mtl_model.score(test_dataloader2)

    task1_accuracy = test1_score["task1/synthetic/test/accuracy"]
    assert task1_accuracy >= 0.7
    # task1 reports a single metric, so the macro average must equal it.
    assert test1_score["model/all/test/macro_average"] == task1_accuracy
    assert test2_score["task2/synthetic/test/accuracy"] >= 0.7
    assert test2_score["task2/synthetic/test/roc_auc"] >= 0.7

    shutil.rmtree(dirpath)
name=args.task_name, data_path=data_path, input_field=args.input_field, label_fields=args.label_fields, split=split, tokenizer=tokenizer, max_data_samples=args. max_data_samples, # if split == "train" else None, max_seq_length=args.max_seq_length, ) logger.info(f"Loaded {split} containing {len(dataset)} samples.") dataloaders.append( EmmentalDataLoader( task_to_label_dict={args.task_name: "labels"}, dataset=dataset, split=split, shuffle=True if split == "train" else False, batch_size=args.batch_size, # num_workers=8, )) logger.info(f"Built dataloader for {dataset.name} {split} set.") # Build Emmental model model = EmmentalModel(name=args.task_name, tasks=create_task(args)) # Load the pre-trained model if Meta.config["model_config"]["model_path"]: model.load(Meta.config["model_config"]["model_path"]) # Training if args.train: emmental_learner = EmmentalLearner()
def main(
    conn_string,
    gain=False,
    current=False,
    max_docs=float("inf"),
    parse=False,
    first_time=False,
    re_label=False,
    parallel=8,
    log_dir="logs",
    verbose=False,
):
    """Run the extraction, training and evaluation pipeline end to end.

    The stages are: parsing, mention and candidate extraction, featurization,
    label (marginal) creation from gold annotations, training, and finally
    scoring plus CSV dumps per relation.

    Args:
        conn_string: Database connection string passed to ``Meta.init``.
        gain: Enable the "gain" relation.
        current: Enable the "current" relation.
        max_docs: Forwarded to ``parse_dataset``.
        parse: Forwarded to ``parse_dataset`` as its ``first_time`` flag.
        first_time: When true, run mention/candidate extraction and
            featurization, build the marginals and write the ``*.pkl`` caches
            next to this file. When false, load those caches instead.
        re_label: Accepted for backward compatibility; currently unused
            because labeling functions are not applied in this pipeline.
        parallel: Parallelism for parsing and extraction.
        log_dir: Log directory, relative to this file's directory. Falsy
            values fall back to ``"logs"``.
        verbose: Log at INFO level instead of WARNING.
    """
    # Setup initial configuration
    if not log_dir:
        log_dir = "logs"

    level = logging.INFO if verbose else logging.WARNING

    dirname = os.path.dirname(os.path.abspath(__file__))
    init_logging(log_dir=os.path.join(dirname, log_dir), level=level)

    def _dump_pickle(obj, filename):
        """Write ``obj`` to ``filename`` next to this file."""
        with open(os.path.join(dirname, filename), "wb") as handle:
            pickle.dump(obj, handle)

    def _load_pickle(filename):
        """Read a cache previously written by ``_dump_pickle``."""
        # NOTE: pickle must only be used on trusted files; these caches are
        # expected to have been produced by an earlier first_time run.
        with open(os.path.join(dirname, filename), "rb") as handle:
            return pickle.load(handle)

    def _total(cands_per_relation):
        """Count candidates across however many relations are enabled."""
        return sum(len(relation_cands) for relation_cands in cands_per_relation)

    rel_list = []
    if gain:
        rel_list.append("gain")

    if current:
        rel_list.append("current")

    logger.info("=" * 30)
    logger.info(f"Running with parallel: {parallel}, max_docs: {max_docs}")

    session = Meta.init(conn_string).Session()

    # Parsing
    start = timer()
    logger.info("Starting parsing...")
    docs, train_docs, dev_docs, test_docs = parse_dataset(session,
                                                          dirname,
                                                          first_time=parse,
                                                          parallel=parallel,
                                                          max_docs=max_docs)
    logger.debug("Done")
    end = timer()
    logger.warning(f"Parse Time (min): {((end - start) / 60.0):.1f}")

    logger.info(f"# of Documents: {len(docs)}")
    logger.info(f"# of train Documents: {len(train_docs)}")
    logger.info(f"# of dev Documents: {len(dev_docs)}")
    logger.info(f"# of test Documents: {len(test_docs)}")
    logger.info(f"Documents: {session.query(Document).count()}")
    logger.info(f"Sections: {session.query(Section).count()}")
    logger.info(f"Paragraphs: {session.query(Paragraph).count()}")
    logger.info(f"Sentences: {session.query(Sentence).count()}")
    logger.info(f"Figures: {session.query(Figure).count()}")

    # Mention Extraction
    start = timer()
    mentions = []
    ngrams = []
    matchers = []

    # Only do those that are enabled
    if gain:
        Gain = mention_subclass("Gain")
        gain_matcher = get_gain_matcher()
        gain_ngrams = MentionNgrams(n_max=2)
        mentions.append(Gain)
        ngrams.append(gain_ngrams)
        matchers.append(gain_matcher)

    if current:
        Current = mention_subclass("SupplyCurrent")
        current_matcher = get_supply_current_matcher()
        current_ngrams = MentionNgramsCurrent(n_max=3)
        mentions.append(Current)
        ngrams.append(current_ngrams)
        matchers.append(current_matcher)

    mention_extractor = MentionExtractor(session, mentions, ngrams, matchers)

    if first_time:
        mention_extractor.apply(docs, parallelism=parallel)

    logger.info(f"Total Mentions: {session.query(Mention).count()}")

    if gain:
        logger.info(f"Total Gain: {session.query(Gain).count()}")

    if current:
        logger.info(f"Total Current: {session.query(Current).count()}")

    cand_classes = []
    if gain:
        GainCand = candidate_subclass("GainCand", [Gain])
        cand_classes.append(GainCand)

    if current:
        CurrentCand = candidate_subclass("CurrentCand", [Current])
        cand_classes.append(CurrentCand)

    candidate_extractor = CandidateExtractor(session, cand_classes)

    if first_time:
        # Split index: 0 = train, 1 = dev, 2 = test.
        for split_idx, split_docs in enumerate(
                [train_docs, dev_docs, test_docs]):
            candidate_extractor.apply(split_docs,
                                      split=split_idx,
                                      parallelism=parallel)

    # These must be sorted for deterministic behavior.
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)

    # Summing over the returned lists (instead of indexing [0] and [1]) keeps
    # this working when only one of gain/current is enabled.
    logger.info(f"Total train candidate: {_total(train_cands)}")
    logger.info(f"Total dev candidate: {_total(dev_cands)}")
    logger.info(f"Total test candidate: {_total(test_cands)}")

    logger.info("Done w/ candidate extraction.")
    end = timer()
    logger.warning(f"CE Time (min): {((end - start) / 60.0):.1f}")

    # NOTE: a total-recall diagnostic (entity_level_scores over the dev/test
    # candidates of each relation) used to live here as commented-out code.

    start = timer()
    # Using parallelism = 1 for deterministic behavior.
    featurizer = Featurizer(session, cand_classes, parallelism=1)

    if first_time:
        logger.info("Starting featurizer...")
        # Set feature space based on dev set, which we use for training rather
        # than the large train set.
        featurizer.apply(split=1, train=True)
        featurizer.apply(split=0)
        featurizer.apply(split=2)
        logger.info("Done")

    logger.info("Getting feature matrices...")
    # Serialize feature matrices on first run
    if first_time:
        F_train = featurizer.get_feature_matrices(train_cands)
        F_dev = featurizer.get_feature_matrices(dev_cands)
        F_test = featurizer.get_feature_matrices(test_cands)
        end = timer()
        logger.warning(
            f"Featurization Time (min): {((end - start) / 60.0):.1f}")

        # Cache the matrices keyed by relation name so a later run with a
        # different set of enabled relations can still pick out its own.
        F_train_dict = {}
        F_dev_dict = {}
        F_test_dict = {}
        for idx, relation in enumerate(rel_list):
            F_train_dict[relation] = F_train[idx]
            F_dev_dict[relation] = F_dev[idx]
            F_test_dict[relation] = F_test[idx]

        _dump_pickle(F_train_dict, "F_train_dict.pkl")
        _dump_pickle(F_dev_dict, "F_dev_dict.pkl")
        _dump_pickle(F_test_dict, "F_test_dict.pkl")
    else:
        F_train_dict = _load_pickle("F_train_dict.pkl")
        F_dev_dict = _load_pickle("F_dev_dict.pkl")
        F_test_dict = _load_pickle("F_test_dict.pkl")

        F_train = [F_train_dict[relation] for relation in rel_list]
        F_dev = [F_dev_dict[relation] for relation in rel_list]
        F_test = [F_test_dict[relation] for relation in rel_list]

    logger.info("Done.")

    start = timer()
    logger.info("Labeling training data...")
    # NOTE: applying labeling functions through a Labeler is disabled (it used
    # to live here as commented-out code). Marginals are instead created
    # directly from the human (gold) annotations of the dev set below.

    if first_time:
        marginals_dict = {}
        for idx, relation in enumerate(rel_list):
            # Manually create marginals from human annotations
            is_gain = relation == "gain"
            dev_gold_entities = get_gold_set(is_gain=is_gain)
            marginal = []
            for c in dev_cands[idx]:
                # A candidate is positive if any of its entities is gold.
                is_gold = any(entity in dev_gold_entities
                              for entity in cand_to_entity(c, is_gain=is_gain))
                marginal.append([0.0, 1.0] if is_gold else [1.0, 0.0])

            marginals_dict[relation] = np.array(marginal)

        _dump_pickle(marginals_dict, "marginals_dict.pkl")
    else:
        marginals_dict = _load_pickle("marginals_dict.pkl")

    marginals = [marginals_dict[relation] for relation in rel_list]

    end = timer()
    logger.warning(
        f"Weak Supervision Time (min): {((end - start) / 60.0):.1f}")

    start = timer()

    word_counter = collect_word_counter(train_cands)

    # Training config
    config = {
        "meta_config": {
            "verbose": True,
            "seed": 30
        },
        "model_config": {
            "model_path": None,
            "device": 0,
            "dataparallel": False
        },
        "learner_config": {
            "n_epochs": 500,
            "optimizer_config": {
                "lr": 0.001,
                "l2": 0.005
            },
            "task_scheduler": "round_robin",
        },
        "logging_config": {
            "evaluation_freq": 1,
            "counter_unit": "epoch",
            "checkpointing": False,
            "checkpointer_config": {
                "checkpoint_metric": {
                    "model/all/train/loss": "min"
                },
                "checkpoint_freq": 1,
                "checkpoint_runway": 2,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.init(log_dir=Meta.log_path, config=config)

    # Generate word embedding module
    arity = 2
    # Generate special tokens: an opening and a closing marker per argument.
    specials = []
    for i in range(arity):
        specials += [f"~~[[{i}", f"{i}]]~~"]

    emb_layer = EmbeddingModule(word_counter=word_counter,
                                word_dim=300,
                                specials=specials)

    train_idxs = []
    train_dataloaders = []
    for idx, relation in enumerate(rel_list):
        # Keep only rows whose two marginal entries differ.
        diffs = marginals[idx].max(axis=1) - marginals[idx].min(axis=1)
        train_idxs.append(np.where(diffs > 1e-6)[0])

        # only uses dev set as training data, with human annotations
        train_dataloaders.append(
            EmmentalDataLoader(
                task_to_label_dict={relation: "labels"},
                dataset=FonduerDataset(
                    relation,
                    dev_cands[idx],
                    F_dev[idx],
                    emb_layer.word2id,
                    marginals[idx],
                    train_idxs[idx],
                ),
                split="train",
                batch_size=256,
                shuffle=True,
            ))

    num_feature_keys = len(featurizer.get_keys())

    model = EmmentalModel(name="opamp_tasks")

    # List relation names, arities, list of classes
    tasks = create_task(
        rel_list,
        [2] * len(rel_list),
        num_feature_keys,
        [2] * len(rel_list),
        emb_layer,
        model="LogisticRegression",
    )

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()

    # If given a list of multi, will train on multiple
    emmental_learner.learn(model, train_dataloaders)

    def _eval_loader(relation, relation_cands, features, split):
        """Build an unshuffled prediction dataloader for one relation/split."""
        return EmmentalDataLoader(
            task_to_label_dict={relation: "labels"},
            dataset=FonduerDataset(relation, relation_cands, features,
                                   emb_layer.word2id, 2),
            split=split,
            batch_size=256,
            shuffle=False,
        )

    # List of dataloader for each relation
    for idx, relation in enumerate(rel_list):
        is_gain = relation == "gain"

        test_preds = model.predict(
            _eval_loader(relation, test_cands[idx], F_test[idx], "test"),
            return_preds=True)

        # The return value is not used here; the call is kept as in the
        # original pipeline.
        scoring(
            test_preds,
            test_cands[idx],
            test_docs,
            is_gain=is_gain,
            num=100,
        )

        # Dump CSV files for analysis. rel_list only ever contains "gain" and
        # "current", so the file prefix is the relation name itself.
        train_preds = model.predict(
            _eval_loader(relation, train_cands[idx], F_train[idx], "train"),
            return_preds=True)
        Y_prob = np.array(train_preds["probs"][relation])[:, TRUE]
        output_csv(train_cands[idx], Y_prob, is_gain=is_gain)

        Y_prob = np.array(test_preds["probs"][relation])[:, TRUE]
        output_csv(test_cands[idx], Y_prob, is_gain=is_gain, append=True)
        dump_candidates(test_cands[idx],
                        Y_prob,
                        f"{relation}_test_probs.csv",
                        is_gain=is_gain)

        dev_preds = model.predict(
            _eval_loader(relation, dev_cands[idx], F_dev[idx], "dev"),
            return_preds=True)
        Y_prob = np.array(dev_preds["probs"][relation])[:, TRUE]
        output_csv(dev_cands[idx], Y_prob, is_gain=is_gain, append=True)
        dump_candidates(dev_cands[idx],
                        Y_prob,
                        f"{relation}_dev_probs.csv",
                        is_gain=is_gain)

    end = timer()
    logger.warning(
        f"Classification AND dump data Time (min): {((end - start) / 60.0):.1f}"
    )
task_list = args.tasks for task in task_list: assert(task in all_tasks) task_to_label_dict = {task_name: task_name for task_name in task_list} print(task_to_label_dict) # Building dataloaders dataloaders = [] for split in ["train", "val", "test"]: dataloaders.append( EmmentalDataLoader( task_to_label_dict=task_to_label_dict, dataset=datasets[split], split=split, shuffle=True if split == "train" else False, batch_size=BATCH_SIZES[split], num_workers=8, ) ) logger.info(f"Built dataloader for {datasets[split].name} {split} set.") # Building Emmental tasks input_shape = (3, 224, 224) cnn_module = TorchVisionEncoder(CNN_ENCODER, pretrained=True) classification_layer_dim = cnn_module.get_frm_output_size(input_shape) tasks = [ EmmentalTask(
def test_emmental_dataloader(caplog):
    """Check batching, padding and live dataset updates of EmmentalDataLoader."""
    caplog.set_level(logging.INFO)

    def first_batch(loader):
        # Start a fresh iterator each time so we always read the first batch.
        return next(iter(loader))

    # Five variable-length sequences: [1], [1, 2], ..., [1, 2, 3, 4, 5].
    x1 = [torch.Tensor(list(range(1, size + 1))) for size in range(1, 6)]
    y1 = torch.Tensor([0] * 5)

    # The same sequences, longest first.
    x2 = [torch.Tensor(list(range(1, size + 1))) for size in range(5, 0, -1)]
    y2 = torch.Tensor([1] * 5)

    dataset = EmmentalDataset(
        X_dict={"data1": x1, "data2": x2},
        Y_dict={"label1": y1, "label2": y2},
        name="new_data",
    )

    # Expected first batches of "data2" for batch sizes 2 and 3.
    data2_first_two = torch.Tensor([[1, 2, 3, 4, 5], [1, 2, 3, 4, 0]])
    data2_first_three = torch.Tensor(
        [[1, 2, 3, 4, 5], [1, 2, 3, 4, 0], [1, 2, 3, 0, 0]]
    )

    dataloader1 = EmmentalDataLoader(
        task_to_label_dict={"task1": "label1"},
        dataset=dataset,
        split="train",
        batch_size=2,
    )
    features, labels = first_batch(dataloader1)

    # Check if the dataloader is correctly constructed
    assert dataloader1.task_to_label_dict == {"task1": "label1"}
    assert dataloader1.split == "train"
    assert torch.equal(features["data1"], torch.Tensor([[1, 0], [1, 2]]))
    assert torch.equal(features["data2"], data2_first_two)
    assert torch.equal(labels["label1"], torch.Tensor([0, 0]))
    assert torch.equal(labels["label2"], torch.Tensor([1, 1]))

    dataloader2 = EmmentalDataLoader(
        task_to_label_dict={"task2": "label2"},
        dataset=dataset,
        split="test",
        batch_size=3,
    )
    features, labels = first_batch(dataloader2)

    # Check if the dataloader with a different batch size is correctly
    # constructed
    assert dataloader2.task_to_label_dict == {"task2": "label2"}
    assert dataloader2.split == "test"
    assert torch.equal(
        features["data1"], torch.Tensor([[1, 0, 0], [1, 2, 0], [1, 2, 3]])
    )
    assert torch.equal(features["data2"], data2_first_three)
    assert torch.equal(labels["label1"], torch.Tensor([0, 0, 0]))
    assert torch.equal(labels["label2"], torch.Tensor([1, 1, 1]))

    # Swap the labels on the shared dataset after both loaders were built.
    y3 = [torch.Tensor([2]) for _ in range(5)]
    dataset.Y_dict["label2"] = y3

    # Check the dataloaders pick up the updated dataset
    features, labels = first_batch(dataloader1)
    assert torch.equal(features["data2"], data2_first_two)
    assert torch.equal(labels["label2"], torch.Tensor([[2], [2]]))

    features, labels = first_batch(dataloader2)
    assert torch.equal(features["data2"], data2_first_three)
    assert torch.equal(labels["label2"], torch.Tensor([[2], [2], [2]]))
def main(
    conn_string,
    max_docs=float("inf"),
    parse=False,
    first_time=False,
    gpu=None,
    parallel=4,
    log_dir=None,
    verbose=False,
):
    """Run the thumbnail pipeline: parse, extract, label, train and score.

    Args:
        conn_string: Connection string handed to ``Meta.init``.
        max_docs: Forwarded to ``parse_dataset`` as ``max_docs``.
        parse: Accepted for interface compatibility; not read by this
            function.
        first_time: Forwarded to ``parse_dataset`` and, when true, also runs
            mention and candidate extraction before candidates are fetched.
        gpu: Accepted for interface compatibility; not read by this function.
        parallel: Parallelism passed to ``parse_dataset`` and to the mention
            and candidate extractors.
        log_dir: Log directory relative to this file; defaults to ``"logs"``.
        verbose: Log at INFO level when true, WARNING otherwise.
    """
    log_dir = log_dir or "logs"
    level = logging.INFO if verbose else logging.WARNING

    dirname = os.path.dirname(os.path.abspath(__file__))
    init_logging(log_dir=os.path.join(dirname, log_dir), level=level)

    session = Meta.init(conn_string).Session()

    # Parsing
    logger.info("Starting parsing...")
    start = timer()
    docs, train_docs, dev_docs, test_docs = parse_dataset(
        session, dirname, first_time=first_time, parallel=parallel, max_docs=max_docs
    )
    end = timer()
    logger.warning(f"Parse Time (min): {((end - start) / 60.0):.1f}")

    logger.info(f"# of train Documents: {len(train_docs)}")
    logger.info(f"# of dev Documents: {len(dev_docs)}")
    logger.info(f"# of test Documents: {len(test_docs)}")
    logger.info(f"Documents: {session.query(Document).count()}")
    logger.info(f"Sections: {session.query(Section).count()}")
    logger.info(f"Paragraphs: {session.query(Paragraph).count()}")
    logger.info(f"Sentences: {session.query(Sentence).count()}")
    logger.info(f"Figures: {session.query(Figure).count()}")

    # Mention and candidate extraction
    start = timer()

    Thumbnails = mention_subclass("Thumbnails")
    thumbnails_img = MentionFigures()

    class HasFigures(_Matcher):
        """Match figures whose image file exists and exceeds 50px per side."""

        def _f(self, m):
            file_path = ""
            # The figure may live under any split directory; the last
            # existing location wins.
            for prefix in [
                f"{dirname}/data/train/html/",
                f"{dirname}/data/dev/html/",
                f"{dirname}/data/test/html/",
            ]:
                if os.path.exists(prefix + m.figure.url):
                    file_path = prefix + m.figure.url
            if file_path == "":
                return False
            # Close the image right after reading its size so that one file
            # handle is not leaked per mention.
            with Image.open(file_path) as img:
                width, height = img.size
            return min(width, height) > 50

    mention_extractor = MentionExtractor(
        session, [Thumbnails], [thumbnails_img], [HasFigures()], parallelism=parallel
    )

    if first_time:
        mention_extractor.apply(docs)

    logger.info(f"Total Mentions: {session.query(Mention).count()}")

    ThumbnailLabel = candidate_subclass("ThumbnailLabel", [Thumbnails])

    candidate_extractor = CandidateExtractor(
        session, [ThumbnailLabel], throttlers=[None], parallelism=parallel
    )

    if first_time:
        candidate_extractor.apply(train_docs, split=0)
        candidate_extractor.apply(dev_docs, split=1)
        candidate_extractor.apply(test_docs, split=2)

    train_cands = candidate_extractor.get_candidates(split=0)
    # Sort the dev_cands, which are used for training, for deterministic behavior
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2)

    end = timer()
    logger.warning(f"Candidate Extraction Time (min): {((end - start) / 60.0):.1f}")

    logger.info(f"Total train candidate:\t{len(train_cands[0])}")
    logger.info(f"Total dev candidate:\t{len(dev_cands[0])}")
    logger.info(f"Total test candidate:\t{len(test_cands[0])}")

    # Each ground-truth line is lower-cased and its whitespace-separated
    # fields joined with "::". The context manager guarantees the file is
    # closed even if reading fails.
    with open(f"{dirname}/data/ground_truth.txt", "r") as fin:
        gt = {"::".join(line.lower().split()) for line in fin}

    # Labeling
    start = timer()

    def LF_gt_label(c):
        """Return TRUE when the candidate's document/figure key is in ``gt``."""
        doc_file_id = (
            f"{c[0].context.figure.document.name.lower()}.pdf::"
            f"{os.path.basename(c[0].context.figure.url.lower())}"
        )
        return TRUE if doc_file_id in gt else FALSE

    gt_dev = [LF_gt_label(cand) for cand in dev_cands[0]]
    gt_test = [LF_gt_label(cand) for cand in test_cands[0]]

    end = timer()
    logger.warning(f"Supervision Time (min): {((end - start) / 60.0):.1f}")

    batch_size = 64
    input_size = 224
    K = 2

    emmental.init(log_dir=Meta.log_path, config=emmental_config)

    emmental.Meta.config["learner_config"]["task_scheduler_config"][
        "task_scheduler"
    ] = DauphinScheduler(augment_k=K, enlarge=1)

    # Both the train and valid datasets are built from the dev candidates;
    # only the train one is augmented and uses probabilistic labels.
    train_dataset = ThumbnailDataset(
        "Thumbnail",
        dev_cands[0],
        gt_dev,
        "train",
        prob_label=True,
        prefix=f"{dirname}/data/dev/html/",
        input_size=input_size,
        transform_cls=Augmentation(2),
        k=K,
    )

    val_dataset = ThumbnailDataset(
        "Thumbnail",
        dev_cands[0],
        gt_dev,
        "valid",
        prob_label=False,
        prefix=f"{dirname}/data/dev/html/",
        input_size=input_size,
        k=1,
    )

    test_dataset = ThumbnailDataset(
        "Thumbnail",
        test_cands[0],
        gt_test,
        "test",
        prob_label=False,
        prefix=f"{dirname}/data/test/html/",
        input_size=input_size,
        k=1,
    )

    # One loader per split, in train/valid/test order; only train is shuffled.
    dataloaders = [
        EmmentalDataLoader(
            task_to_label_dict={"Thumbnail": "labels"},
            dataset=split_dataset,
            split=split,
            shuffle=(split == "train"),
            batch_size=batch_size,
            num_workers=1,
        )
        for split, split_dataset in (
            ("train", train_dataset),
            ("valid", val_dataset),
            ("test", test_dataset),
        )
    ]

    model = EmmentalModel(name="Thumbnail")
    model.add_task(
        create_task("Thumbnail", n_class=2, model="resnet18", pretrained=True)
    )

    emmental_learner = EmmentalLearner()
    emmental_learner.learn(model, dataloaders)

    scores = model.score(dataloaders)

    logger.warning("Model Score:")
    logger.warning(f"precision: {scores['Thumbnail/Thumbnail/test/precision']:.3f}")
    logger.warning(f"recall: {scores['Thumbnail/Thumbnail/test/recall']:.3f}")
    logger.warning(f"f1: {scores['Thumbnail/Thumbnail/test/f1']:.3f}")
specials += [f"~~[[{i}", f"{i}]]~~"] emb_layer = EmbeddingModule( word_counter=word_counter, word_dim=300, specials=specials ) diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1) train_idxs = np.where(diffs > 1e-6)[0] train_dataloader = EmmentalDataLoader( task_to_label_dict={ATTRIBUTE: "labels"}, dataset=FonduerDataset( ATTRIBUTE, train_cands[0], F_train[0], emb_layer.word2id, train_marginals, train_idxs, ), split="train", batch_size=100, shuffle=True, ) tasks = create_task( ATTRIBUTE, 2, F_train[0].shape[1], 2, emb_layer, model="LogisticRegression" ) emmental_model = EmmentalModel() for task in tasks: emmental_model.add_task(task)
def main(
    conn_string,
    stg_temp_min=False,
    stg_temp_max=False,
    polarity=False,
    ce_v_max=False,
    max_docs=float("inf"),
    parse=False,
    first_time=False,
    re_label=False,
    parallel=4,
    log_dir=None,
    verbose=False,
):
    """Run the transistor relation-extraction pipeline end to end.

    Args:
        conn_string: Connection string handed to ``Meta.init``.
        stg_temp_min: Enable the ``stg_temp_min`` relation.
        stg_temp_max: Enable the ``stg_temp_max`` relation.
        polarity: Enable the ``polarity`` relation.
        ce_v_max: Enable the ``ce_v_max`` relation.
        max_docs: Forwarded to ``parse_dataset`` as ``max_docs``.
        parse: Forwarded to ``parse_dataset`` as its ``first_time`` argument.
        first_time: When true, run mention/candidate extraction,
            featurization, LF application and marginal computation, and
            write the feature and marginal pickles. Otherwise those pickles
            are loaded from files next to this module.
        re_label: When ``first_time`` is false, update the stored labels with
            the current labeling functions.
        parallel: Parallelism for parsing and for mention and candidate
            extraction.
        log_dir: Log directory relative to this file; defaults to ``"logs"``.
        verbose: Log at INFO level when true, WARNING otherwise.
    """
    log_dir = log_dir or "logs"
    level = logging.INFO if verbose else logging.WARNING

    dirname = os.path.dirname(os.path.abspath(__file__))
    init_logging(log_dir=os.path.join(dirname, log_dir), level=level)

    def _dump_pickle(obj, filename):
        """Pickle ``obj`` next to this module, closing the file afterwards."""
        with open(os.path.join(dirname, filename), "wb") as f:
            pickle.dump(obj, f)

    def _load_pickle(filename):
        """Load a pickle stored next to this module, closing the file."""
        # NOTE: pickle executes arbitrary code on load; these files are
        # expected to be the ones written by a previous first_time run.
        with open(os.path.join(dirname, filename), "rb") as f:
            return pickle.load(f)

    rel_list = []
    if stg_temp_min:
        rel_list.append("stg_temp_min")

    if stg_temp_max:
        rel_list.append("stg_temp_max")

    if polarity:
        rel_list.append("polarity")

    if ce_v_max:
        rel_list.append("ce_v_max")

    session = Meta.init(conn_string).Session()

    # Parsing
    logger.info("Starting parsing...")
    start = timer()
    docs, train_docs, dev_docs, test_docs = parse_dataset(
        session, dirname, first_time=parse, parallel=parallel, max_docs=max_docs
    )
    end = timer()
    logger.warning(f"Parse Time (min): {((end - start) / 60.0):.1f}")

    logger.info(f"# of train Documents: {len(train_docs)}")
    logger.info(f"# of dev Documents: {len(dev_docs)}")
    logger.info(f"# of test Documents: {len(test_docs)}")

    logger.info(f"Documents: {session.query(Document).count()}")
    logger.info(f"Sections: {session.query(Section).count()}")
    logger.info(f"Paragraphs: {session.query(Paragraph).count()}")
    logger.info(f"Sentences: {session.query(Sentence).count()}")
    logger.info(f"Figures: {session.query(Figure).count()}")

    # Mention Extraction
    start = timer()
    mentions = []
    ngrams = []
    matchers = []

    # Only do those that are enabled
    Part = mention_subclass("Part")
    part_matcher = get_matcher("part")
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)

    mentions.append(Part)
    ngrams.append(part_ngrams)
    matchers.append(part_matcher)

    if stg_temp_min:
        StgTempMin = mention_subclass("StgTempMin")
        stg_temp_min_matcher = get_matcher("stg_temp_min")
        stg_temp_min_ngrams = MentionNgramsTemp(n_max=2)

        mentions.append(StgTempMin)
        ngrams.append(stg_temp_min_ngrams)
        matchers.append(stg_temp_min_matcher)

    if stg_temp_max:
        StgTempMax = mention_subclass("StgTempMax")
        stg_temp_max_matcher = get_matcher("stg_temp_max")
        stg_temp_max_ngrams = MentionNgramsTemp(n_max=2)

        mentions.append(StgTempMax)
        ngrams.append(stg_temp_max_ngrams)
        matchers.append(stg_temp_max_matcher)

    if polarity:
        Polarity = mention_subclass("Polarity")
        polarity_matcher = get_matcher("polarity")
        polarity_ngrams = MentionNgrams(n_max=1)

        mentions.append(Polarity)
        ngrams.append(polarity_ngrams)
        matchers.append(polarity_matcher)

    if ce_v_max:
        CeVMax = mention_subclass("CeVMax")
        ce_v_max_matcher = get_matcher("ce_v_max")
        ce_v_max_ngrams = MentionNgramsVolt(n_max=1)

        mentions.append(CeVMax)
        ngrams.append(ce_v_max_ngrams)
        matchers.append(ce_v_max_matcher)

    mention_extractor = MentionExtractor(session, mentions, ngrams, matchers)

    if first_time:
        mention_extractor.apply(docs, parallelism=parallel)

    logger.info(f"Total Mentions: {session.query(Mention).count()}")
    logger.info(f"Total Part: {session.query(Part).count()}")
    if stg_temp_min:
        logger.info(f"Total StgTempMin: {session.query(StgTempMin).count()}")
    if stg_temp_max:
        logger.info(f"Total StgTempMax: {session.query(StgTempMax).count()}")
    if polarity:
        logger.info(f"Total Polarity: {session.query(Polarity).count()}")
    if ce_v_max:
        logger.info(f"Total CeVMax: {session.query(CeVMax).count()}")

    # Candidate Extraction
    cands = []
    throttlers = []
    if stg_temp_min:
        PartStgTempMin = candidate_subclass("PartStgTempMin", [Part, StgTempMin])
        stg_temp_min_throttler = stg_temp_filter

        cands.append(PartStgTempMin)
        throttlers.append(stg_temp_min_throttler)

    if stg_temp_max:
        PartStgTempMax = candidate_subclass("PartStgTempMax", [Part, StgTempMax])
        stg_temp_max_throttler = stg_temp_filter

        cands.append(PartStgTempMax)
        throttlers.append(stg_temp_max_throttler)

    if polarity:
        PartPolarity = candidate_subclass("PartPolarity", [Part, Polarity])
        polarity_throttler = polarity_filter

        cands.append(PartPolarity)
        throttlers.append(polarity_throttler)

    if ce_v_max:
        PartCeVMax = candidate_subclass("PartCeVMax", [Part, CeVMax])
        ce_v_max_throttler = ce_v_max_filter

        cands.append(PartCeVMax)
        throttlers.append(ce_v_max_throttler)

    candidate_extractor = CandidateExtractor(session, cands, throttlers=throttlers)

    if first_time:
        # Use a distinct loop name so the full ``docs`` collection returned
        # by parse_dataset is not shadowed.
        for i, split_docs in enumerate([train_docs, dev_docs, test_docs]):
            candidate_extractor.apply(split_docs, split=i, parallelism=parallel)
            num_cands = session.query(Candidate).filter(Candidate.split == i).count()
            logger.info(f"Candidates in split={i}: {num_cands}")

    # These must be sorted for deterministic behavior.
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)

    end = timer()
    logger.warning(f"Candidate Extraction Time (min): {((end - start) / 60.0):.1f}")

    logger.info(f"Total train candidate: {sum(len(_) for _ in train_cands)}")
    logger.info(f"Total dev candidate: {sum(len(_) for _ in dev_cands)}")
    logger.info(f"Total test candidate: {sum(len(_) for _ in test_cands)}")

    pickle_file = os.path.join(dirname, "data/parts_by_doc_new.pkl")
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    # Check total recall
    for i, name in enumerate(rel_list):
        logger.info(name)
        result = entity_level_scores(
            candidates_to_entities(dev_cands[i], parts_by_doc=parts_by_doc),
            attribute=name,
            corpus=dev_docs,
        )
        logger.info(f"{name} Total Dev Recall: {result.rec:.3f}")
        result = entity_level_scores(
            candidates_to_entities(test_cands[i], parts_by_doc=parts_by_doc),
            attribute=name,
            corpus=test_docs,
        )
        logger.info(f"{name} Total Test Recall: {result.rec:.3f}")

    # Featurization
    start = timer()
    cands = []
    if stg_temp_min:
        cands.append(PartStgTempMin)

    if stg_temp_max:
        cands.append(PartStgTempMax)

    if polarity:
        cands.append(PartPolarity)

    if ce_v_max:
        cands.append(PartCeVMax)

    # Using parallelism = 1 for deterministic behavior.
    featurizer = Featurizer(session, cands, parallelism=1)
    if first_time:
        logger.info("Starting featurizer...")
        featurizer.apply(split=0, train=True)
        featurizer.apply(split=1)
        featurizer.apply(split=2)
        logger.info("Done")

    logger.info("Getting feature matrices...")
    if first_time:
        F_train = featurizer.get_feature_matrices(train_cands)
        F_dev = featurizer.get_feature_matrices(dev_cands)
        F_test = featurizer.get_feature_matrices(test_cands)
        end = timer()
        logger.warning(f"Featurization Time (min): {((end - start) / 60.0):.1f}")

        # Cache the matrices keyed by relation name so later runs can load
        # them regardless of which relations are enabled.
        F_train_dict = {}
        F_dev_dict = {}
        F_test_dict = {}
        for idx, relation in enumerate(rel_list):
            F_train_dict[relation] = F_train[idx]
            F_dev_dict[relation] = F_dev[idx]
            F_test_dict[relation] = F_test[idx]

        _dump_pickle(F_train_dict, "F_train_dict.pkl")
        _dump_pickle(F_dev_dict, "F_dev_dict.pkl")
        _dump_pickle(F_test_dict, "F_test_dict.pkl")
    else:
        F_train_dict = _load_pickle("F_train_dict.pkl")
        F_dev_dict = _load_pickle("F_dev_dict.pkl")
        F_test_dict = _load_pickle("F_test_dict.pkl")

        F_train = []
        F_dev = []
        F_test = []
        for relation in rel_list:
            F_train.append(F_train_dict[relation])
            F_dev.append(F_dev_dict[relation])
            F_test.append(F_test_dict[relation])

    logger.info("Done.")

    for i, cand in enumerate(cands):
        logger.info(f"{cand} Train shape: {F_train[i].shape}")
        logger.info(f"{cand} Test shape: {F_test[i].shape}")
        logger.info(f"{cand} Dev shape: {F_dev[i].shape}")

    logger.info("Labeling training data...")

    # Labeling
    start = timer()
    lfs = []
    if stg_temp_min:
        lfs.append(stg_temp_min_lfs)

    if stg_temp_max:
        lfs.append(stg_temp_max_lfs)

    if polarity:
        lfs.append(polarity_lfs)

    if ce_v_max:
        lfs.append(ce_v_max_lfs)

    # Using parallelism = 1 for deterministic behavior.
    labeler = Labeler(session, cands, parallelism=1)

    if first_time:
        logger.info("Applying LFs...")
        labeler.apply(split=0, lfs=lfs, train=True)
        logger.info("Done...")

        # Uncomment if debugging LFs
        # load_transistor_labels(session, cands, ["ce_v_max"])
        # labeler.apply(split=1, lfs=lfs, train=False, parallelism=parallel)
        # labeler.apply(split=2, lfs=lfs, train=False, parallelism=parallel)

    elif re_label:
        logger.info("Updating LFs...")
        labeler.update(split=0, lfs=lfs)
        logger.info("Done...")

        # Uncomment if debugging LFs
        # labeler.apply(split=1, lfs=lfs, train=False, parallelism=parallel)
        # labeler.apply(split=2, lfs=lfs, train=False, parallelism=parallel)

    logger.info("Getting label matrices...")

    L_train = labeler.get_label_matrices(train_cands)

    # Uncomment if debugging LFs
    # L_dev = labeler.get_label_matrices(dev_cands)
    # L_dev_gold = labeler.get_gold_labels(dev_cands, annotator="gold")
    #
    # L_test = labeler.get_label_matrices(test_cands)
    # L_test_gold = labeler.get_gold_labels(test_cands, annotator="gold")

    logger.info("Done.")

    if first_time:
        marginals_dict = {}
        for idx, relation in enumerate(rel_list):
            marginals_dict[relation] = generative_model(L_train[idx])

        _dump_pickle(marginals_dict, "marginals_dict.pkl")
    else:
        marginals_dict = _load_pickle("marginals_dict.pkl")

    marginals = [marginals_dict[relation] for relation in rel_list]

    end = timer()
    logger.warning(f"Supervision Time (min): {((end - start) / 60.0):.1f}")

    start = timer()

    word_counter = collect_word_counter(train_cands)

    # Training config
    config = {
        "meta_config": {"verbose": True, "seed": 17},
        "model_config": {"model_path": None, "device": 0, "dataparallel": False},
        "learner_config": {
            "n_epochs": 5,
            "optimizer_config": {"lr": 0.001, "l2": 0.0},
            "task_scheduler": "round_robin",
        },
        "logging_config": {
            "evaluation_freq": 1,
            "counter_unit": "epoch",
            "checkpointing": False,
            "checkpointer_config": {
                "checkpoint_metric": {"model/all/train/loss": "min"},
                "checkpoint_freq": 1,
                "checkpoint_runway": 2,
                "clear_intermediate_checkpoints": True,
                "clear_all_checkpoints": True,
            },
        },
    }
    emmental.init(log_dir=Meta.log_path, config=config)

    # Generate word embedding module
    arity = 2
    # Generate special tokens
    specials = []
    for i in range(arity):
        specials += [f"~~[[{i}", f"{i}]]~~"]

    emb_layer = EmbeddingModule(
        word_counter=word_counter, word_dim=300, specials=specials
    )

    train_idxs = []
    train_dataloader = []
    for idx, relation in enumerate(rel_list):
        # Keep only rows whose marginals are not (numerically) uniform.
        diffs = marginals[idx].max(axis=1) - marginals[idx].min(axis=1)
        train_idxs.append(np.where(diffs > 1e-6)[0])

        train_dataloader.append(
            EmmentalDataLoader(
                task_to_label_dict={relation: "labels"},
                dataset=FonduerDataset(
                    relation,
                    train_cands[idx],
                    F_train[idx],
                    emb_layer.word2id,
                    marginals[idx],
                    train_idxs[idx],
                ),
                split="train",
                batch_size=100,
                shuffle=True,
            )
        )

    num_feature_keys = len(featurizer.get_keys())

    model = EmmentalModel(name="transistor_tasks")

    # List relation names, arities, list of classes
    tasks = create_task(
        rel_list,
        [2] * len(rel_list),
        num_feature_keys,
        [2] * len(rel_list),
        emb_layer,
        model="LogisticRegression",
    )

    for task in tasks:
        model.add_task(task)

    emmental_learner = EmmentalLearner()

    # If given a list of multi, will train on multiple
    emmental_learner.learn(model, train_dataloader)

    # List of dataloader for each relation
    for idx, relation in enumerate(rel_list):
        test_dataloader = EmmentalDataLoader(
            task_to_label_dict={relation: "labels"},
            dataset=FonduerDataset(
                relation, test_cands[idx], F_test[idx], emb_layer.word2id, 2
            ),
            split="test",
            batch_size=100,
            shuffle=False,
        )

        test_preds = model.predict(test_dataloader, return_preds=True)

        # The return value is not used here; the call is kept unchanged.
        scoring(
            relation,
            test_preds,
            test_cands[idx],
            test_docs,
            F_test[idx],
            parts_by_doc,
            num=100,
        )

        # Dump CSV files for CE_V_MAX and POLARITY for digi-key analysis
        if relation in ("ce_v_max", "polarity"):
            dev_dataloader = EmmentalDataLoader(
                task_to_label_dict={relation: "labels"},
                dataset=FonduerDataset(
                    relation, dev_cands[idx], F_dev[idx], emb_layer.word2id, 2
                ),
                split="dev",
                batch_size=100,
                shuffle=False,
            )

            dev_preds = model.predict(dev_dataloader, return_preds=True)

            Y_prob = np.array(test_preds["probs"][relation])[:, TRUE]
            dump_candidates(test_cands[idx], Y_prob, f"{relation}_test_probs.csv")
            Y_prob = np.array(dev_preds["probs"][relation])[:, TRUE]
            dump_candidates(dev_cands[idx], Y_prob, f"{relation}_dev_probs.csv")

    end = timer()
    logger.warning(f"Classification Time (min): {((end - start) / 60.0):.1f}")