def train_model_random_lfs(randomly_sampled_lfs, train_matrix, dev_matrix,
                           dev_labels, test_matrix, regularization_grid):
    hyper_grid_results = defaultdict(dict)
    train_grid_results = defaultdict(dict)
    dev_grid_results = defaultdict(dict)
    test_grid_results = defaultdict(dict)

    for lf_sample in tqdm_notebook(enumerate(randomly_sampled_lfs)):
        for param in regularization_grid:
            label_model = LabelModel(k=2)
            label_model.train_model(
                train_matrix[:, lf_sample[1]],
                n_epochs=1000,
                log_train_every=200,
                seed=100,
                lr=0.01,
                l2=param,
                verbose=False
            )
            hyper_grid_results[str(param)] = label_model.predict_proba(dev_matrix[:, lf_sample[1]])

        # Compare the parameters numerically; max() over the stringified keys
        # would be lexicographic.
        best_param = max(map(float, hyper_grid_results))

        label_model.train_model(
            train_matrix[:, lf_sample[1]],
            n_epochs=1000,
            log_train_every=200,
            seed=50,
            lr=0.01,
            l2=best_param,
            verbose=False
        )

        key = f'{lf_sample[0]}:{",".join(map(str, lf_sample[1]))}'
        train_grid_results[key] = label_model.predict_proba(train_matrix[:, lf_sample[1]])
        dev_grid_results[key] = label_model.predict_proba(dev_matrix[:, lf_sample[1]])
        test_grid_results[key] = label_model.predict_proba(test_matrix[:, lf_sample[1]])

    return train_grid_results, dev_grid_results, test_grid_results
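# A hypothetical call to train_model_random_lfs above: each element of the
# first argument is a list of LF column indices into the label matrices.
# The names L_train, L_dev, Y_dev, and L_test are assumptions for
# illustration, not definitions from this file.
import numpy as np

lf_samples = [[0, 3, 5], [1, 2, 4]]  # two random subsets of LF columns
reg_grid = np.round(np.linspace(0.01, 5, num=15), 2)
train_res, dev_res, test_res = train_model_random_lfs(
    lf_samples, L_train, L_dev, Y_dev, L_test, reg_grid)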
def generative_model(L_train, n_epochs=500, print_every=100):
    model = LabelModel(k=2)

    logger.info("Training generative model...")
    model.train_model(L_train, n_epochs=n_epochs, print_every=print_every)
    logger.info("Done.")

    marginals = model.predict_proba(L_train)
    return marginals
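# A minimal sketch of calling generative_model above, assuming the
# snorkel-metal convention that label matrix entries lie in {0, 1, 2} with
# 0 meaning "abstain"; the toy matrix is illustrative only.
import numpy as np
from scipy import sparse

L_toy = sparse.csr_matrix(np.array([
    [1, 0, 1],   # two LFs vote class 1, one abstains
    [2, 2, 0],
    [0, 1, 2],
    [2, 0, 2],
]))
marginals = generative_model(L_toy, n_epochs=100, print_every=50)
print(marginals.shape)  # (4, 2): P(Y=1) and P(Y=2) per data point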
def generative_model(L_train, n_epochs=500, print_every=100):
    model = LabelModel(k=2)

    logger.info("Training generative model...")
    model.train_model(L_train, n_epochs=n_epochs, print_every=print_every)
    logger.info("Done.")

    marginals = model.predict_proba(L_train)

    # Plot the distribution of marginals for the positive class.
    plt.hist(marginals[:, TRUE - 1], bins=20)
    plt.savefig(os.path.join(os.path.dirname(__file__), "opamps_marginals.pdf"))
    return marginals
def __init__(
    self,
    positive_label: str,
    class_cardinality: int = 2,
    num_epochs: int = 500,
    log_train_every: int = 50,
    seed: int = 123,
):
    self.positive_label = positive_label
    self.class_cardinality = class_cardinality
    self.num_epochs = num_epochs
    self.log_train_every = log_train_every
    self.seed = seed
    self.label_model = LabelModel(k=self.class_cardinality, seed=seed)
def apply_labelling_functions(featurizer_output):
    session = featurizer_output['session']
    cands = featurizer_output['candidate_variable']

    labeler = Labeler(session, cands)
    labeler.apply(lfs=[lfs], train=True, parallelism=config.PARALLEL)

    train_cands = []
    train_cands.append(
        session.query(featurizer_output['candidate_variable'][0]).all())
    L_train = labeler.get_label_matrices(train_cands)

    gen_model = LabelModel(k=2)
    gen_model.train_model(L_train[0], n_epochs=300, print_every=100)
    train_marginals = gen_model.predict_proba(L_train[0])

    featurizer_output['train_marginals'] = train_marginals
    return featurizer_output
def test_gpustorage(self):
    # Running basics tutorial problem
    with open("tutorials/data/basics_tutorial.pkl", "rb") as f:
        X, Y, L, D = pickle.load(f)
    Xs, Ys, Ls, Ds = split_data(X, Y, L, D,
                                splits=[0.8, 0.1, 0.1],
                                stratify_by=Y,
                                seed=123)
    label_model = LabelModel(k=2, seed=123)
    label_model.train_model(Ls[0], Y_dev=Ys[1], n_epochs=500, log_train_every=25)
    Y_train_ps = label_model.predict_proba(Ls[0])

    # Creating a really large end model to use lots of memory
    end_model = EndModel([1000, 100000, 2], seed=123, device="cuda")

    # Getting initial GPU storage use
    initial_gpu_mem = GPUtil.getGPUs()[0].memoryUsed

    # Training model
    end_model.train_model(
        (Xs[0], Y_train_ps),
        valid_data=(Xs[1], Ys[1]),
        l2=0.1,
        batch_size=256,
        n_epochs=3,
        log_train_every=1,
        validation_metric="f1",
    )

    # Final GPU storage use
    final_gpu_mem = GPUtil.getGPUs()[0].memoryUsed

    # On a Titan X, this model uses ~3 GB of memory
    gpu_mem_difference = final_gpu_mem - initial_gpu_mem
    self.assertGreater(gpu_mem_difference, 1000)
def load_context(self, context: PythonModelContext) -> None:
    # Configure logging for Fonduer
    init_logging(log_dir="logs")
    logger.info("loading context")

    pyfunc_conf = _get_flavor_configuration(model_path=self.model_path,
                                            flavor_name=pyfunc.FLAVOR_NAME)
    conn_string = pyfunc_conf.get(CONN_STRING, None)
    if conn_string is None:
        raise RuntimeError("conn_string is missing from MLmodel file.")
    self.parallel = pyfunc_conf.get(PARALLEL, 1)
    session = Meta.init(conn_string).Session()

    logger.info("Getting parser")
    self.corpus_parser = self._get_parser(session)
    logger.info("Getting mention extractor")
    self.mention_extractor = self._get_mention_extractor(session)
    logger.info("Getting candidate extractor")
    self.candidate_extractor = self._get_candidate_extractor(session)
    candidate_classes = self.candidate_extractor.candidate_classes

    self.model_type = pyfunc_conf.get(MODEL_TYPE, "discriminative")
    if self.model_type == "discriminative":
        self.featurizer = Featurizer(session, candidate_classes)
        with open(os.path.join(self.model_path, "feature_keys.pkl"), "rb") as f:
            key_names = pickle.load(f)
        self.featurizer.drop_keys(key_names)
        self.featurizer.upsert_keys(key_names)

        disc_model = LogisticRegression()

        # Workaround to https://github.com/HazyResearch/fonduer/issues/208
        checkpoint = torch.load(os.path.join(self.model_path, "best_model.pt"))
        disc_model.settings = checkpoint["config"]
        disc_model.cardinality = checkpoint["cardinality"]
        disc_model._build_model()
        disc_model.load(model_file="best_model.pt", save_dir=self.model_path)
        self.disc_model = disc_model
    else:
        self.labeler = Labeler(session, candidate_classes)
        with open(os.path.join(self.model_path, "labeler_keys.pkl"), "rb") as f:
            key_names = pickle.load(f)
        self.labeler.drop_keys(key_names)
        self.labeler.upsert_keys(key_names)

        self.gen_models = [
            LabelModel.load(os.path.join(self.model_path, _.__name__ + ".pkl"))
            for _ in candidate_classes
        ]
def train_baseline_model(
    train_matrix,
    dev_matrix,
    dev_labels,
    test_matrix,
    lf_indices,
    regularization_grid,
    train_marginal_dir,
    write_file=False
):
    grid_results = {}
    dev_grid_results = {}
    test_grid_results = {}

    for param in regularization_grid:
        label_model = LabelModel(k=2)
        label_model.train_model(
            train_matrix[:, lf_indices],
            n_epochs=1000,
            log_train_every=200,
            seed=100,
            lr=0.01,
            l2=param,
            verbose=False,
            # Y_dev=dev_labels
        )
        grid_results[str(param)] = label_model.predict_proba(dev_matrix[:, lf_indices])

    # Compare the parameters numerically; max() over the stringified keys
    # would be lexicographic.
    best_param = max(map(float, grid_results))

    label_model.train_model(
        train_matrix[:, lf_indices],
        n_epochs=1000,
        log_train_every=200,
        seed=50,
        lr=0.01,
        l2=best_param,
        verbose=False,
        # Y_dev=dev_labels
    )

    if write_file:
        (
            pd.DataFrame(
                label_model.predict_proba(train_matrix[:, lf_indices]),
                columns=["pos_class_marginals", "neg_class_marginals"]
            )
            .to_csv(f"{train_marginal_dir}baseline_marginals.tsv.xz",
                    compression="xz", index=False, sep="\t")
        )

    dev_grid_results[best_param] = label_model.predict_proba(dev_matrix[:, lf_indices])
    test_grid_results[best_param] = label_model.predict_proba(test_matrix[:, lf_indices])

    return dev_grid_results, test_grid_results
def getTrainedModel1(self):
    # We build a matrix of LF votes for each comment ticket
    LF_matrix = self.make_Ls_matrix(self.LF_set['comments'], self.LFs)

    # Get true labels for LF set
    Y_LF_set = np.array(self.LF_set['resolution'])

    display(lf_summary(sparse.csr_matrix(LF_matrix),
                       Y=Y_LF_set,
                       lf_names=self.LF_names.values()))
    # label_coverage returns a float, so format it rather than concatenating.
    print(f"label coverage: {label_coverage(LF_matrix)}")

    mv = MajorityLabelVoter()
    Y_train_majority_votes = mv.predict(LF_matrix)
    print("classification report:\n" +
          classification_report(Y_LF_set, Y_train_majority_votes))

    Ls_train = self.make_Ls_matrix(self.train, self.LFs)

    # You can tune the learning rate and class balance.
    model = LabelModel(k=2, seed=123)
    trainer = model.train_model(Ls_train,
                                n_epochs=2000,
                                print_every=1000,
                                lr=0.0001,
                                class_balance=np.array([0.2, 0.8]))
    Y_train = model.predict(Ls_train) + Y_LF_set

    print('Trained Label Model Metrics:')
    scores = model.score((Ls_train[1], Y_train[1]),
                         metric=['accuracy', 'precision', 'recall', 'f1'])
    print(scores)
    return trainer, Y_train
def getTrainedModel2(self):
    # Apply the LFs to the unlabeled training data
    applier = PandasLFApplier(self.LFs)
    L_train = applier.apply(self.train['comments'])

    # Train the label model and compute the training labels
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
    self.train['resolution'] = label_model.predict(L=L_train,
                                                   tie_break_policy="abstain")

    df_train = self.train[self.train.resolution != self.ABSTAIN]
    train_text = df_train.comments.tolist()
    vectorizer = CountVectorizer(ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(train_text)

    clf = LogisticRegression(solver="lbfgs")
    clf.fit(X=X_train, y=df_train.resolution.values)
    # Transform the test text with the same fitted vectorizer before predicting.
    prob = clf.predict_proba(vectorizer.transform(self.test['comments'].tolist()))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    end_model = EndModel([1000, 10, 2], seed=123, device=device)
    end_model.train_model(
        (self.train['comments'], self.test['comments']),
        valid_data=(self.train['resolution'], self.test['comments']),
        lr=0.01,
        l2=0.01,
        batch_size=256,
        n_epochs=5,
        checkpoint_metric='accuracy',
        checkpoint_metric_mode='max')
    return prob
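# For reference, a minimal labeling function in the Snorkel v0.9-style API
# that getTrainedModel2 uses; the keyword, label values, and column name
# below are illustrative assumptions, not part of the original class.
from snorkel.labeling import labeling_function

ABSTAIN, RESOLVED = -1, 1

@labeling_function()
def lf_mentions_fixed(x):
    # Vote RESOLVED when a comment mentions "fixed"; otherwise abstain.
    return RESOLVED if "fixed" in x.comments.lower() else ABSTAIN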
def LabelLearner(cardinalities=2, dependencies=[], **kwargs):
    """A factory function for generative label models (MeTaL)."""
    # Use the single-task label model if cardinality is an integer,
    # otherwise use the multi-task label model.
    if isinstance(cardinalities, int):
        logger.info("Using MeTaL single task label model")
        label_model = LabelModel(k=cardinalities, **kwargs)
    else:
        logger.info("Using MeTaL multi task label model")
        task_graph = TaskHierarchy(cardinalities=cardinalities, edges=dependencies)
        label_model = MTLabelModel(task_graph=task_graph, **kwargs)
    return label_model
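# Sketch of the two factory paths above (assumes snorkel-metal is installed
# and L_train is an [n, m] scipy.sparse label matrix with values in {0, ..., k}).
single_task = LabelLearner(cardinalities=2, seed=123)
single_task.train_model(L_train, n_epochs=500)

# A two-task hierarchy whose second task refines the first.
multi_task = LabelLearner(cardinalities=[2, 2], dependencies=[(0, 1)], seed=123)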
def predict_proba(self, L):
    r"""Returns the task marginals estimated by the model: a t-length list
    of [n, k_t] matrices where the (i, j) entry of the s-th matrix
    represents the estimated P((Y_i)_s | \lambda_j(x_i)).

    Args:
        L: A t-length list of [n, m] scipy.sparse label matrices with
            values in {0, 1, ..., k}
    """
    # First, get the estimated probability distribution over the feasible
    # set defined by the TaskGraph.
    # This is an [n, k] array, where k = |feasible set|
    Y_pf = LabelModel.predict_proba(self, L)
    n, k = Y_pf.shape

    # Now get the per-task marginals
    # TODO: Make this optional, versus just returning the above
    Y_p = [np.zeros((n, k_t)) for k_t in self.task_graph.K]
    for yi, y in enumerate(self.task_graph.feasible_set()):
        for t in range(self.t):
            k_t = int(y[t])
            Y_p[t][:, k_t - 1] += Y_pf[:, yi]
    return Y_p
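# Sketch of consuming the per-task marginals returned above, assuming `model`
# is a trained multi-task label model over a task graph with cardinalities
# [2, 3] and `L` is the matching t-length list of label matrices.
Y_p = model.predict_proba(L)          # t-length list of [n, k_t] arrays
head_marginals = Y_p[0]               # shape (n, 2)
sub_marginals = Y_p[1]                # shape (n, 3)
head_preds = head_marginals.argmax(axis=1) + 1  # hard labels in {1, 2}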
def train_model(args):
    hidden_size = 128
    num_classes = 2
    encode_dim = 1000  # using get_frm_output_size()

    L, Y = load_labels(args)

    # Label Model
    # labelling functions analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # training label model
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"], Y["dev"], n_epochs=2000, log_train_every=100)

    # evaluating label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # comparison with majority vote of LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    Ytrain_p = label_model.predict_proba(L["train"])

    # End Model
    # Create datasets and dataloaders
    train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
    data_loader = get_data_loader(train, dev, test, args.batch_size, args.num_workers)

    # Define input encoder
    # cnn_encoder = FrameEncoderOC
    cnn_encoder = FrameEncoderOCDense

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define LSTM module
    lstm_module = LSTMModule(
        encode_dim,
        hidden_size,
        bidirectional=False,
        verbose=False,
        lstm_reduction=args.lstm_reduction,
        encoder_class=cnn_encoder,
        encoder_kwargs={"requires_grad": args.requires_grad})

    # Define end model
    init_kwargs = {
        "layer_out_dims": [hidden_size, num_classes],
        "input_module": lstm_module,
        "optimizer": "adam",
        "verbose": False,
        "input_batchnorm": False,
        "use_cuda": torch.cuda.is_available(),  # was an undefined `cuda` name
        'seed': args.seed,
        'device': device
    }
    end_model = EndModel(**init_kwargs)

    if not os.path.exists(args.checkpoint_dir):
        os.mkdir(args.checkpoint_dir)
    with open(args.checkpoint_dir + '/init_kwargs.pickle', "wb") as f:
        pickle.dump(init_kwargs, f, protocol=pickle.HIGHEST_PROTOCOL)

    dropout = 0.4

    # Train end model
    end_model.train_model(
        train_data=data_loader["train"],
        valid_data=data_loader["dev"],
        l2=args.weight_decay,
        lr=args.lr,
        n_epochs=args.n_epochs,
        log_train_every=1,
        verbose=True,
        progress_bar=True,
        loss_weights=[0.55, 0.45],
        input_dropout=0.1,
        middle_dropout=dropout,
        checkpoint_dir=args.checkpoint_dir,
    )

    # evaluate end model
    print("Dev Set Performance")
    end_model.score(
        data_loader["dev"],
        verbose=True,
        metric=['accuracy', 'precision', 'recall', 'f1', 'roc-auc', 'ndcg'])
    print("Test Set Performance")
    end_model.score(
        data_loader["test"],
        verbose=True,
        metric=['accuracy', 'precision', 'recall', 'f1', 'roc-auc', 'ndcg'])
def train_model(args):
    hidden_size = 128
    num_classes = 2
    encode_dim = 1000  # using get_frm_output_size()

    L, Y = load_labels(args)

    # Label Model
    # labelling functions analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # training label model
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"], Y["dev"], n_epochs=500, log_train_every=50)

    # evaluating label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # comparison with majority vote of LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    Ytrain_p = label_model.predict_proba(L["train"])

    # End Model
    # Create datasets and dataloaders
    train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
    data_loader = get_data_loader(train, dev, test, args.batch_size, args.num_workers)

    # Define input encoder
    cnn_encoder = FrameEncoderOC

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define LSTM module
    lstm_module = LSTMModule(
        encode_dim,
        hidden_size,
        bidirectional=False,
        verbose=False,
        lstm_reduction="attention",
        encoder_class=cnn_encoder,
    )

    # Define end model
    end_model = EndModel(
        input_module=lstm_module,
        layer_out_dims=[hidden_size, num_classes],
        optimizer="adam",
        batchnorm=True,
        seed=123,
        verbose=False,
        device=device,
    )

    dropout = 0.4

    # Train end model
    end_model.train_model(
        train_data=data_loader["train"],
        valid_data=data_loader["dev"],
        l2=args.weight_decay,
        lr=args.lr,
        n_epochs=args.n_epochs,
        log_train_every=1,
        verbose=True,
        progress_bar=True,
        loss_weights=[0.45, 0.55],
        batchnorm=True,  # was the string 'True'
        input_dropout=dropout,
        middle_dropout=dropout,
    )

    # evaluate end model
    end_model.score(data_loader["dev"],
                    verbose=True,
                    metric=['accuracy', 'precision', 'recall', 'f1'])
def test_e2e(caplog):
    """Run an end-to-end test on documents of the hardware domain."""
    caplog.set_level(logging.INFO)

    PARALLEL = 4
    max_docs = 12

    session = Meta.init("postgresql://localhost:5432/" + DB).Session()

    docs_path = "tests/data/html/"
    pdf_path = "tests/data/pdf/"
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

    corpus_parser = Parser(
        session,
        parallelism=PARALLEL,
        structural=True,
        lingual=True,
        visual=True,
        pdf_path=pdf_path,
    )
    corpus_parser.apply(doc_preprocessor)
    assert session.query(Document).count() == max_docs

    num_docs = session.query(Document).count()
    logger.info(f"Docs: {num_docs}")
    assert num_docs == max_docs

    num_sentences = session.query(Sentence).count()
    logger.info(f"Sentences: {num_sentences}")

    # Divide into test and train
    docs = sorted(corpus_parser.get_documents())
    last_docs = sorted(corpus_parser.get_last_documents())
    ld = len(docs)
    assert ld == len(last_docs)
    assert len(docs[0].sentences) == len(last_docs[0].sentences)

    # Check sentence numbers
    assert len(docs[0].sentences) == 799
    assert len(docs[1].sentences) == 663
    assert len(docs[2].sentences) == 784
    assert len(docs[3].sentences) == 661
    assert len(docs[4].sentences) == 513
    assert len(docs[5].sentences) == 700
    assert len(docs[6].sentences) == 528
    assert len(docs[7].sentences) == 161
    assert len(docs[8].sentences) == 228
    assert len(docs[9].sentences) == 511
    assert len(docs[10].sentences) == 331
    assert len(docs[11].sentences) == 528

    # Check table numbers
    assert len(docs[0].tables) == 9
    assert len(docs[1].tables) == 9
    assert len(docs[2].tables) == 14
    assert len(docs[3].tables) == 11
    assert len(docs[4].tables) == 11
    assert len(docs[5].tables) == 10
    assert len(docs[6].tables) == 10
    assert len(docs[7].tables) == 2
    assert len(docs[8].tables) == 7
    assert len(docs[9].tables) == 10
    assert len(docs[10].tables) == 6
    assert len(docs[11].tables) == 9

    # Check figure numbers
    assert len(docs[0].figures) == 32
    assert len(docs[1].figures) == 11
    assert len(docs[2].figures) == 38
    assert len(docs[3].figures) == 31
    assert len(docs[4].figures) == 7
    assert len(docs[5].figures) == 38
    assert len(docs[6].figures) == 10
    assert len(docs[7].figures) == 31
    assert len(docs[8].figures) == 4
    assert len(docs[9].figures) == 27
    assert len(docs[10].figures) == 5
    assert len(docs[11].figures) == 27

    # Check caption numbers
    assert len(docs[0].captions) == 0
    assert len(docs[1].captions) == 0
    assert len(docs[2].captions) == 0
    assert len(docs[3].captions) == 0
    assert len(docs[4].captions) == 0
    assert len(docs[5].captions) == 0
    assert len(docs[6].captions) == 0
    assert len(docs[7].captions) == 0
    assert len(docs[8].captions) == 0
    assert len(docs[9].captions) == 0
    assert len(docs[10].captions) == 0
    assert len(docs[11].captions) == 0

    train_docs = set()
    dev_docs = set()
    test_docs = set()
    splits = (0.5, 0.75)
    data = [(doc.name, doc) for doc in docs]
    data.sort(key=lambda x: x[0])
    for i, (doc_name, doc) in enumerate(data):
        if i < splits[0] * ld:
            train_docs.add(doc)
        elif i < splits[1] * ld:
            dev_docs.add(doc)
        else:
            test_docs.add(doc)
    logger.info([x.name for x in train_docs])

    # NOTE: With multi-relation support, return values of getting candidates,
    # mentions, or sparse matrices are formatted as a list of lists. This means
    # that with a single relation, we need to index into the list of lists to
    # get the candidates/mentions/sparse matrix for a particular relation or
    # mention.
    # Mention Extraction
    part_ngrams = MentionNgramsPart(parts_by_doc=None, n_max=3)
    temp_ngrams = MentionNgramsTemp(n_max=2)
    volt_ngrams = MentionNgramsVolt(n_max=1)

    Part = mention_subclass("Part")
    Temp = mention_subclass("Temp")
    Volt = mention_subclass("Volt")

    mention_extractor = MentionExtractor(
        session,
        [Part, Temp, Volt],
        [part_ngrams, temp_ngrams, volt_ngrams],
        [part_matcher, temp_matcher, volt_matcher],
    )
    mention_extractor.apply(docs, parallelism=PARALLEL)

    assert session.query(Part).count() == 299
    assert session.query(Temp).count() == 147
    assert session.query(Volt).count() == 140
    assert len(mention_extractor.get_mentions()) == 3
    assert len(mention_extractor.get_mentions()[0]) == 299
    assert (
        len(
            mention_extractor.get_mentions(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 70
    )

    # Candidate Extraction
    PartTemp = candidate_subclass("PartTemp", [Part, Temp])
    PartVolt = candidate_subclass("PartVolt", [Part, Volt])

    candidate_extractor = CandidateExtractor(
        session, [PartTemp, PartVolt], throttlers=[temp_throttler, volt_throttler]
    )

    for i, docs in enumerate([train_docs, dev_docs, test_docs]):
        candidate_extractor.apply(docs, split=i, parallelism=PARALLEL)

    assert session.query(PartTemp).filter(PartTemp.split == 0).count() == 3684
    assert session.query(PartTemp).filter(PartTemp.split == 1).count() == 72
    assert session.query(PartTemp).filter(PartTemp.split == 2).count() == 448
    assert session.query(PartVolt).count() == 4282

    # Grab candidate lists
    train_cands = candidate_extractor.get_candidates(split=0, sort=True)
    dev_cands = candidate_extractor.get_candidates(split=1, sort=True)
    test_cands = candidate_extractor.get_candidates(split=2, sort=True)
    assert len(train_cands) == 2
    assert len(train_cands[0]) == 3684
    assert (
        len(
            candidate_extractor.get_candidates(
                docs=[session.query(Document).filter(Document.name == "112823").first()]
            )[0]
        )
        == 1496
    )

    # Featurization
    featurizer = Featurizer(session, [PartTemp, PartVolt])

    # Test that FeatureKey is properly reset
    featurizer.apply(split=1, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 225
    assert session.query(FeatureKey).count() == 1179

    # Test Dropping FeatureKey
    # Should force a row deletion
    featurizer.drop_keys(["DDL_e1_W_LEFT_POS_3_[NFP NN NFP]"])
    assert session.query(FeatureKey).count() == 1178

    # Should only remove the part_volt as a relation and leave part_temp
    assert set(
        session.query(FeatureKey)
        .filter(FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]")
        .one()
        .candidate_classes
    ) == {"part_temp", "part_volt"}
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartVolt])
    assert session.query(FeatureKey).filter(
        FeatureKey.name == "DDL_e1_LEMMA_SEQ_[bc182]"
    ).one().candidate_classes == ["part_temp"]
    assert session.query(FeatureKey).count() == 1178

    # Removing the last relation from a key should delete the row
    featurizer.drop_keys(["DDL_e1_LEMMA_SEQ_[bc182]"], candidate_classes=[PartTemp])
    assert session.query(FeatureKey).count() == 1177

    session.query(Feature).delete()
    session.query(FeatureKey).delete()

    featurizer.apply(split=0, train=True, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6669
    assert session.query(FeatureKey).count() == 4161

    F_train = featurizer.get_feature_matrices(train_cands)
    assert F_train[0].shape == (3684, 4161)
    assert F_train[1].shape == (2985, 4161)
    assert len(featurizer.get_keys()) == 4161

    featurizer.apply(split=1, parallelism=PARALLEL)
    assert session.query(Feature).count() == 6894
    assert session.query(FeatureKey).count() == 4161
    F_dev = featurizer.get_feature_matrices(dev_cands)
    assert F_dev[0].shape == (72, 4161)
    assert F_dev[1].shape == (153, 4161)

    featurizer.apply(split=2, parallelism=PARALLEL)
    assert session.query(Feature).count() == 8486
    assert session.query(FeatureKey).count() == 4161

    F_test = featurizer.get_feature_matrices(test_cands)
    assert F_test[0].shape == (448, 4161)
    assert F_test[1].shape == (1144, 4161)

    gold_file = "tests/data/hardware_tutorial_gold.csv"
    load_hardware_labels(session, PartTemp, gold_file, ATTRIBUTE, annotator_name="gold")
    assert session.query(GoldLabel).count() == 4204
    load_hardware_labels(session, PartVolt, gold_file, ATTRIBUTE, annotator_name="gold")
    assert session.query(GoldLabel).count() == 8486

    stg_temp_lfs = [
        LF_storage_row,
        LF_operating_row,
        LF_temperature_row,
        LF_tstg_row,
        LF_to_left,
        LF_negative_number_left,
    ]
    ce_v_max_lfs = [
        LF_bad_keywords_in_row,
        LF_current_in_row,
        LF_non_ce_voltages_in_row,
    ]

    labeler = Labeler(session, [PartTemp, PartVolt])

    with pytest.raises(ValueError):
        labeler.apply(split=0, lfs=stg_temp_lfs, train=True, parallelism=PARALLEL)

    labeler.apply(
        split=0, lfs=[stg_temp_lfs, ce_v_max_lfs], train=True, parallelism=PARALLEL
    )
    assert session.query(Label).count() == 6669
    assert session.query(LabelKey).count() == 9

    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3684, 9)
    assert L_train[1].shape == (2985, 9)
    assert len(labeler.get_keys()) == 9

    L_train_gold = labeler.get_gold_labels(train_cands)
    assert L_train_gold[0].shape == (3684, 1)

    L_train_gold = labeler.get_gold_labels(train_cands, annotator="gold")
    assert L_train_gold[0].shape == (3684, 1)

    gen_model = LabelModel(k=2)
    gen_model.train_model(L_train[0], n_epochs=500, print_every=100)

    train_marginals = gen_model.predict_proba(L_train[0])[:, 1]

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=20, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]

    pickle_file = "tests/data/parts_by_doc_dict.pkl"
    with open(pickle_file, "rb") as f:
        parts_by_doc = pickle.load(f)

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 < 0.7 and f1 > 0.3

    stg_temp_lfs_2 = [
        LF_to_left,
        LF_test_condition_aligned,
        LF_collector_aligned,
        LF_current_aligned,
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL)
    assert session.query(Label).count() == 6669
    assert session.query(LabelKey).count() == 16

    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3684, 16)

    gen_model = LabelModel(k=2)
    gen_model.train_model(L_train[0], n_epochs=500, print_every=100)

    train_marginals = gen_model.predict_proba(L_train[0])[:, 1]

    disc_model = LogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=20, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]
    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing LSTM
    disc_model = LSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing Sparse Logistic Regression
    disc_model = SparseLogisticRegression()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=20, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7

    # Testing Sparse LSTM
    disc_model = SparseLSTM()
    disc_model.train(
        (train_cands[0], F_train[0]), train_marginals, n_epochs=5, lr=0.001
    )

    test_score = disc_model.predictions((test_cands[0], F_test[0]), b=0.6)
    true_pred = [test_cands[0][_] for _ in np.nditer(np.where(test_score > 0))]

    (TP, FP, FN) = entity_level_f1(
        true_pred, gold_file, ATTRIBUTE, test_docs, parts_by_doc=parts_by_doc
    )

    tp_len = len(TP)
    fp_len = len(FP)
    fn_len = len(FN)
    prec = tp_len / (tp_len + fp_len) if tp_len + fp_len > 0 else float("nan")
    rec = tp_len / (tp_len + fn_len) if tp_len + fn_len > 0 else float("nan")
    f1 = 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else float("nan")

    logger.info(f"prec: {prec}")
    logger.info(f"rec: {rec}")
    logger.info(f"f1: {f1}")
    assert f1 > 0.7
class SnorkeMeTalCollator(Collator):
    def __init__(
        self,
        positive_label: str,
        class_cardinality: int = 2,
        num_epochs: int = 500,
        log_train_every: int = 50,
        seed: int = 123,
    ):
        self.positive_label = positive_label
        self.class_cardinality = class_cardinality
        self.num_epochs = num_epochs
        self.log_train_every = log_train_every
        self.seed = seed
        self.label_model = LabelModel(k=self.class_cardinality, seed=seed)

    @classmethod
    def get_snorkel_index(cls, tag: str) -> int:
        if is_positive(tag):
            return 2
        elif is_negative(tag):
            return 1
        else:
            return 0

    def get_tag(self, index: int) -> str:
        if index == 1:
            return self.positive_label
        else:
            return NEGATIVE_LABEL

    def get_index(self, prob: np.ndarray) -> int:
        assert prob.shape == (2,)
        return prob.argmax()

    def collate_np(
        self, annotations
    ) -> Tuple[np.ndarray, List[str], Dict[int, Tuple[int, int]]]:
        output_arrs: List[np.ndarray] = []
        words_list: List[str] = []
        id_to_labels: Dict[int, Tuple[int, int]] = {}
        num_funcs = len(annotations)
        for i, ann_inst in tqdm(enumerate(zip(*annotations))):
            ids = [inst['id'] for inst in ann_inst]
            inputs = [inst['input'] for inst in ann_inst]
            outputs = [inst['output'] for inst in ann_inst]
            input_len = len(inputs[0])
            entry_id = ids[0]
            # output arr = (sentence x num_labels)
            output_arr = np.zeros((input_len, num_funcs))
            for func_i, output in enumerate(outputs):
                for j, out_j in enumerate(output):
                    output_arr[j, func_i] = SnorkeMeTalCollator.get_snorkel_index(out_j)
            label_start = len(words_list)
            for word_i, word in enumerate(inputs[0]):
                words_list.append(word)
            output_arrs.append(output_arr)
            label_end = len(words_list)
            id_to_labels[entry_id] = (label_start, label_end)
        output_res = np.concatenate(output_arrs, axis=0)
        return output_res, words_list, id_to_labels

    def train_label_model(
        self,
        collated_labels: np.ndarray,
        descriptions: Optional[List[str]],
        train_data_np: Optional[np.ndarray],
    ):
        sparse_labels = sparse.csr_matrix(collated_labels)
        if descriptions is not None:
            descriptions = [(i, desc) for i, desc in enumerate(descriptions)]
            logger.warn(f'labeling function order: {descriptions}')
        logger.warn(lf_summary(sparse_labels))
        self.label_model.train_model(
            sparse_labels,
            n_epochs=self.num_epochs,
            log_train_every=self.log_train_every,
            Y_dev=train_data_np,
        )

    def get_probabilistic_labels(self, collated_labels: np.ndarray) -> np.ndarray:
        sparse_labels = sparse.csr_matrix(collated_labels)
        return self.label_model.predict_proba(sparse_labels)

    def convert_to_tags(
        self,
        train_probs: np.ndarray,
        word_list: List[str],
        id_to_labels: Dict[int, Tuple[int, int]],
    ) -> List[AnnotatedDataType]:
        output = []
        for entry_id, (label_start, label_end) in id_to_labels.items():
            words = word_list[label_start:label_end]
            prob_labels = train_probs[label_start:label_end]
            label_ids = prob_labels.argmax(axis=1)
            labels = [self.get_tag(i) for i in label_ids]
            output.append({
                'id': entry_id,
                'input': words,
                'output': labels,
            })
        return output

    def collate(
        self,
        annotations: List[AnnotatedDataType],
        should_verify: bool = False,
        descriptions: Optional[List[str]] = None,
        train_data: Optional[AnnotatedDataType] = None
    ) -> AnnotatedDataType:
        '''
        args:
            ``annotations``: List[AnnotatedDataType]
                given a series of annotations, collate them into a single
                series of annotations per instance
        '''
        if should_verify:
            # make sure the annotations are in the proper format
            Collator.verify_annotations(annotations)

        train_data_np = None
        if train_data:
            # if train data is specified, Snorkel will use it to estimate
            # the class balance
            train_data_np, word_lists, id_to_labels = self.collate_np([train_data])
            train_data_np = train_data_np.astype(int)
            train_data_np = train_data_np.reshape(-1)

        collated_np, word_lists, id_to_labels = self.collate_np(annotations)
        self.train_label_model(collated_labels=collated_np,
                               descriptions=descriptions,
                               train_data_np=train_data_np)
        y_train_probs = self.get_probabilistic_labels(collated_labels=collated_np)
        tags = self.convert_to_tags(y_train_probs,
                                    word_list=word_lists,
                                    id_to_labels=id_to_labels)
        return tags
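# Hypothetical usage of SnorkeMeTalCollator, inferring the annotation format
# from collate_np: each annotation is a dict with 'id', 'input' (tokens), and
# 'output' (per-token tags). The tag values below are assumptions.
collator = SnorkeMeTalCollator(positive_label="I-PER")

# Two labeling-function outputs over the same single sentence.
ann_a = [{'id': 0, 'input': ['John', 'runs'], 'output': ['I-PER', 'O']}]
ann_b = [{'id': 0, 'input': ['John', 'runs'], 'output': ['I-PER', 'I-PER']}]

denoised = collator.collate([ann_a, ann_b])
print(denoised[0]['output'])  # one denoised tag per token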
def train_model(args):
    hidden_size = 128
    num_classes = 2
    encode_dim = 108  # using get_frm_output_size()

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = 'cpu'

    L, Y = load_labels(args)

    # Label Model
    # labelling functions analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # majority vote of LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    # training label model - no temporal modelling
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"], Y["dev"], n_epochs=500, log_train_every=50)

    # evaluating label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # naive model: no temporal modelling
    # L["train"]: (18850, 5), L["dev"]: (1500, 5), Y["dev"]: (1500,)
    m_per_task = L["train"].todense().shape[1]  # 5

    MRI_data_naive = {
        'Li_train': torch.FloatTensor(np.array(L["train"].todense().astype('int_'))),
        'Li_dev': torch.FloatTensor(np.array(L["dev"].todense())),
        'R_dev': Y["dev"]
    }
    MRI_data_naive['class_balance'] = torch.FloatTensor([0.5, 0.5]).to(device)

    # training naive model
    naive_model = DPLabelModel(
        m=m_per_task,
        T=1,
        edges=[],
        coverage_sets=[[0]] * m_per_task,
        mu_sharing=[[i] for i in range(m_per_task)],
        phi_sharing=[],
        device=device,
        # class_balance=MRI_data_naive['class_balance'],
        seed=0)
    optimize(naive_model,
             L_hat=MRI_data_naive['Li_train'],
             num_iter=300,
             lr=1e-3,
             momentum=0.8,
             clamp=True,
             seed=0)

    # evaluating naive model; map predictions from {1, 2} to {1, 0}
    R_pred = naive_model.predict(MRI_data_naive['Li_dev']).data.numpy()
    R_pred = 2 - R_pred
    for metric in ['accuracy', 'f1', 'recall', 'precision']:
        score = metric_score(MRI_data_naive['R_dev'], R_pred, metric)
        print(f"{metric.capitalize()}: {score:.3f}")

    # label model with temporal modelling: reshape the dataset
    num_frames = 50
    n_patients_train = round(L["train"].todense().shape[0] / num_frames)  # 377
    n_patients_dev = round(L["dev"].todense().shape[0] / num_frames)  # 30
    Ltrain = np.reshape(np.array(L["train"].todense()),
                        (n_patients_train, num_frames, -1))  # (377, 50, 5)
    Ldev = np.reshape(np.array(L["dev"].todense()),
                      (n_patients_dev, num_frames, -1))  # (30, 50, 5)
    Ydev = np.reshape(Y["dev"], (n_patients_dev, num_frames))  # (30, 50)

    # subsampling: select frames 3, 13, 23, 33, 43 (0-based indices 2, 12, ..., 42)
    indices = np.linspace(2, 42, 5).astype(int)
    m_per_task = 5
    T = 5
    Ltrain_small = Ltrain[:, indices, :]  # (377, 5, 5)
    Ldev_small = Ldev[:, indices, :]  # (30, 5, 5)
    Ydev_small = Ydev[:, indices]  # (30, 5)

    Ltrain_small = np.reshape(Ltrain_small,
                              ((n_patients_train * T), m_per_task))  # (1885, 5)
    Ldev_small = np.reshape(Ldev_small,
                            ((n_patients_dev * T), m_per_task))  # (150, 5)
    Ydev_small = np.reshape(Ydev_small, ((n_patients_dev * T),))  # (150,)

    MRI_data_temporal = {
        'Li_train': torch.LongTensor(Ltrain_small).view(n_patients_train,
                                                        (m_per_task * T)),
        'Li_dev': torch.LongTensor(Ldev_small).view(n_patients_dev,
                                                    (m_per_task * T)),
        'R_dev': torch.LongTensor(Ydev_small)[::T] * (2**T - 1),
        'm': m_per_task * T,
        'T': T
    }
    MRI_data_temporal['class_balance'] = normalize(
        (MRI_data_temporal['R_dev'].unsqueeze(1) == torch.arange(
            2**T,
            device=device).unsqueeze(0)).sum(0).float(),
        dim=0, p=1)

    max_seed = 10
    temporal_models = [None] * max_seed
    for seed in range(max_seed):
        markov_model = DPLabelModel(
            m=m_per_task * T,
            T=T,
            edges=[(i, i + m_per_task) for i in range((T - 1) * m_per_task)],
            coverage_sets=[[t] for t in range(T) for _ in range(m_per_task)],
            mu_sharing=[[t * m_per_task + i for t in range(T)]
                        for i in range(m_per_task)],
            phi_sharing=[[(t * m_per_task + i, (t + 1) * m_per_task + i)
                          for t in range(T - 1)]
                         for i in range(m_per_task)],
            device=device,
            class_balance=MRI_data_temporal['class_balance'],
            seed=seed)
        optimize(markov_model,
                 L_hat=MRI_data_temporal['Li_train'],
                 num_iter=1000,
                 lr=1e-5,
                 momentum=0.8,
                 clamp=True,
                 verbose=False,
                 seed=seed)
        temporal_models[seed] = markov_model

    for seed, model in enumerate(temporal_models):
        R_pred = model.predict(MRI_data_temporal['Li_dev'].cpu())
        F1 = metric_score(MRI_data_temporal['R_dev'].cpu() > 0,
                          R_pred.cpu() > 0, 'f1')
        accuracy = metric_score(MRI_data_temporal['R_dev'].cpu(),
                                R_pred.cpu(), 'accuracy')
        print(f"seed={seed} accuracy={accuracy:.3f} F1={F1:.3f}")
# In this section, we are going to train a generative model. This model estimates the most likely class for each training point given a set of label functions ($P(\hat{Y} \mid \lambda_{1 \ldots n})$). This is accomplished by estimating a parameter $\mu$, the probability of a label function emitting a 1 or 2 given the true class ($P(\lambda_{j} = y \mid Y = y)$). Once $\mu$ is estimated, the final step is to use it to calculate $P(\hat{Y} \mid \lambda_{1 \ldots n})$.

# In order to find the best model, we use hyperparameter optimization: trying multiple values and keeping the one that gives the best result. The challenge is that the search space is vast and can take non-trivial time to navigate; advanced methods such as [Bayesian optimization](https://en.wikipedia.org/wiki/Bayesian_optimization) exist to search it efficiently, but the generative model is fast enough to train that a simple grid search suffices here. The hyperparameter we tune is the l2 penalty, which penalizes a classifier for the magnitude of its weights; we optimize it for each label function individually.

# # Distant Supervision

# In this section we use the distant supervision paradigm to label our candidate sentences.

# ## Grid Search

# In[18]:

regularization_grid = np.round(np.linspace(0.01, 5, num=15), 2)

# In[19]:

grid_results = {}
label_model = LabelModel(k=2)
for param in tqdm_notebook(regularization_grid):
    label_model.train_model(correct_L[:, 0:7],
                            n_epochs=1000,
                            print_every=200,
                            seed=100,
                            lr=0.01,
                            l2=param)
    grid_results[str(param)] = label_model.predict_proba(correct_L_train[:, 0:7])

# In[20]:

acc_results = defaultdict(list)
for key in grid_results:
Ls.append(L_test)
Ds.append(D_test)

print(lf_summary(Ls[1], Y=Ys[1]))

# Empirical class balance of the test labels.
balance = sorted(Counter(Y_test).items())
balance2 = Counter(Y_test).values()
new_balance = []
for elem in balance:
    new_balance.append(elem[1] / sum(balance2))
print(sorted(Counter(Y_test).items()))
print(balance)
print(new_balance)

label_model = LabelModel(k=2, seed=123)
label_model.train_model(Ls[0],
                        class_balance=new_balance,
                        n_epochs=500,
                        log_train_every=50)
score = label_model.score((Ls[1], Ys[1]))

print('Trained Label Model Metrics:')
scores = label_model.score((Ls[1], Ys[1]),
                           metric=['accuracy', 'precision', 'recall', 'f1'])

mv = MajorityLabelVoter(seed=123)
print('Majority Label Voter Metrics:')
scores = mv.score((Ls[1], Ys[1]),
                  metric=['accuracy', 'precision', 'recall', 'f1'])
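# A more direct way to compute the same empirical class balance (a sketch,
# assuming Y_test holds integer labels in {1, 2}):
import numpy as np

new_balance = (np.bincount(np.asarray(Y_test) - 1) / len(Y_test)).tolist()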
              parallelism=PARALLEL)
L_train = labeler.get_label_matrices(train_cands)
L_gold_train = labeler.get_gold_labels(train_cands, annotator="gold")

from metal import analysis

analysis.lf_summary(
    L_train[0],
    lf_names=labeler.get_keys(),
    Y=L_gold_train[0].todense().reshape(-1).tolist()[0],
)

from metal.label_model import LabelModel

gen_model = LabelModel(k=2)
gen_model.train_model(L_train[0], n_epochs=500, verbose=False)
train_marginals = gen_model.predict_proba(L_train[0])

from fonduer.learning import LogisticRegression

disc_model = LogisticRegression()
disc_model.train((train_cands[0], F_train[0]),
                 train_marginals,
                 n_epochs=10,
                 lr=0.001)

from my_fonduer_model import MyFonduerModel

model = MyFonduerModel()
    transformed_data = np.zeros(data.shape, dtype=int)
    transformed_data[data == -1] = 1
    transformed_data[data == 1] = 2
    return transformed_data

train_ground = remap_labels(loader.train_ground)
val_ground = remap_labels(loader.val_ground)

L_train_sparse = sparse.csc_matrix(
    (remap_labels(L_train_sparse.data),
     L_train_sparse.indices,
     L_train_sparse.indptr)).T
L_val_sparse = sparse.csc_matrix(
    (remap_labels(L_val_sparse.data),
     L_val_sparse.indices,
     L_val_sparse.indptr)).T

print('\n\n####### Running METAL Label Model ########')
label_model = LabelModel()
label_model.train_model(L_train_sparse,
                        n_epochs=200,
                        print_every=50,
                        seed=123,
                        verbose=False)
train_marginals = label_model.predict_proba(L_train_sparse)
label_model.score((L_train_sparse, train_ground), metric=metrics)

####### METAL with Exact Class Balance ########
print('\n\n####### Running METAL Label Model with exact class balance ########')
train_class_balance = np.array([
    np.sum(train_ground == 1) / loader.train_num,
    np.sum(train_ground == 2) / loader.train_num
])
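# A plausible follow-up (a sketch only; the exact next step is not shown in
# this excerpt): retrain the label model with the exact class balance
# computed above, mirroring the earlier call, and re-score it.
label_model = LabelModel()
label_model.train_model(L_train_sparse,
                        class_balance=train_class_balance,
                        n_epochs=200,
                        print_every=50,
                        seed=123,
                        verbose=False)
label_model.score((L_train_sparse, train_ground), metric=metrics)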
"DaG_TEXT": dg_text, "CtD_DB_TEXT": cd_db + cd_text, "CtD_ALL": cd_db + cd_text + cd_bicluster, "All_the_jawns": cd_db + cd_text + cd_bicluster + cg_text + dg_text } # In[ ]: L = label_matricies['train'].toarray() L[L < 0] = 2 L_dev = label_matricies['dev'].toarray() L_dev[L_dev < 0] = 2 L_test = label_matricies['test'].toarray() L_test[L_test < 0] = 2 label_model = LabelModel(k=2, seed=100) # In[ ]: reg_param_grid = pd.np.round(pd.np.linspace(1e-1, 1, num=30), 3) grid_results = defaultdict(dict) for model in tqdm_notebook(model_dict): for reg_param in reg_param_grid: label_model.train(L[:, model_dict[model]], n_epochs=1000, verbose=False, lr=0.01, l2=reg_param) grid_results[model][str(reg_param)] = label_model.predict_proba( L_dev[:, model_dict[model]])[:, 0]
def train_model(args):
    hidden_size = 128
    num_classes = 2
    encode_dim = 1000  # using get_frm_output_size()

    L, Y = load_labels(args)

    # Label Model
    # labelling functions analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # training label model
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"], Y["dev"], n_epochs=500, log_train_every=50)

    # evaluating label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # comparison with majority vote of LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    Ytrain_p = label_model.predict_proba(L["train"])

    # End Model
    # Create datasets and dataloaders
    train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
    data_loader = get_data_loader(train, dev, test, args.batch_size, args.num_workers)

    # Define input encoder
    cnn_encoder = FrameEncoderOC

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define LSTM module
    lstm_module = LSTMModule(
        encode_dim,
        hidden_size,
        bidirectional=False,
        verbose=False,
        lstm_reduction="attention",
        encoder_class=cnn_encoder,
    )

    train_args = [data_loader["train"]]
    train_kwargs = {
        'seed': args.seed,
        'progress_bar': True,
        'log_train_every': 1}

    init_args = [[hidden_size, num_classes]]
    init_kwargs = {
        "input_module": lstm_module,
        "optimizer": "adam",
        "verbose": False,
        "input_batchnorm": True,
        "use_cuda": torch.cuda.is_available(),
        'checkpoint_dir': args.checkpoint_dir,
        'seed': args.seed,
        'device': device}

    search_space = {
        'n_epochs': [10],
        'batchnorm': [True],
        'dropout': [0.1, 0.25, 0.4],
        'lr': {'range': [1e-3, 1e-2], 'scale': 'log'},
        'l2': {'range': [1e-5, 1e-4], 'scale': 'log'},
    }

    log_config = {
        "log_dir": "./run_logs",
        "run_name": 'cnn_lstm_oc'
    }
    max_search = 5
    tuner_config = {"max_search": max_search}

    validation_metric = 'accuracy'

    # Set up logger and searcher
    tuner = RandomSearchTuner(EndModel,
                              **log_config,
                              log_writer_class=TensorBoardWriter,
                              validation_metric=validation_metric,
                              seed=1701)

    disc_model = tuner.search(
        search_space,
        valid_data=data_loader["dev"],
        train_args=train_args,
        init_args=init_args,
        init_kwargs=init_kwargs,
        train_kwargs=train_kwargs,
        max_search=tuner_config["max_search"],
        clean_up=False,
    )

    # evaluate end model
    disc_model.score(data_loader["dev"],
                     verbose=True,
                     metric=['accuracy', 'precision', 'recall', 'f1'])
labeler.apply(split=0,
              lfs=[president_name_pob_lfs],
              train=True,
              parallelism=PARALLEL)
L_train = labeler.get_label_matrices(train_cands)
L_gold_train = labeler.get_gold_labels(train_cands, annotator="gold")

from metal import analysis

analysis.lf_summary(
    L_train[0],
    lf_names=labeler.get_keys(),
    Y=L_gold_train[0].todense().reshape(-1).tolist()[0],
)

from metal.label_model import LabelModel

gen_model = LabelModel(k=2)
gen_model.train_model(L_train[0], n_epochs=500, print_every=100)
train_marginals = gen_model.predict_proba(L_train[0])

from fonduer.learning import LogisticRegression

disc_model = LogisticRegression()
disc_model.train((train_cands[0], F_train[0]),
                 train_marginals,
                 n_epochs=10,
                 lr=0.001)

from my_fonduer_model import MyFonduerModel

model = MyFonduerModel()
import fonduer_model

fonduer_model.save_model(
    fonduer_model=model,
                         'categorical')
L_test = convert_labels(label_matricies['test'].toarray(),
                        'plusminus',
                        'categorical')

validation_data = list(
    zip([L[:, :7], L[:, :24], L],
        [L_dev[:, :7], L_dev[:, :24], L_dev]))
test_data = list(
    zip([L[:, :7], L[:, :24], L],
        [L_test[:, :7], L_test[:, :24], L_test]))
model_labels = ["Distant Supervision (DS)", "DS+User Defined Rules", "All"]

# In[15]:

model_grid_search = {}
for model_data, model_label in zip(validation_data, model_labels):
    label_model = LabelModel(k=2, seed=100)
    grid_results = {}
    for param in regularization_grid:
        label_model.train_model(model_data[0],
                                n_epochs=1000,
                                verbose=False,
                                lr=0.01,
                                l2=param)
        grid_results[str(param)] = label_model.predict_proba(model_data[1])[:, 0]
    model_grid_search[model_label] = pd.DataFrame.from_dict(grid_results)

# In[16]:

model_grid_aucs = {}