def load_snorkel():
    """Load the six generative models saved as ``snorkel_model0`` .. ``snorkel_model5``.

    Returns:
        list: the six ``GenerativeModel`` instances, ordered by their numeric suffix.
    """

    def _restore(model_name):
        # A fresh model object is created for every saved name before loading into it.
        model = GenerativeModel()
        model.load(model_name)
        return model

    return [_restore('snorkel_model' + str(idx)) for idx in range(6)]
def __init__(self, *args, **kwargs):
    """Initialise the base agent, then load six generative models into ``self.models``.

    All positional and keyword arguments are forwarded unchanged to the parent
    class constructor.

    NOTE(review): ``filename`` is not bound inside this method; presumably it is a
    module-level name -- confirm, otherwise this raises ``NameError``.
    """
    super(SnorkelAgent, self).__init__(*args, **kwargs)
    # TODO: load model
    # self.models = np.load(filename)['m'].item()
    loaded = []
    for suffix in map(str, range(6)):
        model = GenerativeModel()
        model.load(filename + suffix)
        loaded.append(model)
    # Publish the list only after every model has been loaded.
    self.models = loaded
def train_snorkel_gen_model(L, gte=True):
    """Train a generative model on ``L`` and derive hard +1 / -1 labels from its marginals.

    Args:
        L: label matrix; it is converted with ``sparse.csr_matrix`` before training.
        gte: when True, marginals at or above the threshold become +1 and the rest
            -1; when False, marginals strictly below the threshold become +1.

    Returns:
        tuple: ``(gen_model, train_labels, train_marginals)``.
    """
    L_train = sparse.csr_matrix(L)
    gen_model = GenerativeModel()
    gen_model.train(
        L_train,
        epochs=100,
        decay=0.95,
        step_size=0.01 / L_train.shape[0],
        reg_param=1e-6,
    )
    train_marginals = gen_model.marginals(L_train)

    # Threshold at the midpoint of the observed marginal range. The previous
    # expression, (max - min) / 2, is half the *width* of the range and only
    # coincides with the midpoint when the minimum is 0.
    marginals_threshold = (max(train_marginals) + min(train_marginals)) / 2.0

    if gte:
        train_labels = 2 * (train_marginals >= marginals_threshold) - 1
    else:
        train_labels = 2 * (train_marginals < marginals_threshold) - 1
    return gen_model, train_labels, train_marginals
class Snorkeller:
    """Wraps a ``GenerativeModel`` that is trained on ranked lists and queried on target pairs."""

    def __init__(self, query_pairwise_bins_by_ranker: QueryPairwiseBinsByRanker):
        """Store the pairwise-bin lookup and create an untrained generative model."""
        self.query_pairwise_bins_by_ranker = query_pairwise_bins_by_ranker
        self.snorkel_gm = GenerativeModel()
        self.is_trained = False

    def train(self, train_ranked_lists_by_ranker: Dict[str, List[List[int]]]):
        """Build the label matrix from the rankings, select dependencies and train the model.

        Sets ``self.is_trained`` to True once training has returned.
        """
        L_train = get_L_from_rankings(train_ranked_lists_by_ranker)
        deps = DependencySelector().select(L_train, threshold=0.0)
        self.snorkel_gm.train(
            L_train,
            deps,
            epochs=100,
            decay=0.95,
            step_size=0.1 / L_train.shape[0],
            reg_param=1e-6,
        )
        self.is_trained = True

    def calc_marginals(self, target_info: List[TargetInfo]):
        """Return one marginal per entry of ``target_info``, in the input order.

        Entries whose element at index 3 is truthy are not passed to the model and
        receive a marginal of 1; all other entries receive the model's marginal.
        (Presumably index 3 marks randomly generated pairs -- confirm against the
        ``TargetInfo`` definition.)

        The result is filled through a boolean mask instead of the earlier
        hand-built permutation, which re-sliced two lists on every hit
        (quadratic) and failed on an empty input because the empty index array
        had a float dtype.

        Raises:
            ValueError: if the model returns a different number of marginals than
                the number of entries handed to it.
        """
        skipped = np.array([bool(info[3]) for info in target_info], dtype=bool)
        non_rand_target_info: List[TargetInfo] = [
            info for info, skip in zip(target_info, skipped) if not skip
        ]

        L = get_L_from_pairs(self.query_pairwise_bins_by_ranker, non_rand_target_info)
        marginals = self.snorkel_gm.marginals(L)

        all_marginals = np.ones(len(target_info), dtype=marginals.dtype)
        all_marginals[~skipped] = marginals
        return all_marginals
def Fitting_Gen_Model(L_train):
    """Train a generative model on ``L_train`` and return it with its training marginals.

    The learned ``lf_accuracy`` and ``class_prior`` weights are printed after training.

    Returns:
        tuple: ``(gen_model, train_marginals)`` where the marginals are the model's
        output for ``L_train`` (the noise-aware training label set).
    """
    model = GenerativeModel()
    step = 0.1 / L_train.shape[0]
    model.train(L_train, epochs=100, decay=0.95, step_size=step, reg_param=1e-6)

    # Diagnostic output of the learned weights.
    for field in ('lf_accuracy', 'class_prior'):
        print(getattr(model.weights, field))

    # Apply the model to the training candidates to obtain the training marginals.
    return model, model.marginals(L_train)
def __init__(
    self,
    positive_label: str,
    class_cardinality: int = 2,
    num_epochs: int = 500,
    log_train_every: int = 50,
    seed: int = 123,
    threshold: float = 0.5,
):
    """Store the configuration values and create the Snorkel helper objects.

    Every argument is kept on the instance under the same name. In addition a
    ``DependencySelector`` is stored as ``self.ds`` and a
    ``GenerativeModel(lf_propensity=True)`` as ``self.gen_model``.
    """
    # Snorkel components.
    self.ds = DependencySelector()
    self.gen_model = GenerativeModel(lf_propensity=True)

    # Label configuration.
    self.positive_label = positive_label
    self.class_cardinality = class_cardinality
    self.threshold = threshold

    # Training configuration.
    self.num_epochs = num_epochs
    self.log_train_every = log_train_every
    self.seed = seed
def main():
    '''Play Pommerman episodes, then train and save one generative model per action index.

    Four ``SimpleAgent`` instances play 300 episodes. Every (observation, action)
    pair is recorded, each labeling function returned by ``get_lf()`` is applied
    to every observation, and six generative models are trained and saved as
    ``snorkel_model0`` .. ``snorkel_model5``.
    '''
    num_actions = 6  # size of the one-hot action vector and number of models

    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [agents.SimpleAgent() for _ in range(4)]

    # Make the team-competition environment using the agent list
    env = pommerman.make('PommeTeamCompetition-v0', agent_list)

    d = []
    try:
        # Run the episodes just like OpenAI Gym
        for i_episode in range(300):
            state = env.reset()
            done = False
            while not done:
                cur_obs = env.get_observations()
                actions = env.act(state)
                for ob, act in zip(cur_obs, actions):
                    val = np.zeros(num_actions)
                    val[act] = 1
                    d.append([ob, val])
                state, reward, done, info = env.step(actions)
            print('Episode {} finished'.format(i_episode))
    finally:
        # Release the environment even when an episode raises.
        env.close()

    lf = get_lf()
    rows = len(d)
    # L[a, r, i]: output of labeling function i on observation r for index a.
    # NOTE(review): assumes each labeling function returns something that
    # broadcasts over the first axis (length 6) -- confirm against get_lf().
    L = np.zeros([num_actions, rows, len(lf)])
    for r, (ob, _) in enumerate(d):
        for i, f in enumerate(lf):
            L[:, r, i] = f(ob)

    # TODO: add ground labels to training
    filename = 'snorkel_model'
    for i in range(num_actions):
        gm = GenerativeModel()
        # L[i] is already 2-D (rows x labeling functions). The earlier np.squeeze
        # collapsed it to 1-D whenever there was a single row or a single
        # labeling function, which is not a valid label matrix.
        temp_l = L[i].astype(int)
        gm.train(temp_l)
        gm.save(filename + str(i))
def train_generative_model(data_matrix, burn_in=10, epochs=100, reg_param=1e-6,
                           step_size=0.001, deps=None, lf_propensity=False):
    """
    Train a generative model on a label matrix.

    data_matrix - the label function matrix which contains the output of all label functions
    burn_in - number of burn-in iterations
    epochs - number of epochs to train the model
    reg_param - how much regularization is needed for the model
    step_size - how much of the gradient will be used during training
    deps - dependency structure to add, if any; ``None`` (the default) or an
        empty sequence means no dependencies
    lf_propensity - boolean that determines whether the model should model the
        likelihood of a label function

    return a fully trained model
    """
    # ``deps`` was previously accepted but never passed on, so any dependency
    # structure a caller supplied was silently ignored. The default is now None
    # rather than a shared mutable list.
    deps = () if deps is None else deps

    model = GenerativeModel(lf_propensity=lf_propensity)
    model.train(
        data_matrix,
        deps=deps,
        epochs=epochs,
        burn_in=burn_in,
        reg_param=reg_param,
        step_size=step_size,
        reg_type=2
    )
    return model
def train_gen_model(predicate_resume, parallelism=8):
    """Train a generative model for ``predicate_resume`` and persist it with its marginals.

    Steps, each announced through ``logging.info``: load the label matrix, train
    the model (``parallelism`` is passed as the ``threads`` argument), save the
    model, compute the training marginals and save them.
    """
    logging.info("Start train gen")
    session = SnorkelSession()
    labeler = _get_labeler(predicate_resume)

    logging.info("Load matrix")
    L_train = _load_matrix(predicate_resume, session, labeler)

    gen_model = GenerativeModel()
    logging.info("Train model")
    train_options = dict(
        epochs=100,
        decay=0.95,
        step_size=0.1 / L_train.shape[0],
        reg_param=1e-6,
        threads=int(parallelism),
    )
    gen_model.train(L_train, **train_options)

    logging.info("Save model")
    _save_model(predicate_resume, gen_model)

    # Save marginals
    logging.info("Get marginals")
    train_marginals = gen_model.marginals(L_train)
    logging.info("Save marginals")
    save_marginals(session, L_train, train_marginals)
def train_gen_model(self, deps=False, grid_search=False):
    """
    Train the appropriate generative model on ``self.L_train`` and store it in ``self.gen_model``.

    Snorkel's ``GenerativeModel`` is used when ``self.has_snorkel`` is truthy,
    otherwise ``LabelAggregator``. ``deps`` and ``grid_search`` are accepted but
    not read by this method.
    """
    if not self.has_snorkel:
        fallback = LabelAggregator()
        fallback.train(self.L_train, rate=1e-3, mu=1e-6, verbose=False)
        self.gen_model = fallback
        return

    # TODO: GridSearch
    from snorkel.learning import GenerativeModel, RandomSearch
    from snorkel.learning.structure import DependencySelector

    snorkel_model = GenerativeModel()
    snorkel_model.train(
        self.L_train,
        epochs=100,
        decay=0.001 ** (1.0 / 100),
        step_size=0.005,
        reg_param=1.0,
    )
    self.gen_model = snorkel_model
def score_gen_model(predicate_resume, session, gen_model_name=None, parallelism=16):
    """Load a saved generative model, train it on the training label matrix and report LF stats.

    Args:
        predicate_resume: mapping read for ``"predicate_name"`` and ``"label_group"``.
        session: database session handed to the loading helpers.
        gen_model_name: name of the saved model to load; when None the name
            ``"G" + predicate_name + "Latest"`` is used.
        parallelism: accepted but not read by this function.
    """
    # Previously ``model_name`` was only bound in the ``None`` branch, so passing
    # an explicit ``gen_model_name`` raised NameError at ``gen_model.load``.
    if gen_model_name is None:
        model_name = "G" + predicate_resume["predicate_name"] + "Latest"
    else:
        model_name = gen_model_name

    logging.info("Stats logging")
    # NOTE(review): the next two values are not used below; kept because the
    # lookup and the helper call may have effects callers rely on -- confirm.
    key_group = predicate_resume["label_group"]
    train_cids_query = get_train_cids_with_span(predicate_resume, session)

    L_train = load_ltrain(predicate_resume, session)
    gen_model = GenerativeModel()
    gen_model.load(model_name)
    # NOTE(review): the freshly loaded model is trained again right away --
    # confirm this is intended.
    gen_model.train(
        L_train,
        epochs=100,
        decay=0.95,
        step_size=0.1 / L_train.shape[0],
        reg_param=1e-6,
    )
    logging.info(gen_model.weights.lf_accuracy)
    print(gen_model.weights.lf_accuracy)

    train_marginals = gen_model.marginals(L_train)
    fig = plt.figure()
    #hist=plt.hist(train_marginals, bins=20)
    #plt.savefig("plt"+strftime("%d-%m-%Y_%H_%M_%S", gmtime())+".png", dpi=fig.dpi)
    gen_model.learned_lf_stats()
def apply_GenMod(L_train):
    """
    Train a generative model (``cardinality=3``) on the label matrix, append its
    learned LF statistics to ``report`` and save the training marginals.

    ``report`` and ``session`` are names from the enclosing module.

    :param L_train: Label matrix
    :return: None
    """
    model = GenerativeModel()
    # Earlier configuration, kept for reference:
    # model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6)
    model.train(L_train, cardinality=3)
    marginals = model.marginals(L_train)

    report.append('\n#Gen Model Stats\n')
    stats_text = model.learned_lf_stats().to_csv(sep=' ', index=False, header=True)
    report.append(stats_text)

    save_marginals(session, L_train, marginals)
# In[ ]:

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the bare print statements used before.
print("Total Data Shape:")
print(L_train.shape)
print("")

# # Train the Generative Model
# This is the first classification step of this project, where we train a
# generative model to determine the correct label each candidate will receive.
# Snorkel's generative model uses Gibbs sampling on a
# [factor graph](http://deepdive.stanford.edu/assets/factor_graph.pdf) to
# generate the probability of a potential candidate being a true candidate
# (label of 1).

# In[ ]:

from snorkel.learning import GenerativeModel

gen_model = GenerativeModel()
get_ipython().magic(
    u'time gen_model.train(L_train, epochs=10, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6, threads=50, verbose=True)'
)

# In[ ]:

get_ipython().magic(u'time train_marginals = gen_model.marginals(L_train)')

# In[ ]:

gen_model.learned_lf_stats()

# In[ ]:

plt.hist(train_marginals, bins=20)
def _open_csv_for_write(path):
    """Open *path* for ``csv.writer`` in the mode the running Python version needs."""
    import sys

    if sys.version_info[0] >= 3:
        # On Python 3 csv.writer emits str, so the file must be opened in text
        # mode; newline='' is what the csv module documentation asks for.
        return open(path, 'w', newline='')
    # On Python 2 the csv module expects a binary file.
    return open(path, 'w+b')


def score_lfs(predicate_resume, L_gold_test, session, date_time, parallelism=8):
    """Apply the labeling functions to the test candidates and dump several CSV reports.

    Files written under ``./results/`` (suffix ``<predicate_name><date_time>.csv``):
      * ``lfs_1_``      - LF statistics of the test label matrix
      * ``test_gen_1_`` - precision / recall / F1 of the generative model
      * ``gen_2_``      - learned LF statistics of the generative model
      * ``gen_3_``      - TP / FP / TN / FN counts from the error analysis
      * ``gen_4_``      - LF statistics against gold labels with learned accuracies

    Args:
        predicate_resume: mapping read for ``"predicate_name"`` and ``"label_group"``.
        L_gold_test: gold labels used for scoring.
        session: database session.
        date_time: string appended to every output file name.
        parallelism: passed to ``labeler.apply``.

    NOTE(review): the generative model is trained on the same test matrix it is
    scored on -- confirm this is intended.
    """

    def result_path(prefix):
        # All report files share the same directory and suffix.
        return ("./results/" + prefix + predicate_resume["predicate_name"] +
                date_time + ".csv")

    dump_file_path = result_path("lfs_1_")
    key_group = predicate_resume["label_group"]
    LFs = get_labelling_functions(predicate_resume)
    labeler = LabelAnnotator(lfs=LFs)
    test_cids_query = get_test_cids_with_span(predicate_resume, session)
    L_test = labeler.apply(parallelism=parallelism,
                           cids_query=test_cids_query,
                           key_group=key_group,
                           clear=True,
                           replace_key_set=False)

    data_frame = L_test.lf_stats(session)
    print(data_frame)
    logging.info(data_frame)
    data_frame.to_csv(dump_file_path)

    gen_model = GenerativeModel()
    gen_model.train(L_test,
                    epochs=100,
                    decay=0.95,
                    step_size=0.1 / L_test.shape[0],
                    reg_param=1e-6)

    p, r, f1 = gen_model.score(L_test, L_gold_test)
    score_line = "Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1)
    print(score_line)
    logging.info(score_line)

    with _open_csv_for_write(result_path("test_gen_1_")) as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["Precision", "Recall", "F1"])
        writer.writerow(
            ["{0:.3f}".format(p), "{0:.3f}".format(r), "{0:.3f}".format(f1)])

    # Only needed by the (disabled) histogram below.
    test_marginals = gen_model.marginals(L_test)
    #plt.hist(test_marginals, bins=20)
    #plt.savefig(result_path("plt_1_"))
    #plt.show()

    data_frame3 = gen_model.learned_lf_stats()
    data_frame3.to_csv(result_path("gen_2_"))

    tp, fp, tn, fn = gen_model.error_analysis(session, L_test, L_gold_test)
    with _open_csv_for_write(result_path("gen_3_")) as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["TP", "FP", "TN", "FN"])
        writer.writerow(
            [str(len(tp)), str(len(fp)), str(len(tn)), str(len(fn))])

    data_frame4 = L_test.lf_stats(session, L_gold_test,
                                  gen_model.learned_lf_stats()['Accuracy'])
    data_frame4.to_csv(result_path("gen_4_"))
# Load the external gold annotations, then the gold label matrices, for
# split 1 (dev) and split 2 (test).
missed = load_external_labels(session, VirusHost, split=1, annotator_name='gold')
L_gold_dev = load_gold_labels(session, split=1, annotator_name='gold')

missed = load_external_labels(session, VirusHost, split=2, annotator_name='gold')
L_gold_test = load_gold_labels(session, split=2, annotator_name='gold')

# Generative model: select LF dependencies, train, then compute the
# training marginals.
ds = DependencySelector()
deps = ds.select(L_train, threshold=0.1)

gen_model = GenerativeModel()
gen_model.train(
    L_train,
    deps=deps,
    epochs=100,
    decay=0.95,
    step_size=0.1 / L_train.shape[0],
    reg_param=1.00e-03,
)
train_marginals = gen_model.marginals(L_train)

# Discriminative model: load the feature matrices of all three splits.
featurizer = FeatureAnnotator(f=hybrid_span_mention_ftrs)
F_train = featurizer.load_matrix(session, split=0)
F_dev = featurizer.load_matrix(session, split=1)
F_test = featurizer.load_matrix(session, split=2)
# Query the evaluation candidates, ordered by id.
eval_cands = (
    session.query(candidate_class)
    .filter(candidate_class.split == eval_split)
    .order_by(candidate_class.id)
    .all()
)
print(f'Loaded {len(eval_cands)} candidates...')

# Apply the labeling functions to the evaluation split.
print("Applying LFs...")
from snorkel.annotations import LabelAnnotator

labeler = LabelAnnotator(lfs=LFs)
L_eval = labeler.apply(split=eval_split, parallelism=parallelism)

# Create the generative model.
from snorkel.learning import GenerativeModel

gen_model = GenerativeModel()

# Name and directory of the saved weights; both were provided when the model
# was saved.
model_name = 'Price_Gen_20K'
save_dir = '/dfs/scratch0/jdunnmon/data/memex-data/extractor_checkpoints/Price_Gen_20K'

# Load the saved weights.
print("Loading generative model...")
gen_model.load(model_name=model_name, save_dir=save_dir, verbose=True)

# Compute the generative model's marginals on the evaluation label matrix.
print("Evaluating marginals...")
eval_marginals = gen_model.marginals(L_eval)

# Geocoding
from gm_utils import create_extractions_dict
def __init__(self, query_pairwise_bins_by_ranker: QueryPairwiseBinsByRanker):
    """Keep the pairwise-bin lookup and start with a new, untrained generative model."""
    # Nothing has been trained yet.
    self.is_trained = False
    self.snorkel_gm = GenerativeModel()
    self.query_pairwise_bins_by_ranker = query_pairwise_bins_by_ranker
# Extract candidates from the sentences and report how many were stored.
cand_extractor.apply(sents)
print("Number of candidates:", session.query(pairs).count())

# Apply the labeling functions and show their statistics.
labeler = LabelAnnotator(lfs=LFs)
L_train = labeler.apply()
print(L_train.lf_stats(session))

# Generative model; the training marginals are the probabilistic training labels.
gen_model = GenerativeModel()
gen_model.train(
    L_train,
    reg_param=1e-6,
    step_size=0.1 / L_train.shape[0],
    decay=0.95,
    epochs=100,
)
print(gen_model.weights.lf_accuracy)

train_marginals = gen_model.marginals(L_train)
plt.hist(train_marginals, bins=20)
plt.show()
print(gen_model.learned_lf_stats())
#L_dev = labeler.apply_existing()
class SnorkelCollator(Collator):
    """Collate several annotation sets into one by fitting a Snorkel generative model.

    Each annotation set is treated as one labeling function. Tags are mapped to
    integers by ``get_snorkel_index``, a generative model is trained on the
    resulting word-by-function matrix, and its marginals are converted back to
    tags.
    """

    def __init__(
        self,
        positive_label: str,
        class_cardinality: int = 2,
        num_epochs: int = 500,
        log_train_every: int = 50,
        seed: int = 123,
        threshold: float = 0.5,
    ):
        """Store the configuration and create the dependency selector and generative model.

        NOTE(review): ``num_epochs``, ``log_train_every`` and ``seed`` are stored
        but not read anywhere in this class -- confirm whether training should
        use them.
        """
        self.positive_label = positive_label
        self.class_cardinality = class_cardinality
        self.num_epochs = num_epochs
        self.log_train_every = log_train_every
        self.seed = seed
        self.ds = DependencySelector()
        self.gen_model = GenerativeModel(lf_propensity=True)
        self.threshold = threshold

    @classmethod
    def get_snorkel_index(cls, tag: str) -> int:
        """Map a tag to 1 (positive), 0 (negative) or -1 (anything else)."""
        if is_positive(tag):
            return 1
        if is_negative(tag):
            return 0
        return -1

    def get_tag(self, index: int) -> str:
        """Return ``self.positive_label`` for index 1 and ``NEGATIVE_LABEL`` otherwise."""
        if index == 1:
            return self.positive_label
        return NEGATIVE_LABEL

    def get_index(self, prob: np.ndarray) -> int:
        """Return the position of the larger of two probabilities.

        The return annotation used to be ``str``; ``argmax`` yields an integer index.
        """
        assert prob.shape == (2, )
        return prob.argmax()

    def collate_np(
        self, annotations
    ) -> Tuple[np.ndarray, List[str], Dict[int, Tuple[int, int]]]:
        """Stack all annotation sets into one (total_words x num_functions) matrix.

        Returns:
            tuple: ``(matrix, words, id_to_labels)`` where ``words`` lists every
            word of every entry in order and ``id_to_labels`` maps an entry id to
            the ``(start, end)`` row range of that entry. The third element is a
            dict; the previous annotation declared it as ``List[int]``.
        """
        output_arrs: List[np.ndarray] = []
        words_list: List[str] = []
        id_to_labels: Dict[int, Tuple[int, int]] = {}
        num_funcs = len(annotations)

        # One iteration per entry; ann_inst holds that entry as annotated by
        # every function.
        for ann_inst in tqdm(enumerate(zip(*annotations))):
            ann_inst = ann_inst[1]
            ids = [inst['id'] for inst in ann_inst]
            inputs = [inst['input'] for inst in ann_inst]
            outputs = [inst['output'] for inst in ann_inst]
            # The words and id are taken from the first annotation set.
            words = inputs[0]
            entry_id = ids[0]

            # output arr = (sentence x num_labels)
            output_arr = np.zeros((len(words), num_funcs))
            # Distinct index names: the inner loops used to reuse ``i`` and
            # shadow the outer loop counter.
            for func_idx, output in enumerate(outputs):
                for word_idx, tag in enumerate(output):
                    output_arr[word_idx, func_idx] = SnorkelCollator.get_snorkel_index(tag)

            label_start = len(words_list)
            words_list.extend(words)
            output_arrs.append(output_arr)
            id_to_labels[entry_id] = (label_start, len(words_list))

        output_res = np.concatenate(output_arrs, axis=0)
        return output_res, words_list, id_to_labels

    def train_label_model(
        self,
        collated_labels: np.ndarray,
        descriptions: Optional[List[str]],
        train_data_np: Optional[np.ndarray],
    ):
        """Select dependencies on the label matrix and train the generative model.

        ``descriptions``, when given, are logged together with their positions.
        ``train_data_np`` is accepted but not read here.
        """
        sparse_labels = sparse.csr_matrix(collated_labels.astype(int))
        if descriptions is not None:
            descriptions = [(i, desc) for i, desc in enumerate(descriptions)]
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(f'labeling function order: {descriptions}')
        deps = self.ds.select(sparse_labels, threshold=0.05)
        self.gen_model.train(
            sparse_labels,
            deps=deps,
            decay=0.95,
            step_size=0.1 / sparse_labels.shape[0],
            reg_param=0.0,
            cardinality=self.class_cardinality,
        )

    def get_probabilistic_labels(self, collated_labels: np.ndarray) -> np.ndarray:
        """Return the trained model's marginals for the given label matrix."""
        sparse_labels = sparse.csr_matrix(collated_labels)
        return self.gen_model.marginals(sparse_labels)

    def convert_to_tags(
        self,
        train_probs: np.ndarray,
        word_list: List[str],
        id_to_labels: Dict[int, Tuple[int, int]],
    ) -> List[AnnotatedDataType]:
        """Turn per-word marginals back into one tagged entry per id."""
        output = []
        for entry_id, (label_start, label_end) in id_to_labels.items():
            words = word_list[label_start:label_end]
            prob_labels = train_probs[label_start:label_end]
            if self.class_cardinality == 2:
                # (m, ) marginals in prob labels
                label_ids = (prob_labels > self.threshold).astype(int)
            else:
                # (m, k) marginals in prob labels
                label_ids = prob_labels.argmax(axis=1)
            labels = [self.get_tag(i) for i in label_ids]
            output.append({
                'id': entry_id,
                'input': words,
                'output': labels,
            })
        return output

    def collate(
        self,
        annotations: List[AnnotatedDataType],
        should_verify: bool = False,
        descriptions: Optional[List[str]] = None,
        train_data: Optional[AnnotatedDataType] = None
    ) -> AnnotatedDataType:
        '''
        args:
            ``annotations``: List[AnnotatedDataType]
                given a series of annotations, collate them into a single
                series of annotations per instance
            ``should_verify``: run ``Collator.verify_annotations`` first
            ``descriptions``: optional labeling-function names, logged during training
            ``train_data``: optional annotated data, flattened to an integer
                vector and handed to ``train_label_model``
        '''
        if should_verify:
            # make sure the annotations are in the
            # proper format
            Collator.verify_annotations(annotations)
        train_data_np = None
        if train_data:
            # if train data specified, will be used by Snorkel to estimate class balance
            train_data_np, word_lists, id_to_labels = self.collate_np(
                [train_data])
            train_data_np = train_data_np.astype(int)
            train_data_np = train_data_np.reshape(-1)
        collate_np, word_lists, id_to_labels = self.collate_np(annotations)
        self.train_label_model(collated_labels=collate_np,
                               descriptions=descriptions,
                               train_data_np=train_data_np)
        y_train_probs = self.get_probabilistic_labels(
            collated_labels=collate_np, )
        tags = self.convert_to_tags(y_train_probs,
                                    word_list=word_lists,
                                    id_to_labels=id_to_labels)
        return tags
dg_text = get_columns(session, L_train, DG_LFS, "DaG_TEXT") # In[ ]: # This block defines a list of label function columns defined above lfs_columns = [cg_text] # This block specifies the labels for the above label function columns model_names = ["CbG_TEXT"] # In[ ]: indep_models = [] for columns in lfs_columns: #Conditionally independent Generative Model indep_gen_model = GenerativeModel() indep_gen_model.train( L_train[:, columns], epochs=10, decay=0.95, step_size=0.1 / L_train[:, columns].shape[0], reg_param=1e-6, threads=50, ) indep_models.append(indep_gen_model) # In[ ]: dep_models = [] for columns in lfs_columns: # select the dependancies from the label matrix
loader.val_object_height, loader.val_ground, print_stats_table=True) metrics = ["accuracy", "precision", "recall", "f1"] # ####### Majority Vote ######## # mv_labels = np.sign(np.sum(L.T,1)) # print ('Coverage of Majority Vote on Train Set: ', np.sum(np.sign(np.sum(np.abs(L.T),1)) != 0)/float(loader.train_num)) # print ('Accuracy of Majority Vote on Train Set: ', np.sum(mv_labels == loader.train_ground)/float(loader.train_num)) ######################## ####### Snorkel ######## ######################## print('\n\n\n####### Running Snorkel Generative Model ########') gen_model = GenerativeModel() gen_model.train(L_train, epochs=100, decay=0.95, step_size=0.01 / L_train.shape[0], reg_param=1e-6) print(gen_model.score(L_train_sparse, loader.train_ground)) ###################### ####### METAL ######## ###################### # remap labels so that they are in {1, 2} def remap_labels(data): transformed_data = np.zeros(data.shape, dtype=np.int)
# Labeling Function Performance - Coverage, Overlaps, Conflicts L_train_BC.lf_stats(session) L_train_BD.lf_stats(session) L_train_BM.lf_stats(session) L_train_BT.lf_stats(session) # Analyzing Dependencies Ldeps = [] for L in [L_train_BC, L_train_BD, L_train_BD, L_train_BD]: ds = DependencySelector() deps = ds.select(L, threshold=0.1) len(deps) Ldeps.append(deps) gen_model = GenerativeModel(lf_propensity=True) gen_model.train(L_train, deps=deps, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=0.0) train_marginals = gen_model.marginals(L_train) plt.hist(train_marginals, bins=20) plt.show() gen_model.learned_lf_stats() save_marginals(session, L_train, train_marginals) load_external_labels(session, BiomarkerCondition, 'Biomarker', 'Condition', 'articles/disease_gold_labels.tsv',
# Labeling functions applied to every candidate.
LFs = [
    LF_time, LF_date, LF_location, LF_person, LF_org, LF_url, LF_phone,
    LF_product, LF_event, LF_email, LF_address, LF_job, LF_file,
    LF_file_before
]

labeler = LabelAnnotator(lfs=LFs)
np.random.seed(1701)
L_train = labeler.apply(split=0)
# print(...) with one argument works on both Python 2 and Python 3; the
# previous print statement was Python 2 only.
print(L_train.lf_stats(session))
# NOTE(review): the dense matrix is discarded (notebook leftover) -- confirm
# it can be removed.
L_train.todense()

from snorkel.learning import GenerativeModel

# Train the generative model with cardinality=3 and compute the training marginals.
gen_model = GenerativeModel()
gen_model.train(L_train, cardinality=3)
train_marginals = gen_model.marginals(L_train)
# assert np.all(train_marginals.sum(axis=1) - np.ones(3) < 1e-10)
# train_marginals

from snorkel.annotations import save_marginals, load_marginals

save_marginals(session, L_train, train_marginals)

from snorkel.annotations import FeatureAnnotator

featurizer = FeatureAnnotator()
F_train = featurizer.apply(split=0)
u'time L_train_BC = BC_labeler.load_matrix(session, split=0)') L_train_BC # In[ ]: L_train_BC.get_candidate(session, 0) # In[ ]: L_train_BC.get_key(session, 0) # In[ ]: from snorkel.learning import GenerativeModel gen_model = GenerativeModel() gen_model.train(L_train_BC, epochs=100, decay=0.95, step_size=0.1 / L_train_BC.shape[0], reg_param=1e-6) # In[ ]: gen_model.weights.lf_accuracy # In[ ]: train_marginals = gen_model.marginals(L_train_BC) # In[ ]: