def setUpClass(cls):
    """Bind a session to the spouses.db SQLite DB located next to this file.

    This is a hack to create a session to a different DB after Snorkel has
    already been imported. It does not work in general because e.g., the UDF
    constructor uses Snorkel's new_sessionmaker on different processes.
    In general, the connection should still be set via the SNORKELDB
    environment variable.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # BUG FIX: keep the 'sqlite:///' scheme outside os.path.join — joining
    # the scheme with the directory uses the platform path separator, which
    # would corrupt the URL on Windows.
    snorkel_engine = create_engine(
        'sqlite:///' + os.path.join(dir_path, 'spouses.db'))
    SnorkelSession = sessionmaker(bind=snorkel_engine)
    cls.session = SnorkelSession()

    Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
    cls.train_marginals = load_marginals(cls.session, split=0)

    # Splits: 0 = train, 1 = dev, 2 = test; order by id for reproducibility.
    cls.train_cands = cls.session.query(Spouse).filter(
        Spouse.split == 0).order_by(Spouse.id).all()
    cls.dev_cands = cls.session.query(Spouse).filter(
        Spouse.split == 1).order_by(Spouse.id).all()
    cls.test_cands = cls.session.query(Spouse).filter(
        Spouse.split == 2).order_by(Spouse.id).all()

    # Each candidate is featurized as 10 floats. The first five are between
    # -.25 and 1 if the class label is True and between -1 and .25 if False.
    # The remaining five are between -1 and 1.
    cls.F_train = load_feature_matrix(cls.session, split=0, coerce_int=False)
    cls.F_dev = load_feature_matrix(cls.session, split=1, coerce_int=False)
    cls.F_test = load_feature_matrix(cls.session, split=2, coerce_int=False)

    cls.L_gold_dev = load_gold_labels(cls.session, annotator_name='gold',
                                      split=1)
    cls.L_gold_test = load_gold_labels(cls.session, annotator_name='gold',
                                       split=2)
def __init__(self, name, data_path, lfs=None, verbose=True):
    """Load CDR (chemical-disease) candidates and labels from a Snorkel DB.

    :param name: dataset name, passed through to the base class
    :param data_path: path to an existing SQLite Snorkel database file
    :param lfs: if falsy, training labels fall back to 0/1 gold labels
        instead of the stored marginals
    :param verbose: print a per-split summary when True
    """
    super(CdrDataset, self).__init__(name, verbose)
    assert os.path.exists(data_path)
    # SNORKELDB must be set *before* importing snorkel, otherwise an empty
    # snorkel.db is created at import time.
    os.environ['SNORKELDB'] = "sqlite:///{}".format(data_path)
    logger.info("SQL connection {}".format(os.environ['SNORKELDB']))
    from snorkel import SnorkelSession
    from snorkel.models import candidate_subclass
    from snorkel.annotations import load_gold_labels, load_marginals

    self.session = SnorkelSession()
    self.class_type = candidate_subclass('ChemicalDisease',
                                         ['chemical', 'disease'])
    # Splits: 0 = train, 1 = dev, 2 = test
    self.X_train = self.session.query(
        self.class_type).filter(self.class_type.split == 0).all()
    self.X_dev = self.session.query(
        self.class_type).filter(self.class_type.split == 1).all()
    self.X_test = self.session.query(
        self.class_type).filter(self.class_type.split == 2).all()

    if self.verbose:
        splits = {
            "Train": self.X_train,
            "Dev": self.X_dev,
            "Test": self.X_test
        }
        self.print_summary(splits)

    if len(self.X_train) == 0:
        logger.error("Fatal error - no candidates found in database")
        # BUG FIX: exit with a non-zero status — a bare sys.exit() exits
        # with status 0, signalling success to the caller on a fatal error.
        sys.exit(1)

    self.y_train = load_marginals(self.session, split=0)
    self.y_gold_train = load_gold_labels(self.session, annotator_name='gold',
                                         split=0)
    self.y_gold_dev = load_gold_labels(self.session, annotator_name='gold',
                                       split=1)
    self.y_gold_test = load_gold_labels(self.session, annotator_name='gold',
                                        split=2)

    # Gold labels come back in {-1, +1}; map them to {0, 1} marginals.
    self.y_dev = (np.ravel(self.y_gold_dev.toarray()) + 1.) / 2
    self.y_test = (np.ravel(self.y_gold_test.toarray()) + 1.) / 2
    if not lfs:
        # No labeling functions: train on 0/1 gold labels instead of the
        # stored marginals. (The former else-branch was a dead no-op
        # `self.y_train = self.y_train` and has been removed.)
        self.y_train = (np.ravel(self.y_gold_train.toarray()) + 1.) / 2
def Loading_sets(session, T1):
    """Load gold labels for the dev, test, and train splits.

    Generating and modeling noisy training labels using a labeled
    development set.

    :param session: Snorkel database session
    :param T1: candidate subclass whose external labels are imported
    :return: tuple (L_gold_dev, L_gold_test, L_gold_train)
    """
    # Called for its side effect of importing external gold annotations
    # into the DB; the return value was previously bound to an unused
    # local (`missed`), which has been removed.
    load_external_labels(session, T1, annotator_name='gold')
    L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
    L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
    L_gold_train = load_gold_labels(session, annotator_name='gold', split=0)
    return L_gold_dev, L_gold_test, L_gold_train
def __init__(self, name, data_path, lfs=None):
    """Load Spouse candidates and labels from a Snorkel DB.

    :param name: dataset name, passed through to the base class
    :param data_path: path to an existing SQLite Snorkel database file
    :param lfs: if falsy, training labels fall back to 0/1 gold labels
        instead of the stored marginals
    """
    super(SpouseDataset, self).__init__(name)
    # SNORKELDB must be set *before* importing snorkel, otherwise an empty
    # snorkel.db is created at import time.
    os.environ['SNORKELDB'] = "sqlite:///{}".format(data_path)
    logger.info("SQL connection {}".format(os.environ['SNORKELDB']))
    from snorkel import SnorkelSession
    from snorkel.models import candidate_subclass
    from snorkel.annotations import load_gold_labels, load_marginals

    self.session = SnorkelSession()
    self.class_type = candidate_subclass('Spouse', ['person1', 'person2'])
    # Splits: 0 = train, 1 = dev, 2 = test
    self.X_train = self.session.query(
        self.class_type).filter(self.class_type.split == 0).all()
    self.X_dev = self.session.query(
        self.class_type).filter(self.class_type.split == 1).all()
    self.X_test = self.session.query(
        self.class_type).filter(self.class_type.split == 2).all()

    # NOTE(review): self.verbose is not set in this constructor —
    # presumably set by the base-class __init__; confirm.
    if self.verbose:
        splits = {
            "Train": self.X_train,
            "Dev": self.X_dev,
            "Test": self.X_test
        }
        self.print_summary(splits)

    self.y_train = load_marginals(self.session, split=0)
    self.y_gold_train = load_gold_labels(self.session, annotator_name='gold',
                                         split=0)
    self.y_gold_dev = load_gold_labels(self.session, annotator_name='gold',
                                       split=1)
    self.y_gold_test = load_gold_labels(self.session, annotator_name='gold',
                                        split=2)

    # Gold labels come back in {-1, +1}; map them to {0, 1} marginals.
    self.y_dev = (np.ravel(self.y_gold_dev.toarray()) + 1.) / 2
    self.y_test = (np.ravel(self.y_gold_test.toarray()) + 1.) / 2
    if not lfs:
        # No labeling functions: train on 0/1 gold labels instead of the
        # stored marginals. (The former else-branch was a dead no-op
        # `self.y_train = self.y_train` and has been removed.)
        self.y_train = (np.ravel(self.y_gold_train.toarray()) + 1.) / 2
def test_LF(session, lf, split, annotator_name):
    """Score a single labeling function against annotator labels.

    Returns the MentionScorer score for the given split, which also
    contains the error buckets of the candidates.
    """
    candidates = session.query(Candidate).filter(
        Candidate.split == split).all()
    gold = load_gold_labels(session, annotator_name=annotator_name,
                            split=split)
    scorer = MentionScorer(candidates, gold)
    # Shift LF votes in {-1, 0, 1} to marginals in {0.0, 0.5, 1.0}.
    marginals = np.array([0.5 * (lf(c) + 1) for c in candidates])
    return scorer.score(marginals,
                        set_unlabeled_as_neg=False,
                        set_at_thresh_as_neg=False)
def setUpClass(cls):
    """Bind a session to the spouses.db SQLite DB located next to this file.

    This is a hack to create a session to a different DB after Snorkel has
    already been imported. It does not work in general because e.g., the UDF
    constructor uses Snorkel's new_sessionmaker on different processes.
    In general, the connection should still be set via the SNORKELDB
    environment variable.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # BUG FIX: join only the filesystem path and prepend the URL scheme —
    # os.path.join on 'sqlite:///' + dir_path would insert the platform
    # path separator into the URL (broken on Windows).
    snorkel_engine = create_engine(
        'sqlite:///' + os.path.join(dir_path, 'spouses.db'))
    SnorkelSession = sessionmaker(bind=snorkel_engine)
    cls.session = SnorkelSession()

    Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
    cls.train_marginals = load_marginals(cls.session, split=0)

    # Splits: 0 = train, 1 = dev, 2 = test; order by id for reproducibility.
    cls.train_cands = cls.session.query(Spouse).filter(
        Spouse.split == 0).order_by(Spouse.id).all()
    cls.dev_cands = cls.session.query(Spouse).filter(
        Spouse.split == 1).order_by(Spouse.id).all()
    cls.test_cands = cls.session.query(Spouse).filter(
        Spouse.split == 2).order_by(Spouse.id).all()

    # Each candidate is featurized as 10 floats. The first five are between
    # -.25 and 1 if the class label is True and between -1 and .25 if False.
    # The remaining five are between -1 and 1.
    cls.F_train = load_feature_matrix(cls.session, split=0, coerce_int=False)
    cls.F_dev = load_feature_matrix(cls.session, split=1, coerce_int=False)
    cls.F_test = load_feature_matrix(cls.session, split=2, coerce_int=False)

    cls.L_gold_dev = load_gold_labels(cls.session, annotator_name='gold',
                                      split=1)
    cls.L_gold_test = load_gold_labels(cls.session, annotator_name='gold',
                                       split=2)
def get_gold_test_matrix(predicate_resume, session):
    """Import Brat gold annotations for the test split and load them.

    :param predicate_resume: dict describing the predicate; must contain
        a 'candidate_subclass' entry
    :param session: Snorkel database session
    :return: gold label matrix for the test split (annotator 'brat')
    """
    # Renamed local to avoid shadowing the snorkel `candidate_subclass`
    # factory name.
    subclass = predicate_resume["candidate_subclass"]
    brat = BratAnnotator(session, subclass, encoding='utf-8')
    cids_query = get_test_cids_with_span(predicate_resume, session)
    cands = get_test_cands_with_span(predicate_resume, session).all()
    brat.import_gold_labels(session,
                            get_collection_name(predicate_resume, 2),
                            cands,
                            annotator_name="brat")
    return load_gold_labels(session,
                            annotator_name="brat",
                            cids_query=cids_query,
                            split=2)
def test_LF(session, lf, split, annotator_name):
    """
    Gets the accuracy of a single LF on a split of the candidates, w.r.t.
    annotator labels, and also returns the error buckets of the candidates.
    """
    cands = (session.query(Candidate)
             .filter(Candidate.split == split)
             .all())
    labels = load_gold_labels(session,
                              annotator_name=annotator_name,
                              split=split)
    scorer = MentionScorer(cands, labels)
    # LF votes are in {-1, 0, 1}; shift/scale them into marginals.
    test_marginals = np.array([(lf(c) + 1) / 2.0 for c in cands])
    return scorer.score(test_marginals,
                        set_unlabeled_as_neg=False,
                        set_at_thresh_as_neg=False)
def test_LF(session, lf_arr, label_val, split=1, annotator_name='gold'):
    """Score an array of LFs (run as one) and bucket candidates for error
    analysis.

    :param session: Snorkel database session
    :param lf_arr: labeling functions combined via run_lf_arr_as_one
    :param label_val: gold label value treated as the positive class
    :param split: candidate split to evaluate (default dev split, 1)
    :param annotator_name: gold-label annotator to score against
    :return: tuple of candidate sets (tp, fp, tn, fn)
    """
    test_candidates = session.query(Candidate).filter(
        Candidate.split == split).all()
    # BUG FIX: honor the annotator_name argument — it was previously
    # accepted but ignored in favor of a hard-coded 'gold'.
    test_labels = load_gold_labels(session, annotator_name=annotator_name,
                                   split=split)
    test_marginals = np.array([(run_lf_arr_as_one(lf_arr, c))
                               for c in test_candidates])
    test_label_array = []
    tp = set()
    fp = set()
    tn = set()
    fn = set()
    b = 0.5  # decision threshold on the marginal
    for i, candidate in enumerate(test_candidates):
        try:
            test_label_index = test_labels.get_row_index(candidate)
            test_label = test_labels[test_label_index, 0]
        except AttributeError:
            # test_labels is a plain array-like rather than a sparse
            # label matrix with get_row_index.
            test_label = test_labels[i]

        # Bucket the candidates for error analysis; label 0 = unlabeled.
        test_label_array.append(test_label)
        if test_label != 0:
            if test_marginals[i] > b:
                if test_label == label_val:
                    tp.add(candidate)
                else:
                    fp.add(candidate)
            elif test_marginals[i] < b:
                if test_label != label_val:
                    tn.add(candidate)
                else:
                    fn.add(candidate)
    print_scores(len(tp), len(fp), len(tn), len(fn),
                 title="Scores (Un-adjusted)")
    return tp, fp, tn, fn
# Fetch dev/test candidates ordered by id so downstream results are stable.
dev_cands = session.query(VirusHost).filter(VirusHost.split == 1).order_by(
    VirusHost.id).all()
test_cands = session.query(VirusHost).filter(
    VirusHost.split == 2).order_by(VirusHost.id).all()

# Apply labeler to all sets
L_train = labeler.apply(split=0)
L_dev = labeler.apply(split=1)
L_test = labeler.apply(split=2)

# Load gold labels (load_external_labels is run first for its side effect
# of importing the annotations into the DB; its return value is unused).
missed = load_external_labels(session, VirusHost, annotator_name='gold', split=1)
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
missed = load_external_labels(session, VirusHost, annotator_name='gold', split=2)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

# Generative model
ds = DependencySelector()
deps = ds.select(L_train, threshold=0.1)
gen_model = GenerativeModel()
# NOTE(review): this call appears truncated in the visible source — the
# argument list is cut off after step_size; confirm against the full file.
gen_model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0],
# Dev-set sentence labels curated by hand in Excel; keep only rows with a
# curated label and sort by candidate id for a stable ordering.
dev_sentences_df = pd.read_excel("data/sentence-labels-dev.xlsx")
dev_sentences_df = dev_sentences_df[dev_sentences_df.curated_dsh.notnull()]
dev_sentences_df = dev_sentences_df.sort_values("candidate_id")
dev_sentences_df.head(2)

# In[ ]:

# Candidate ids in the train split (split=0) that have a gold label row.
sql = ''' SELECT candidate_id FROM gold_label INNER JOIN Candidate ON Candidate.id=gold_label.candidate_id WHERE Candidate.split=0; '''
cids = session.query(Candidate.id).filter(
    Candidate.id.in_([x[0] for x in session.execute(sql)]))
L_train_labeled_gold = load_gold_labels(session, annotator_name='danich1',
                                        cids_query=cids)

# In[ ]:

train_candidate_ids = train_sentences_df.candidate_id.astype(int).tolist()
# The notnull() filter here repeats the one applied above; harmless.
dev_candidate_ids = (dev_sentences_df[
    dev_sentences_df.curated_dsh.notnull()].candidate_id.astype(int).tolist())

# In[ ]:

train_cands = (session.query(Candidate).filter(
    Candidate.id.in_(train_candidate_ids)).all())
train_label_cands = (session.query(Candidate).filter(
    Candidate.id.in_(cids)).all())
def __init__(
    self,
    conn_str,
    candidate_def,
    split=0,
    use_lfs=False,
    word_dict=None,
    pretrained_word_dict=None,
    max_seq_len=125,
):
    """
    Assumes a Snorkel database that is fully instantiated with:

    - Candidates generated and assigned to train/dev/test splits
    - Labeling functions are applied and probabilistic labels are
      generated for train split(s)
    - Gold labels are stored in the database under 'annotator_name = gold'

    :param conn_str: path to a SQLite file, or (if no such file exists)
        a PostgreSQL database name
    :param candidate_def: arguments for snorkel's candidate_subclass,
        e.g. ('Spouse', ['person1', 'person2'])
    :param split: candidate split to load (0=train, 1=dev, 2=test)
    :param use_lfs: if True, use probabilistic labels (marginals) as Y;
        otherwise use gold labels remapped to 1-based classes
    :param word_dict: optional pre-built vocabulary; built from the data
        when not given
    :param pretrained_word_dict: optional pretrained-embedding vocabulary
        merged into the word dict
    :param max_seq_len: maximum token sequence length
    """
    # SNORKELDB must be set before snorkel is imported (see below).
    if os.path.exists(conn_str):
        os.environ["SNORKELDB"] = "sqlite:///{}".format(conn_str)
    else:
        os.environ["SNORKELDB"] = "postgresql:///{}".format(conn_str)
    print("Connected to {}".format(os.environ["SNORKELDB"]))

    # defer imports until SNORKELDB is defined to prevent initalizing an
    # empty sqlite instance
    from snorkel import SnorkelSession
    from snorkel.models import candidate_subclass, Candidate
    from snorkel.annotations import load_gold_labels, load_marginals

    # sqlite3 doesn't support multiple connections, so use a
    # singleton-style connection object shared by all instances
    if not SnorkelDataset.session:
        SnorkelDataset.session = SnorkelSession()
    self.session = SnorkelDataset.session

    self.class_type = candidate_subclass(*candidate_def)
    self.cardinality = len(candidate_def[-1])
    self.split = split
    self.max_seq_len = max_seq_len

    # create markup sequences and labels: one open/close marker pair
    # ("~~[[i", "i]]~~") per entity class i
    markers = [
        m.format(i) for i in range(self.cardinality)
        for m in ["~~[[{}", "{}]]~~"]
    ]
    self.X = (self.session.query(Candidate).filter(
        Candidate.split == split).order_by(Candidate.id).all())
    self.X = [self._mark_entities(x, markers) for x in self.X]

    # initalize vocabulary
    self.word_dict = (self._build_vocab(self.X, markers)
                      if not word_dict else word_dict)
    if pretrained_word_dict:
        # include pretrained embedding terms
        self._include_pretrained_vocab(
            pretrained_word_dict, self.session.query(Candidate).all())

    # initalize labels (from either LFs or gold labels)
    if use_lfs:
        self.Y = torch.tensor(
            load_marginals(self.session, split=split).todense())
    else:
        self.Y = load_gold_labels(self.session, annotator_name="gold",
                                  split=split)
        self.Y = [int(y) for y in np.nditer(self.Y.todense())]
        # remap class labels to not include 0 (reserved by MeTaL)
        labels = {
            y: i + 1
            for i, y in enumerate(sorted(np.unique(self.Y), reverse=1))
        }
        self.Y = torch.tensor([labels[y] for y in self.Y])