def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Import gold labels from the TSV at ``FPATH`` as ``StableLabel`` rows.

    Each TSV row supplies ``person1``, ``person2`` and ``label`` columns.
    Because the relation is symmetric, a label is stored for both orderings
    of the pair. An ordering that already has a label from
    ``annotator_name`` is skipped, so re-running does not add it again.

    After committing, annotator labels are reloaded for splits 1 and 2.
    """
    labels_frame = pd.read_csv(FPATH, sep="\t")

    for _, record in labels_frame.iterrows():
        first, second = record['person1'], record['person2']
        # Forward ordering first, then the mirrored one.
        for pair in ([first, second], [second, first]):
            stable_id = "~~".join(pair)
            already_stored = (
                session.query(StableLabel)
                .filter(StableLabel.context_stable_ids == stable_id)
                .filter(StableLabel.annotator_name == annotator_name)
                .count()
            )
            if already_stored != 0:
                continue
            session.add(StableLabel(
                context_stable_ids=stable_id,
                annotator_name=annotator_name,
                value=record['label']))

    session.commit()

    # Map the stable labels back onto candidates of both labelled splits.
    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, split, annotator_name='gold'):
    """Import gold labels from the TSV at ``FPATH`` and reload them for ``split``.

    Each TSV row supplies ``virus``, ``host`` and ``label`` columns. The
    stable id is ``"<virus>~~<host>"``. Rows whose stable id already has a
    label from ``annotator_name`` are skipped, so the function can be
    re-run without adding them again.

    :param session: database session used for queries and inserts
    :param candidate_class: candidate class forwarded to
        ``reload_annotator_labels``
    :param split: split whose annotator labels are reloaded after the import
    :param annotator_name: annotator the labels are attributed to
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")

    # NOTE: an earlier version also fetched every candidate of ``split``
    # here, but never used the result; that query has been dropped.
    for _, row in gold_labels.iterrows():
        context_stable_ids = "~~".join([row['virus'], row['host']])
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)

        # Only add the label when it is not stored yet.
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

    session.commit()

    reload_annotator_labels(session,
                            candidate_class,
                            annotator_name,
                            split=split,
                            filter_label_split=False)
def reload_external_labels(session: SnorkelSession,
                           input_file: Union[str, Path],
                           annotator_name: str = "gold"):
    """Import labels from a JSON file and reload them for splits 1 and 2.

    ``input_file`` is parsed with ``ujson`` and iterated; every entry is
    read through its ``person``, ``organization`` and ``value`` keys. The
    stable id is ``"<person>~~<organization>"``. Entries that already have
    a label from ``annotator_name`` are skipped.
    """
    Education = get_candidate_class()

    with open(str(input_file), "r") as handle:
        entries = ujson.load(handle)

    for entry in entries:
        stable_id = "~~".join((entry['person'], entry['organization']))
        stored = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_id)
            .filter(StableLabel.annotator_name == annotator_name)
        )
        # Skip labels that were imported by a previous run.
        if stored.count() != 0:
            continue
        session.add(
            StableLabel(context_stable_ids=stable_id,
                        annotator_name=annotator_name,
                        value=entry['value']))

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session,
                                Education,
                                annotator_name,
                                split=split_id,
                                filter_label_split=False)
def load_external_labels(session, candidate_class, annotator_name='gold', file_path=None, isPrint=True):
    """Import gold labels from a TSV file keyed by its ``segment`` column.

    Inherited from tutorial/intro/util.py.

    The value of each row's ``segment`` column is used directly as the
    stable id, and ``label`` as the label value. Rows that already have a
    label from ``annotator_name`` are skipped. After committing, annotator
    labels are reloaded for splits 1 and 2.

    :param file_path: path handed to ``pd.read_csv`` (tab separated)
    :param isPrint: when truthy, print every row's stable id while loading
    """
    frame = pd.read_csv(file_path, sep="\t")

    for _, record in frame.iterrows():
        stable_id = record['segment']
        if isPrint:
            print(stable_id)

        matches = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_id)
            .filter(StableLabel.annotator_name == annotator_name)
        )
        # Skip rows stored by an earlier run.
        if matches.count() != 0:
            continue
        session.add(StableLabel(
            context_stable_ids=stable_id,
            annotator_name=annotator_name,
            value=record['label']))

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Import gold labels from the UTF-8 TSV at ``FPATH`` keyed by ``cell``.

    The stable id is built from the single ``cell`` column and the label
    value from ``label``. Rows that already have a label from
    ``annotator_name`` are skipped. After committing, annotator labels are
    reloaded for splits 1 and 2.
    """
    frame = pd.read_csv(FPATH, delimiter='\t', encoding='utf-8')

    for index, record in frame.iterrows():
        # Joining a one-element sequence yields the element itself.
        stable_id = "~~".join((record['cell'],))
        matches = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_id)
            .filter(StableLabel.annotator_name == annotator_name)
        )
        if matches.count() == 0:
            session.add(
                StableLabel(context_stable_ids=stable_id,
                            annotator_name=annotator_name,
                            value=record['label']))
            # NOTE(review): the original nesting of this print was lost in
            # the collapsed source; it is assumed to report newly added rows
            # only - confirm.
            print(index)

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session,
                                candidate_class,
                                annotator_name,
                                split=split_id,
                                filter_label_split=False)
def load_external_trend_labels(session, candidate_class, annotator_name='gold'):
    """Import gold trend labels from the TSV at ``FPATH``, reporting each row.

    The ``tr`` column is used directly as the stable id and ``label`` as
    the value. For every row a short report is printed saying whether the
    label was added or was already stored for ``annotator_name``. After
    committing, annotator labels are reloaded for splits 1 and 2.
    """
    frame = pd.read_csv(FPATH, sep="\t")

    for _, record in frame.iterrows():
        stable_id = record['tr']
        matches = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_id)
            .filter(StableLabel.annotator_name == annotator_name)
        )

        if matches.count() != 0:
            # Already imported by an earlier run: report it and move on.
            print('----------------------------')
            print('stable label is found for this one!: ')
            print(record)
            continue

        print('********************************')
        print('adding gold labels for this row')
        print(record)
        session.add(StableLabel(
            context_stable_ids=stable_id,
            annotator_name=annotator_name,
            value=record['label']))

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Import tweet labels from the TSV at ``FPATH`` as ``StableLabel`` rows.

    One ``StableLabel`` is added per TSV row, built from the ``content``
    column (passed as ``tweet``) and the ``label`` column. No existence
    check is made, so every call adds one row per TSV line. After
    committing, annotator labels are reloaded for splits 1 and 2.
    """
    frame = pd.read_csv(FPATH, sep="\t")

    for _, record in frame.iterrows():
        tweet_label = StableLabel(tweet=record['content'],
                                  annotator_name=annotator_name,
                                  value=record['label'])
        session.add(tweet_label)

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, tsv_path, annotator_name='gold', symmetric=False, reload = False, filter_label_split = False, debug=True):
    """Import Chemical/Gene gold labels from a TSV file.

    Adapted from snorkel/tutorials/workshop/lib/load_external_annotations.py

    Each row supplies ``Chemical``, ``Gene`` and ``label`` columns and is
    stored under the stable id ``"<Chemical>~~<Gene>"``. Stable ids that
    already have a label from ``annotator_name`` are skipped.

    :param tsv_path: path of the tab-separated label file
    :param symmetric: when truthy, make a second pass that also stores
        every row under the reversed id ``"<Gene>~~<Chemical>"``
    :param reload: when truthy, reload annotator labels for splits 0, 1
        and 2 after committing
    :param filter_label_split: forwarded to ``reload_annotator_labels``
    :param debug: forwarded to ``reload_annotator_labels``
    """
    frame = pd.read_csv(tsv_path, sep="\t")

    # One full pass per column ordering; the reversed pass is optional.
    column_orders = [('Chemical', 'Gene')]
    if symmetric:
        column_orders.append(('Gene', 'Chemical'))

    for left_col, right_col in column_orders:
        for _, record in frame.iterrows():
            stable_id = "~~".join([record[left_col], record[right_col]])
            matches = (
                session.query(StableLabel)
                .filter(StableLabel.context_stable_ids == stable_id)
                .filter(StableLabel.annotator_name == annotator_name)
            )
            if matches.count() != 0:
                continue
            session.add(StableLabel(
                context_stable_ids=stable_id,
                annotator_name=annotator_name,
                value=record['label']))

    session.commit()

    if not reload:
        return
    for split_id in (0, 1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id,
                                filter_label_split=filter_label_split,
                                debug=debug)
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Import gold labels from the TSV at ``FPATH`` keyed by ``Features``.

    The ``Features`` column is used directly as the stable id and ``label``
    as the label value. Rows that already have a label from
    ``annotator_name`` are skipped, so re-running does not add them again.
    After committing, annotator labels are reloaded for splits 1 and 2.

    An unused ``counter`` local and two blocks of disabled code that were
    kept as bare string literals have been removed. Neither affected
    behaviour.
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")

    for _, row in gold_labels.iterrows():
        context_stable_ids = row['Features']
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)

        # Only add the label when it is not stored yet.
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

    session.commit()

    reload_annotator_labels(session,
                            candidate_class,
                            annotator_name,
                            split=1,
                            filter_label_split=False)
    reload_annotator_labels(session,
                            candidate_class,
                            annotator_name,
                            split=2,
                            filter_label_split=False)
def handle_label_event(self, _, content, buffers):
    """
    Handles a label event from the widget by persisting the change.

    ``content`` is read through its ``event``, ``cid`` and ``value`` keys:

    * ``set_label``: ``value`` must be ``True`` or ``False`` (stored as 1
      or -1). An existing label for ``cid`` is updated in both the
      annotator label and its stable label; otherwise both are created.
    * ``delete_label``: the annotator label and stable label stored for
      ``cid`` are deleted. If none is stored the event is a no-op, so
      ``None`` is never handed to ``session.delete``.

    Any other event is ignored.

    :raises ValueError: if a ``set_label`` value is neither True nor False
    """
    event = content.get('event', '')
    cid = content.get('cid', None)

    if event == 'set_label':
        value = content.get('value', None)
        if value is True:
            value = 1
        elif value is False:
            value = -1
        else:
            raise ValueError('Unexpected label returned from widget: ' + str(value) + '. Expected values are True and False.')

        existing = self.annotations[cid]

        # If label already exists, just update value (in both AnnotatorLabel and StableLabel)
        if existing is not None:
            if existing.value != value:
                existing.value = value
                self.annotations_stable[cid].value = value
                self.session.commit()

        # Otherwise, create a AnnotatorLabel *and a StableLabel*
        else:
            candidate = self.candidates[cid]

            # Create AnnotatorLabel
            self.annotations[cid] = GoldLabel(key=self.annotator, candidate=candidate, value=value)
            self.session.add(self.annotations[cid])

            # Create StableLabel
            context_stable_ids = '~~'.join([c.stable_id for c in candidate.get_contexts()])
            self.annotations_stable[cid] = StableLabel(context_stable_ids=context_stable_ids,
                                                       annotator_name=self.annotator.name,
                                                       value=value,
                                                       split=candidate.split)
            self.session.add(self.annotations_stable[cid])
            self.session.commit()

    elif event == 'delete_label':
        removed_any = False
        # Same order as before: annotator label first, then stable label.
        for store in (self.annotations, self.annotations_stable):
            stored = store[cid]
            if stored is None:
                continue
            self.session.delete(stored)
            store[cid] = None
            removed_any = True
        if removed_any:
            self.session.commit()
def load_external_labels(session, candidate_class, split, annotator='gold', label_fname='data/cdr_relations_gold.pkl', id_fname='data/doc_ids.pkl'):
    """Derive labels for the candidates of ``split`` from document-level relations.

    ``label_fname`` is loaded into a mapping that is looked up by the name
    of each candidate's grandparent context. A candidate is labelled 1
    when ``c.get_cids()`` is contained in that entry (an empty set when the
    name is missing) and -1 otherwise. Stable ids that already have a label
    from ``annotator`` are skipped. After committing, annotator labels are
    reloaded for ``split``.

    ``id_fname`` is accepted but not used by this function.
    """
    # NOTE(review): ``load`` presumably unpickles the file - only pass
    # trusted files; confirm against the module imports.
    with open(label_fname, 'rb') as handle:
        relations = load(handle)

    split_candidates = (
        session.query(candidate_class)
        .filter(candidate_class.split == split)
        .all()
    )

    for cand in split_candidates:
        doc_name = cand.get_parent().get_parent().name
        known_relations = relations.get(doc_name, set())
        label = 1 if cand.get_cids() in known_relations else -1

        stable_id = '~~'.join(ctx.get_stable_id() for ctx in cand)
        matches = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_id)
            .filter(StableLabel.annotator_name == annotator)
        )
        # Skip candidates labelled by an earlier run.
        if matches.count() != 0:
            continue
        session.add(
            StableLabel(context_stable_ids=stable_id,
                        annotator_name=annotator,
                        value=label))

    session.commit()

    reload_annotator_labels(session,
                            candidate_class,
                            annotator,
                            split=split,
                            filter_label_split=False)
def load_external_labels(session, candidate_class, column1_title, column2_title, filepath, candidates, annotator_name='gold'):
    """Import gold labels from a TSV file and default other candidates to -1.

    First, every TSV row is stored under the stable id
    ``"<row[column1_title]>~~<row[column2_title]>"`` with the row's
    ``label`` value. Then each candidate in ``candidates`` that has no
    label yet is stored under ``"<c[0] stable id>~~<c[1] stable id>"`` with
    value -1. Existing labels from ``annotator_name`` are never
    overwritten. After committing, annotator labels are reloaded for
    splits 1 and 2.

    :param column1_title: TSV column providing the first half of the id
    :param column2_title: TSV column providing the second half of the id
    :param filepath: path of the tab-separated label file
    :param candidates: iterable of candidates to default to -1
    """
    gold_labels = pd.read_csv(filepath, sep="\t")

    for _, row in gold_labels.iterrows():
        context_stable_ids = "~~".join(
            [row[column1_title], row[column2_title]])
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))
        # NOTE(review): the original repeated this exact lookup under a
        # "symmetric relation, load both directions" comment, with the
        # columns in the SAME order, so the reversed id was never stored.
        # The redundant repeat was removed; add a reversed pass here if
        # both directions are actually wanted.

    for c in candidates:
        # Debug output; written as function calls so the module also
        # parses on Python 3.
        print(c.biomarker.get_stable_id())
        print(c)
        candidate_label = c[0].get_stable_id() + "~~" + c[1].get_stable_id()
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == candidate_label)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        # Candidates without a gold label default to negative.
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=candidate_label,
                            annotator_name=annotator_name,
                            value=-1))

    session.commit()

    reload_annotator_labels(session,
                            candidate_class,
                            annotator_name,
                            split=1,
                            filter_label_split=False)
    reload_annotator_labels(session,
                            candidate_class,
                            annotator_name,
                            split=2,
                            filter_label_split=False)
def __init__(self, candidates, session, gold=(), n_per_page=3, height=225, annotator_name=None):
    """
    Initializes a Viewer.

    The Viewer uses the keyword argument annotator_name to define a GoldLabelKey with that name,
    creating and committing the key if it does not exist yet.

    :param candidates: A Python container of Candidates (e.g., not a CandidateSet, but candidate_set.candidates)
    :param session: The SnorkelSession for the database backend
    :param gold: Optional, Python container of Candidates that are known to have positive labels
    :param n_per_page: Optional, number of Contexts to display per page
    :param height: Optional, the height in pixels of the Viewer
    :param annotator_name: Name of the human using the Viewer, for saving their work. Defaults to system username.
    :raises ValueError: if an existing annotation has a value other than 1 or -1
    """
    super(Viewer, self).__init__()
    self.session = session

    # By default, use the username as annotator name
    name = annotator_name if annotator_name is not None else getpass.getuser()

    # Sets up the AnnotationKey to use
    self.annotator = self.session.query(GoldLabelKey).filter(GoldLabelKey.name == name).first()
    if self.annotator is None:
        self.annotator = GoldLabelKey(name=name)
        session.add(self.annotator)
        session.commit()

    # Viewer display configs
    self.n_per_page = n_per_page
    self.height = height

    # Note that the candidates are not necessarily committed to the DB, so they *may not have* non-null ids
    # Hence, we index by their position in this list
    # We get the sorted candidates and all contexts required, either from unary or binary candidates
    self.gold = list(gold)
    self.candidates = sorted(list(candidates), key=lambda c: c[0].char_start)
    self.contexts = list(set(c[0].get_parent() for c in self.candidates + self.gold))

    # If committed, sort contexts by id
    try:
        self.contexts = sorted(self.contexts, key=lambda c: c.id)
    except Exception:
        # Best effort: keep the unsorted order when the ids cannot be
        # compared. Unlike a bare ``except``, this no longer swallows
        # KeyboardInterrupt / SystemExit.
        pass

    # Loads existing annotations
    self.annotations = [None] * len(self.candidates)
    self.annotations_stable = [None] * len(self.candidates)
    init_labels_serialized = []
    for i, candidate in enumerate(self.candidates):

        # First look for the annotation in the primary annotations table
        existing_annotation = self.session.query(GoldLabel) \
            .filter(GoldLabel.key == self.annotator) \
            .filter(GoldLabel.candidate == candidate) \
            .first()
        if existing_annotation is None:
            continue

        self.annotations[i] = existing_annotation
        if existing_annotation.value == 1:
            value_string = 'true'
        elif existing_annotation.value == -1:
            value_string = 'false'
        else:
            raise ValueError(str(existing_annotation) +
                             ' has value not in {1, -1}, which Viewer does not support.')
        init_labels_serialized.append(str(i) + '~~' + value_string)

        # If the annotator label is in the main table, also get its stable version
        context_stable_ids = '~~'.join([c.stable_id for c in candidate.get_contexts()])
        existing_annotation_stable = self.session.query(StableLabel) \
            .filter(StableLabel.context_stable_ids == context_stable_ids) \
            .filter(StableLabel.annotator_name == name).one_or_none()

        # If stable version is not available, create it here
        # NOTE: This is for versioning issues, should be removed?
        if existing_annotation_stable is None:
            existing_annotation_stable = StableLabel(context_stable_ids=context_stable_ids,
                                                     annotator_name=self.annotator.name,
                                                     split=candidate.split,
                                                     value=existing_annotation.value)
            self.session.add(existing_annotation_stable)
            self.session.commit()

        self.annotations_stable[i] = existing_annotation_stable

    self._labels_serialized = ','.join(init_labels_serialized)

    # Configures message handler
    self.on_msg(self.handle_label_event)

    # display js, construct html and pass on to widget model
    self.render()
def load_external_labels(session, candidate_class, tsv_path, annotator_name='gold', symmetric=False, reload=False, filter_label_split=False, debug=True):
    """Import Chemical/Gene gold labels from a TSV file.

    Adapted from snorkel/tutorials/workshop/lib/load_external_annotations.py

    Each row supplies ``Chemical``, ``Gene`` and ``label`` columns and is
    stored under the stable id ``"<Chemical>~~<Gene>"``. Stable ids that
    already have a label from ``annotator_name`` are skipped.

    :param tsv_path: path of the tab-separated label file
    :param symmetric: when truthy, make a second pass that also stores
        every row under the reversed id ``"<Gene>~~<Chemical>"``
    :param reload: Boolean:: Whether to reload annotations (perform mapping
        for splits 0,1,2)
    :param filter_label_split: forwarded to ``reload_annotator_labels``.
        The body already referenced this name without it being defined, so
        ``reload=True`` raised NameError; it is now a keyword parameter.
    :param debug: forwarded to ``reload_annotator_labels``; added for the
        same reason as ``filter_label_split``.
    """
    gold_labels = pd.read_csv(tsv_path, sep="\t")

    for _, row in gold_labels.iterrows():
        # We check if the label already exists, in case this cell was already executed
        context_stable_ids = "~~".join([row['Chemical'], row['Gene']])
        query = session.query(StableLabel).filter(StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(StableLabel(
                context_stable_ids=context_stable_ids,
                annotator_name=annotator_name,
                value=row['label']))

    # If it's a symmetric relation, load both directions...
    if symmetric:
        for _, row in gold_labels.iterrows():
            context_stable_ids = "~~".join([row['Gene'], row['Chemical']])
            query = session.query(StableLabel).filter(StableLabel.context_stable_ids == context_stable_ids)
            query = query.filter(StableLabel.annotator_name == annotator_name)
            if query.count() == 0:
                session.add(StableLabel(
                    context_stable_ids=context_stable_ids,
                    annotator_name=annotator_name,
                    value=row['label']))

    # Commit session
    session.commit()

    # Reload annotator labels
    if reload:
        for split in (0, 1, 2):
            reload_annotator_labels(session, candidate_class, annotator_name,
                                    split=split,
                                    filter_label_split=filter_label_split,
                                    debug=debug)


#######################################################
### Load from pickle dictionary (on document level) ###
#######################################################

### From snorkel/tutorials/cdr/load_external_annotations.py (v0.6.2)

# from six.moves.cPickle import load
# from snorkel.db_helpers import reload_annotator_labels
# from snorkel.models import StableLabel
# import bz2

# def load_external_labels(session, candidate_class, split, annotator='gold',
#     label_fname='data/cdr_relations_gold.pkl', id_fname='data/doc_ids.pkl'):
#     # Load document-level relation annotations
#     if label_fname.endswith('.bz2'):
#         with bz2.BZ2File(label_fname, 'rb') as f:
#             relations = load(f)
#     else:
#         with open(label_fname, 'rb') as f:
#             relations = load(f)
#     # Get split candidates
#     candidates = session.query(candidate_class).filter(
#         candidate_class.split == split
#     ).all()
#     for c in candidates:
#         # Get the label by mapping document annotations to mentions
#         doc_relations = relations.get(c.get_parent().get_parent().name, set())
#         label = 2 * int(c.get_cids() in doc_relations) - 1
#         # Get stable ids and check to see if label already exits
#         context_stable_ids = '~~'.join(x.get_stable_id() for x in c)
#         query = session.query(StableLabel).filter(
#             StableLabel.context_stable_ids == context_stable_ids
#         )
#         query = query.filter(StableLabel.annotator_name == annotator)
#         # If does not already exist, add label
#         if query.count() == 0:
#             session.add(StableLabel(
#                 context_stable_ids=context_stable_ids,
#                 annotator_name=annotator,
#                 value=label
#             ))
#     # Commit session
#     session.commit()
#     # Reload annotator labels
#     reload_annotator_labels(session, candidate_class, annotator, split=split, filter_label_split=False)