def load_external_labels(session, candidate_class, split, annotator_name='gold'):
    """Load gold labels from the TSV at ``FPATH`` into ``StableLabel`` rows.

    Every TSV row yields one label whose key is the ``virus`` and ``host``
    columns joined with ``~~``.  A label is only added when no ``StableLabel``
    with the same key and annotator name is found, so running the function
    again does not add the same label twice.

    Args:
        session: Database session used for the lookups, inserts and commit.
        candidate_class: Candidate class forwarded to
            ``reload_annotator_labels``.
        split: Split forwarded to ``reload_annotator_labels``.
        annotator_name: Annotator name stored on every new label.
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")

    for _, row in gold_labels.iterrows():
        context_stable_ids = "~~".join([row['virus'], row['host']])

        # Skip labels that were already stored by an earlier run.
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

    session.commit()

    reload_annotator_labels(session, candidate_class, annotator_name,
                            split=split, filter_label_split=False)
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Store the gold labels from the TSV at ``FPATH`` in both directions.

    Each row's ``person1``/``person2`` pair is written as two ``StableLabel``
    keys (``person1~~person2`` and ``person2~~person1``) carrying the row's
    ``label``.  A key is skipped when a label with the same key and annotator
    name is already present.  After committing, ``reload_annotator_labels``
    is called for splits 1 and 2.
    """
    frame = pd.read_csv(FPATH, sep="\t")

    for _, record in frame.iterrows():
        first, second = record['person1'], record['person2']

        # The relation is symmetric, so the same label is stored for both
        # orderings of the pair.
        for ordered_pair in ([first, second], [second, first]):
            stable_key = "~~".join(ordered_pair)
            matches = session.query(StableLabel).filter(
                StableLabel.context_stable_ids == stable_key)
            matches = matches.filter(
                StableLabel.annotator_name == annotator_name)
            if matches.count() > 0:
                # Already stored, e.g. by an earlier run.
                continue
            session.add(StableLabel(
                context_stable_ids=stable_key,
                annotator_name=annotator_name,
                value=record['label']))

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, split, annotator='gold',
                         label_fname='data/cdr_relations_gold.pkl',
                         id_fname='data/doc_ids.pkl'):
    """Label every candidate of ``split`` from document-level annotations.

    The annotations are read from ``label_fname`` with ``load`` and looked up
    by the name of the candidate's grandparent context.  A candidate gets the
    value 1 when ``c.get_cids()`` is contained in the relations of its
    document and -1 otherwise.  Labels that already exist for the same stable
    ids and annotator are left alone.

    Args:
        session: Database session used for queries, inserts and the commit.
        candidate_class: Candidate class to query and to reload labels for.
        split: Split whose candidates are labeled and reloaded.
        annotator: Annotator name stored on every new label.
        label_fname: Path of the file holding the relation annotations.
        id_fname: Not used by this function.
    """
    with open(label_fname, 'rb') as handle:
        relations = load(handle)

    split_candidates = session.query(candidate_class).filter(
        candidate_class.split == split).all()

    for cand in split_candidates:
        # Map the document annotations onto this mention pair.
        doc_name = cand.get_parent().get_parent().name
        doc_relations = relations.get(doc_name, set())
        label = 1 if cand.get_cids() in doc_relations else -1

        stable_key = '~~'.join([ctx.get_stable_id() for ctx in cand])
        existing = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == stable_key)
        existing = existing.filter(StableLabel.annotator_name == annotator)
        if existing.count() > 0:
            continue

        session.add(StableLabel(context_stable_ids=stable_key,
                                annotator_name=annotator,
                                value=label))

    session.commit()

    reload_annotator_labels(session, candidate_class, annotator,
                            split=split, filter_label_split=False)
def reload_external_labels(session: SnorkelSession, input_file: Union[str, Path], annotator_name: str = "gold"):
    """Store the labels listed in ``input_file`` and reload splits 1 and 2.

    The file is parsed with ``ujson.load``; each entry is read for its
    ``person``, ``organization`` and ``value`` keys.  The first two are joined
    with ``~~`` to form the stable id key.  Entries whose key already has a
    label for ``annotator_name`` are skipped.
    """
    candidate_class = get_candidate_class()

    with open(str(input_file), "r") as handle:
        entries = ujson.load(handle)

    for entry in entries:
        stable_key = "~~".join((entry['person'], entry['organization']))
        existing = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_key)
            .filter(StableLabel.annotator_name == annotator_name)
        )
        if existing.count() > 0:
            # Label was stored by an earlier run; nothing to do.
            continue
        session.add(StableLabel(context_stable_ids=stable_key,
                                annotator_name=annotator_name,
                                value=entry['value']))

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, annotator_name='gold', file_path=None, isPrint=True):
    """Store the labels of the TSV at ``file_path`` keyed by ``segment``.

    Inherited from tutorial/intro/util.py.  Each row's ``segment`` value is
    used directly as the stable id key and its ``label`` as the value.  Keys
    that already have a label for ``annotator_name`` are skipped.  When
    ``isPrint`` is true every key is printed as it is processed.  After the
    commit, ``reload_annotator_labels`` is called for splits 1 and 2.
    """
    frame = pd.read_csv(file_path, sep="\t")

    for _, record in frame.iterrows():
        stable_key = record['segment']
        if isPrint:
            print(stable_key)

        existing = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == stable_key)
        existing = existing.filter(
            StableLabel.annotator_name == annotator_name)
        if existing.count() > 0:
            continue

        session.add(StableLabel(
            context_stable_ids=stable_key,
            annotator_name=annotator_name,
            value=record['label']))

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Store the labels of the tab-separated file at ``FPATH`` keyed by ``cell``.

    The file is read as UTF-8.  Each row's ``cell`` value becomes the stable
    id key and its ``label`` the value; keys that already have a label for
    ``annotator_name`` are skipped.  After the commit,
    ``reload_annotator_labels`` is called for splits 1 and 2.
    """
    frame = pd.read_csv(FPATH, encoding='utf-8', delimiter='\t')

    for index, record in frame.iterrows():
        # Single-element join: yields the cell value itself.
        stable_key = "~~".join((record['cell'],))

        existing = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == stable_key)
        existing = existing.filter(
            StableLabel.annotator_name == annotator_name)
        if existing.count() == 0:
            session.add(
                StableLabel(context_stable_ids=stable_key,
                            annotator_name=annotator_name,
                            value=record['label']))
            # NOTE(review): reports the index of each newly added row; the
            # flattened source does not show whether this print was meant to
            # run for every row instead -- confirm.
            print(index)

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_trend_labels(session, candidate_class, annotator_name='gold'):
    """Store the labels of the TSV at ``FPATH`` keyed by the ``tr`` column.

    For every row a message is printed saying whether a label is being added
    or one was already found for the key and ``annotator_name``.  After the
    commit, ``reload_annotator_labels`` is called for splits 1 and 2.
    """
    frame = pd.read_csv(FPATH, sep="\t")

    for _, record in frame.iterrows():
        stable_key = record['tr']

        existing = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == stable_key)
        existing = existing.filter(
            StableLabel.annotator_name == annotator_name)

        if existing.count() != 0:
            # A label for this key is already stored; only report it.
            print('----------------------------')
            print('stable label is found for this one!: ')
            print(record)
            continue

        print('********************************')
        print('adding gold labels for this row')
        print(record)
        session.add(StableLabel(
            context_stable_ids=stable_key,
            annotator_name=annotator_name,
            value=record['label']))

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Add one ``StableLabel`` per row of the TSV at ``FPATH``.

    Each label is built from the row's ``content`` (passed as ``tweet``) and
    ``label`` columns together with ``annotator_name``.  No check for already
    stored labels is made.  After the commit, ``reload_annotator_labels`` is
    called for splits 1 and 2.
    """
    frame = pd.read_csv(FPATH, sep="\t")

    for _, record in frame.iterrows():
        new_label = StableLabel(tweet=record['content'],
                                annotator_name=annotator_name,
                                value=record['label'])
        session.add(new_label)

    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Store the labels of the TSV at ``FPATH`` keyed by the ``Features`` column.

    Each row's ``Features`` value is used directly as the stable id key and
    its ``label`` as the value.  A label is only added when none exists yet
    for the same key and ``annotator_name``, so running the function again
    does not add the same label twice.  After the commit,
    ``reload_annotator_labels`` is called for splits 1 and 2.

    Args:
        session: Database session used for the lookups, inserts and commit.
        candidate_class: Candidate class forwarded to
            ``reload_annotator_labels``.
        annotator_name: Annotator name stored on every new label.
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")

    for _, row in gold_labels.iterrows():
        context_stable_ids = row['Features']

        # Skip labels that were already stored by an earlier run.
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

    session.commit()

    reload_annotator_labels(session, candidate_class, annotator_name,
                            split=1, filter_label_split=False)
    reload_annotator_labels(session, candidate_class, annotator_name,
                            split=2, filter_label_split=False)
def load_external_labels(session, candidate_class, split, annotator='gold',
                         label_fname='data/cdr_relations_gold.pkl',
                         id_fname='data/doc_ids.pkl'):
    """Derive labels for the candidates of ``split`` from document relations.

    ``label_fname`` is read with ``load``; the result is queried by the name
    of each candidate's grandparent context.  The label is 1 if the
    candidate's ``get_cids()`` value is among that document's relations and
    -1 if not.  A label is only added when the same stable ids and annotator
    have none yet.  ``id_fname`` is accepted but not used.
    """
    with open(label_fname, 'rb') as label_file:
        relations = load(label_file)

    query_for_split = session.query(candidate_class).filter(
        candidate_class.split == split)

    for candidate in query_for_split.all():
        document = candidate.get_parent().get_parent()
        doc_relations = relations.get(document.name, set())
        is_positive = candidate.get_cids() in doc_relations
        # Maps True/False onto +1/-1.
        label = 2 * int(is_positive) - 1

        stable_ids = [context.get_stable_id() for context in candidate]
        stable_key = '~~'.join(stable_ids)

        lookup = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_key)
            .filter(StableLabel.annotator_name == annotator)
        )
        if lookup.count() == 0:
            session.add(StableLabel(
                context_stable_ids=stable_key,
                annotator_name=annotator,
                value=label))

    session.commit()

    reload_annotator_labels(session, candidate_class, annotator,
                            split=split, filter_label_split=False)
def load_external_labels(session, candidate_class, tsv_path, annotator_name='gold', symmetric=False, reload=False, filter_label_split=False, debug=True):
    """Store Chemical/Gene gold labels from a TSV file.

    Adapted from snorkel/tutorials/workshop/lib/load_external_annotations.py

    Each row's ``Chemical`` and ``Gene`` values are joined with ``~~`` to form
    the stable id key; the row's ``label`` is the value.  Keys that already
    have a label for ``annotator_name`` are skipped.

    Args:
        session: Database session used for the lookups, inserts and commit.
        candidate_class: Candidate class forwarded to
            ``reload_annotator_labels``.
        tsv_path: Path of the tab-separated label file
            (e.g. 'data/gold_labels.tsv').
        annotator_name: Annotator name stored on every new label.
        symmetric: When true, a second pass also stores each row under the
            reversed ``Gene~~Chemical`` key.
        reload: Whether to reload annotations (perform mapping for splits
            0, 1, 2).
        filter_label_split: Forwarded to ``reload_annotator_labels``.
        debug: Forwarded to ``reload_annotator_labels``.
    """
    # TODO: delete {DEBUG}
    frame = pd.read_csv(tsv_path, sep="\t")

    column_orders = [('Chemical', 'Gene')]
    if symmetric:
        column_orders.append(('Gene', 'Chemical'))

    # One complete pass over the rows per direction, so every forward key is
    # handled before any reversed key.
    for left_column, right_column in column_orders:
        for _, record in frame.iterrows():
            stable_key = "~~".join(
                [record[left_column], record[right_column]])
            existing = session.query(StableLabel).filter(
                StableLabel.context_stable_ids == stable_key)
            existing = existing.filter(
                StableLabel.annotator_name == annotator_name)
            if existing.count() > 0:
                continue
            session.add(StableLabel(
                context_stable_ids=stable_key,
                annotator_name=annotator_name,
                value=record['label']))

    session.commit()

    if reload:
        for split_id in (0, 1, 2):
            reload_annotator_labels(session, candidate_class, annotator_name,
                                    split=split_id,
                                    filter_label_split=filter_label_split,
                                    debug=debug)
def load_external_labels(session, candidate_class, column1_title,
                         column2_title, filepath, candidates,
                         annotator_name='gold'):
    """Store gold labels from a TSV file and default the rest to -1.

    First, every TSV row is stored under the key formed by joining its
    ``column1_title`` and ``column2_title`` values with ``~~``, using the
    row's ``label`` as value.  Then every candidate in ``candidates`` whose
    key (stable ids of its first two contexts joined with ``~~``) still has
    no label for ``annotator_name`` receives the value -1.  After the commit,
    ``reload_annotator_labels`` is called for splits 1 and 2.

    Args:
        session: Database session used for the lookups, inserts and commit.
        candidate_class: Candidate class forwarded to
            ``reload_annotator_labels``.
        column1_title: Name of the TSV column holding the first stable id.
        column2_title: Name of the TSV column holding the second stable id.
        filepath: Path of the tab-separated label file.
        candidates: Iterable of candidates; each is indexed at 0 and 1 and
            its ``biomarker`` attribute is printed.
        annotator_name: Annotator name stored on every new label.
    """
    gold_labels = pd.read_csv(filepath, sep="\t")

    for _, row in gold_labels.iterrows():
        context_stable_ids = "~~".join(
            [row[column1_title], row[column2_title]])

        # Skip labels that were already stored by an earlier run.
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))
        # NOTE(review): this function previously repeated the lookup above
        # under a "symmetric relation" comment but with the same column
        # order, so the reversed key was never stored.  Only the forward key
        # is written here -- confirm whether reversed labels are wanted.

    for c in candidates:
        # Function-call form of print works on both Python 2 and Python 3.
        print(c.biomarker.get_stable_id())
        print(c)

        candidate_label = c[0].get_stable_id() + "~~" + c[1].get_stable_id()
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == candidate_label)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            # No gold label covers this candidate: store it as negative.
            session.add(
                StableLabel(context_stable_ids=candidate_label,
                            annotator_name=annotator_name,
                            value=-1))

    session.commit()

    reload_annotator_labels(session, candidate_class, annotator_name,
                            split=1, filter_label_split=False)
    reload_annotator_labels(session, candidate_class, annotator_name,
                            split=2, filter_label_split=False)
def load_external_labels(session, candidate_class, tsv_path,
                         annotator_name='gold', symmetric=False, reload=False,
                         filter_label_split=False, debug=True):
    """Store Chemical/Gene gold labels from a TSV file.

    Adapted from snorkel/tutorials/workshop/lib/load_external_annotations.py

    Each row's ``Chemical`` and ``Gene`` values are joined with ``~~`` to form
    the stable id key; the row's ``label`` is the value.  A label is only
    added when none exists yet for the same key and ``annotator_name``.

    Args:
        session: Database session used for the lookups, inserts and commit.
        candidate_class: Candidate class forwarded to
            ``reload_annotator_labels``.
        tsv_path: Path of the tab-separated label file
            (e.g. 'data/gold_labels.tsv').
        annotator_name: Annotator name stored on every new label.
        symmetric: When true, a second pass also stores each row under the
            reversed ``Gene~~Chemical`` key.
        reload: Whether to reload annotations (perform mapping for splits
            0, 1, 2).
        filter_label_split: Forwarded to ``reload_annotator_labels`` when
            ``reload`` is true.
        debug: Forwarded to ``reload_annotator_labels`` when ``reload`` is
            true.
    """
    # TODO: delete {DEBUG}
    gold_labels = pd.read_csv(tsv_path, sep="\t")

    for _, row in gold_labels.iterrows():
        context_stable_ids = "~~".join([row['Chemical'], row['Gene']])

        # Skip labels that were already stored by an earlier run.
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(StableLabel(
                context_stable_ids=context_stable_ids,
                annotator_name=annotator_name,
                value=row['label']))

    # For a symmetric relation, load the reversed direction in a second pass.
    if symmetric:
        for _, row in gold_labels.iterrows():
            context_stable_ids = "~~".join([row['Gene'], row['Chemical']])
            query = session.query(StableLabel).filter(
                StableLabel.context_stable_ids == context_stable_ids)
            query = query.filter(StableLabel.annotator_name == annotator_name)
            if query.count() == 0:
                session.add(StableLabel(
                    context_stable_ids=context_stable_ids,
                    annotator_name=annotator_name,
                    value=row['label']))

    session.commit()

    if reload:
        for split in (0, 1, 2):
            reload_annotator_labels(session, candidate_class, annotator_name,
                                    split=split,
                                    filter_label_split=filter_label_split,
                                    debug=debug)


#######################################################
### Load from pickle dictionary (on document level) ###
#######################################################

### From
# snorkel/tutorials/cdr/load_external_annotations.py (v0.6.2)

# from six.moves.cPickle import load
# from snorkel.db_helpers import reload_annotator_labels
# from snorkel.models import StableLabel
# import bz2

# def load_external_labels(session, candidate_class, split, annotator='gold',
#     label_fname='data/cdr_relations_gold.pkl', id_fname='data/doc_ids.pkl'):
#     # Load document-level relation annotations
#     if label_fname.endswith('.bz2'):
#         with bz2.BZ2File(label_fname, 'rb') as f:
#             relations = load(f)
#     else:
#         with open(label_fname, 'rb') as f:
#             relations = load(f)
#     # Get split candidates
#     candidates = session.query(candidate_class).filter(
#         candidate_class.split == split
#     ).all()
#     for c in candidates:
#         # Get the label by mapping document annotations to mentions
#         doc_relations = relations.get(c.get_parent().get_parent().name, set())
#         label = 2 * int(c.get_cids() in doc_relations) - 1
#         # Get stable ids and check to see if label already exists
#         context_stable_ids = '~~'.join(x.get_stable_id() for x in c)
#         query = session.query(StableLabel).filter(
#             StableLabel.context_stable_ids == context_stable_ids
#         )
#         query = query.filter(StableLabel.annotator_name == annotator)
#         # If does not already exist, add label
#         if query.count() == 0:
#             session.add(StableLabel(
#                 context_stable_ids=context_stable_ids,
#                 annotator_name=annotator,
#                 value=label
#             ))
#     # Commit session
#     session.commit()
#     # Reload annotator labels
#     reload_annotator_labels(session, candidate_class, annotator, split=split, filter_label_split=False)