Ejemplo n.º 1
0
def load_external_labels(session,
                         candidate_class,
                         split,
                         annotator_name='gold'):
    """Load virus/host gold labels from the TSV at ``FPATH`` into the session.

    For every row of the tab-separated file, the ``virus`` and ``host``
    columns are joined with ``~~`` to form the ``context_stable_ids`` key.
    A ``StableLabel`` carrying the row's ``label`` is added only when no
    label with that key and annotator name is already stored, so running
    the function again does not insert the same label twice.

    Args:
        session: Database session used for querying, adding and committing.
        candidate_class: Candidate class forwarded to
            ``reload_annotator_labels``.
        split: Split forwarded to ``reload_annotator_labels``.
        annotator_name: Annotator name stored on each label.
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")

    # NOTE: the previous version also fetched every candidate of ``split``
    # with ``.all()`` and never used the result; that query was dropped.
    for _, row in gold_labels.iterrows():
        context_stable_ids = "~~".join([row['virus'], row['host']])

        # Look for an existing label, in case this cell was already executed.
        existing = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids
        ).filter(StableLabel.annotator_name == annotator_name)

        if existing.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

    # Commit session
    session.commit()

    # Reload annotator labels
    reload_annotator_labels(session,
                            candidate_class,
                            annotator_name,
                            split=split,
                            filter_label_split=False)
Ejemplo n.º 2
0
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Load person-pair gold labels from the TSV at ``FPATH``.

    Each row contributes two ``~~``-joined keys, ``person1~~person2`` and
    ``person2~~person1``, because the relation is treated as symmetric.
    A ``StableLabel`` with the row's ``label`` is added for a key only when
    none exists yet for that key and ``annotator_name``. After committing,
    ``reload_annotator_labels`` is called for splits 1 and 2.
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")
    column_orders = (('person1', 'person2'), ('person2', 'person1'))

    for _, row in gold_labels.iterrows():
        # Forward direction first, then the mirrored one.
        for left, right in column_orders:
            stable_ids = "~~".join([row[left], row[right]])

            # Skip keys that are already stored (e.g. the cell was re-run).
            matches = session.query(StableLabel).filter(
                StableLabel.context_stable_ids == stable_ids)
            matches = matches.filter(
                StableLabel.annotator_name == annotator_name)
            if matches.count() != 0:
                continue

            session.add(StableLabel(
                context_stable_ids=stable_ids,
                annotator_name=annotator_name,
                value=row['label']))

    # Persist the new labels before reloading.
    session.commit()

    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
def load_external_labels(session, candidate_class, split, annotator='gold',
    label_fname='data/cdr_relations_gold.pkl', id_fname='data/doc_ids.pkl'):
    """Derive mention-level gold labels from pickled document-level relations.

    The pickle at ``label_fname`` is read with ``load`` and looked up by
    document name. Every candidate of ``split`` gets the value ``1`` when
    ``c.get_cids()`` is among that document's relations and ``-1``
    otherwise. A ``StableLabel`` is added only when none exists for the
    candidate's stable ids and ``annotator``.

    ``id_fname`` is accepted but not used by this function.
    """
    # NOTE(review): this unpickles ``label_fname``; only use trusted files.
    with open(label_fname, 'rb') as handle:
        relations = load(handle)

    split_candidates = session.query(candidate_class).filter(
        candidate_class.split == split
    ).all()

    for cand in split_candidates:
        # Map the document's annotations onto this mention pair.
        doc_name = cand.get_parent().get_parent().name
        doc_relations = relations.get(doc_name, set())
        label = 1 if cand.get_cids() in doc_relations else -1

        # Build the stable-id key and check whether a label already exists.
        context_stable_ids = '~~'.join(x.get_stable_id() for x in cand)
        existing = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids
        ).filter(StableLabel.annotator_name == annotator)
        if existing.count() != 0:
            continue

        session.add(StableLabel(
            context_stable_ids=context_stable_ids,
            annotator_name=annotator,
            value=label
        ))

    # Commit session
    session.commit()

    # Reload annotator labels
    reload_annotator_labels(session, candidate_class, annotator,
                            split=split, filter_label_split=False)
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Store gold labels for person pairs read from the TSV at ``FPATH``.

    For each row, the pair is keyed in both directions
    (``person1~~person2`` then ``person2~~person1``) since the relation is
    symmetric. A key that already has a ``StableLabel`` for
    ``annotator_name`` is left untouched; otherwise one is added with the
    row's ``label``. The session is committed and
    ``reload_annotator_labels`` is then called for splits 1 and 2.
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")

    for _, row in gold_labels.iterrows():
        forward = [row['person1'], row['person2']]
        # The second pass of the inner loop handles the mirrored direction.
        for pair in (forward, forward[::-1]):
            key = "~~".join(pair)

            # Existence check guards against re-running the cell.
            already_there = session.query(StableLabel).filter(
                StableLabel.context_stable_ids == key
            ).filter(StableLabel.annotator_name == annotator_name).count()

            if already_there == 0:
                session.add(StableLabel(
                    context_stable_ids=key,
                    annotator_name=annotator_name,
                    value=row['label']))

    # Commit session
    session.commit()

    # Reload annotator labels
    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
Ejemplo n.º 5
0
def reload_external_labels(session: SnorkelSession,
                           input_file: Union[str, Path],
                           annotator_name: str = "gold"):
    """Load person/organization labels from a JSON file into the session.

    ``input_file`` is parsed with ``ujson.load`` and iterated; each entry
    is read through its ``person``, ``organization`` and ``value`` keys.
    A ``StableLabel`` keyed by ``person~~organization`` is added only when
    none exists yet for that key and ``annotator_name``. After committing,
    ``reload_annotator_labels`` is called for splits 1 and 2 with the class
    returned by ``get_candidate_class()``.
    """
    Education = get_candidate_class()
    with open(str(input_file), "r") as handle:
        entries = ujson.load(handle)

    for entry in entries:
        stable_ids = "~~".join((entry['person'], entry['organization']))

        # Skip entries that were stored by an earlier run.
        existing = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == stable_ids
        ).filter(StableLabel.annotator_name == annotator_name)
        if existing.count() != 0:
            continue

        session.add(
            StableLabel(context_stable_ids=stable_ids,
                        annotator_name=annotator_name,
                        value=entry['value']))

    # commit session
    session.commit()

    # reload annotator labels
    for split_id in (1, 2):
        reload_annotator_labels(session,
                                Education,
                                annotator_name,
                                split=split_id,
                                filter_label_split=False)
Ejemplo n.º 6
0
def load_external_labels(session, candidate_class, annotator_name='gold',file_path=None,isPrint=True):
    """Load segment-level gold labels from a tab-separated file.

    The ``segment`` column of each row is used directly as the
    ``context_stable_ids`` key and the ``label`` column as the value.
    When ``isPrint`` is true, each key is printed as it is processed.
    A ``StableLabel`` is added only when none exists for the key and
    ``annotator_name``. After committing, ``reload_annotator_labels`` is
    called for splits 1 and 2.

    ``file_path`` is handed to ``pd.read_csv`` unchanged, including the
    default ``None``.
    """
    # inherited from tutorial/intro/util.py
    gold_labels = pd.read_csv(file_path, sep="\t")

    for _, row in gold_labels.iterrows():
        segment_id = row['segment']
        if isPrint:
            print(segment_id)

        # Skip segments that already have a label from this annotator.
        existing = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == segment_id
        ).filter(StableLabel.annotator_name == annotator_name)
        if existing.count() != 0:
            continue

        session.add(StableLabel(
            context_stable_ids=segment_id,
            annotator_name=annotator_name,
            value=row['label']))

    # Commit session
    session.commit()

    # Reload annotator labels
    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
Ejemplo n.º 7
0
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Load cell-level gold labels from the UTF-8 TSV at ``FPATH``.

    The ``cell`` column of each row is the ``context_stable_ids`` key and
    the ``label`` column the value. A ``StableLabel`` is added only when
    none exists for the key and ``annotator_name``. The index of the last
    processed row is printed (``None`` when the file has no rows), the
    session is committed, and ``reload_annotator_labels`` is called for
    splits 1 and 2.
    """
    gold_labels = pd.read_csv(FPATH, delimiter='\t', encoding='utf-8')

    # Pre-bind so the print below does not raise UnboundLocalError when the
    # file contains no data rows and the loop body never runs.
    index = None
    for index, row in gold_labels.iterrows():
        # Single-element join: yields the cell id itself for string values.
        context_stable_ids = "~~".join([row['cell']])

        # Check whether the label already exists, in case this cell was
        # already executed.
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)

        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

    print(index)
    # Commit session
    session.commit()

    # Reload annotator labels
    for split_id in (1, 2):
        reload_annotator_labels(session,
                                candidate_class,
                                annotator_name,
                                split=split_id,
                                filter_label_split=False)
Ejemplo n.º 8
0
def load_external_trend_labels(session, candidate_class, annotator_name='gold'):
    """Load trend gold labels from the TSV at ``FPATH``, logging each row.

    The ``tr`` column of each row is the ``context_stable_ids`` key and the
    ``label`` column the value. For every row a banner and the row itself
    are printed, saying whether a label was added or one already existed
    for the key and ``annotator_name``. After committing,
    ``reload_annotator_labels`` is called for splits 1 and 2.
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")

    for _, row in gold_labels.iterrows():
        trend_id = row['tr']

        # Count existing labels for this key, in case the cell was re-run.
        found = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == trend_id
        ).filter(StableLabel.annotator_name == annotator_name).count()

        if found != 0:
            print('----------------------------')
            print('stable label is found for this one!: ')
            print(row)
        else:
            print('********************************')
            print('adding gold labels for this row')
            print(row)
            session.add(StableLabel(
                context_stable_ids=trend_id,
                annotator_name=annotator_name,
                value=row['label']))

    # Commit session
    session.commit()
    # Reload annotator labels
    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
Ejemplo n.º 9
0
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Add one ``StableLabel`` per row of the TSV at ``FPATH``.

    Each label is built from the row's ``content`` column (passed as
    ``tweet``) and its ``label`` column. No existence check is made before
    adding. After committing, ``reload_annotator_labels`` is called for
    splits 1 and 2.
    """
    # NOTE(review): assumes this project's StableLabel accepts a ``tweet``
    # keyword - confirm against the model definition.
    gold_labels = pd.read_csv(FPATH, sep="\t")

    for _, row in gold_labels.iterrows():
        new_label = StableLabel(tweet = row['content'],
                                annotator_name = annotator_name,
                                value = row['label'])
        session.add(new_label)

    # Commit session
    session.commit()

    # Reload annotator labels
    for split_id in (1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id, filter_label_split=False)
Ejemplo n.º 10
0
def load_external_labels(session, candidate_class, annotator_name='gold'):
    """Load gold labels keyed by the ``Features`` column of the TSV at ``FPATH``.

    The ``Features`` column of each row is used directly as the
    ``context_stable_ids`` key and the ``label`` column as the value.
    A ``StableLabel`` is added only when none exists for the key and
    ``annotator_name``. After committing, ``reload_annotator_labels`` is
    called for splits 1 and 2.
    """
    gold_labels = pd.read_csv(FPATH, sep="\t")

    # The unused ``counter`` local and the triple-quoted strings that held
    # disabled code inside the loop were removed; neither affected behavior.
    for _, row in gold_labels.iterrows():
        context_stable_ids = row['Features']

        # Check whether the label already exists, in case this cell was
        # already executed.
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=row['label']))

    # Commit session
    session.commit()

    # Reload annotator labels
    for split_id in (1, 2):
        reload_annotator_labels(session,
                                candidate_class,
                                annotator_name,
                                split=split_id,
                                filter_label_split=False)
Ejemplo n.º 11
0
def load_external_labels(session,
                         candidate_class,
                         split,
                         annotator='gold',
                         label_fname='data/cdr_relations_gold.pkl',
                         id_fname='data/doc_ids.pkl'):
    """Turn pickled document-level relations into mention-level gold labels.

    Relations are read from ``label_fname`` with ``load`` and looked up by
    document name. Each candidate of ``split`` is labelled ``1`` when
    ``c.get_cids()`` appears in its document's relations, else ``-1``.
    A ``StableLabel`` is added only when none exists for the candidate's
    ``~~``-joined stable ids and ``annotator``.

    ``id_fname`` is accepted but not used by this function.
    """
    # NOTE(review): this unpickles ``label_fname``; only use trusted files.
    with open(label_fname, 'rb') as pkl_file:
        relations = load(pkl_file)

    # Restrict the work to candidates of the requested split.
    in_split = candidate_class.split == split
    candidates = session.query(candidate_class).filter(in_split).all()

    for c in candidates:
        # Document annotations -> label for this mention pair.
        document = c.get_parent().get_parent()
        doc_relations = relations.get(document.name, set())
        if c.get_cids() in doc_relations:
            label = 1
        else:
            label = -1

        # Stable-id key; add the label only when it is not stored yet.
        context_stable_ids = '~~'.join(x.get_stable_id() for x in c)
        stored = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids
        ).filter(StableLabel.annotator_name == annotator)
        if stored.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator,
                            value=label))

    # Commit session
    session.commit()

    # Reload annotator labels
    reload_annotator_labels(session,
                            candidate_class,
                            annotator,
                            split=split,
                            filter_label_split=False)
Ejemplo n.º 12
0
def load_external_labels(session, candidate_class, tsv_path, annotator_name='gold', symmetric=False, reload = False, filter_label_split = False, debug=True):
    """Load Chemical/Gene gold labels from a tab-separated file.

    Adapted from snorkel/tutorials/workshop/lib/load_external_annotations.py

    A first pass over the rows stores ``Chemical~~Gene`` keys. When
    ``symmetric`` is true, a second pass stores ``Gene~~Chemical`` keys.
    In both passes a ``StableLabel`` is added only when none exists for
    the key and ``annotator_name``.

    reload: Boolean:: when true, ``reload_annotator_labels`` is called for
        splits 0, 1 and 2, forwarding ``filter_label_split`` and ``debug``.
    """
    gold_labels = pd.read_csv(tsv_path, sep="\t") # TODO: delete {DEBUG}

    # One full pass per direction, so all forward keys precede reversed ones.
    directions = [('Chemical', 'Gene')]
    if symmetric:
        directions.append(('Gene', 'Chemical'))

    for first_col, second_col in directions:
        for _, row in gold_labels.iterrows():
            stable_ids = "~~".join([row[first_col], row[second_col]])

            # Skip keys already stored, in case this cell was re-run.
            existing = session.query(StableLabel).filter(
                StableLabel.context_stable_ids == stable_ids
            ).filter(StableLabel.annotator_name == annotator_name)
            if existing.count() != 0:
                continue

            session.add(StableLabel(
                context_stable_ids=stable_ids,
                annotator_name=annotator_name,
                value=row['label']))

    # Commit session
    session.commit()

    # Reload annotator labels
    if not reload:
        return
    for split_id in (0, 1, 2):
        reload_annotator_labels(session, candidate_class, annotator_name,
                                split=split_id,
                                filter_label_split=filter_label_split,
                                debug=debug)
def load_external_labels(session,
                         candidate_class,
                         column1_title,
                         column2_title,
                         filepath,
                         candidates,
                         annotator_name='gold'):
    """Load pairwise gold labels from a TSV and default the rest to ``-1``.

    For each row of ``filepath``, the values in ``column1_title`` and
    ``column2_title`` are joined with ``~~`` in both orders (the relation
    is symmetric) and stored with the row's ``label``. Every candidate in
    ``candidates`` whose ``c[0]~~c[1]`` stable-id key still has no label
    then receives the value ``-1``. A ``StableLabel`` is only ever added
    when none exists for the key and ``annotator_name``. After committing,
    ``reload_annotator_labels`` is called for splits 1 and 2.

    Args:
        session: Database session used for querying, adding and committing.
        candidate_class: Candidate class forwarded to
            ``reload_annotator_labels``.
        column1_title: Name of the TSV column holding the first stable id.
        column2_title: Name of the TSV column holding the second stable id.
        filepath: Path of the tab-separated gold label file.
        candidates: Iterable of candidates; each must support ``c[0]``,
            ``c[1]`` and a ``biomarker`` attribute.
        annotator_name: Annotator name stored on each label.
    """

    def _add_if_missing(stable_ids, value):
        # Add a label unless one already exists for this key and annotator,
        # in case this cell was already executed.
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=stable_ids,
                            annotator_name=annotator_name,
                            value=value))

    gold_labels = pd.read_csv(filepath, sep="\t")
    for _, row in gold_labels.iterrows():
        first, second = row[column1_title], row[column2_title]
        _add_if_missing("~~".join([first, second]), row['label'])

        # Because it's a symmetric relation, load both directions. The
        # previous version repeated the forward order here, so the reversed
        # key was never stored.
        _add_if_missing("~~".join([second, first]), row['label'])

    for c in candidates:
        # Debug output; call-style print emits the same text on Python 2
        # and also parses on Python 3.
        print(c.biomarker.get_stable_id())
        print(c)
        candidate_label = c[0].get_stable_id() + "~~" + c[1].get_stable_id()
        # Candidates absent from the gold file default to a negative label.
        _add_if_missing(candidate_label, -1)

    # Commit session
    session.commit()

    # Reload annotator labels
    for split_id in (1, 2):
        reload_annotator_labels(session,
                                candidate_class,
                                annotator_name,
                                split=split_id,
                                filter_label_split=False)
Ejemplo n.º 14
0
def load_external_labels(session, candidate_class, tsv_path, annotator_name='gold', symmetric=False, reload=False, filter_label_split=False, debug=True):
    """Load Chemical/Gene gold labels from a tab-separated file.

    Adapted from snorkel/tutorials/workshop/lib/load_external_annotations.py

    A first pass over the rows stores ``Chemical~~Gene`` keys; when
    ``symmetric`` is true a second pass stores ``Gene~~Chemical`` keys.
    In both passes a ``StableLabel`` is added only when none exists for
    the key and ``annotator_name``.

    Args:
        session: Database session used for querying, adding and committing.
        candidate_class: Candidate class forwarded to
            ``reload_annotator_labels``.
        tsv_path: Path of the tab-separated gold label file.
        annotator_name: Annotator name stored on each label.
        symmetric: Whether to also store the reversed key for each row.
        reload: Whether to call ``reload_annotator_labels`` for splits
            0, 1 and 2 after committing.
        filter_label_split: Forwarded to ``reload_annotator_labels``.
            Previously read from an undefined name, which raised
            ``NameError`` whenever ``reload`` was true.
        debug: Forwarded to ``reload_annotator_labels``; previously
            undefined in the same way.
    """
    gold_labels = pd.read_csv(tsv_path, sep="\t")  # TODO: delete {DEBUG}

    # One full pass per direction, so all forward keys precede reversed ones.
    directions = [('Chemical', 'Gene')]
    if symmetric:
        directions.append(('Gene', 'Chemical'))

    for first_col, second_col in directions:
        for _, row in gold_labels.iterrows():
            context_stable_ids = "~~".join([row[first_col], row[second_col]])

            # Check whether the label already exists, in case this cell
            # was already executed.
            query = session.query(StableLabel).filter(
                StableLabel.context_stable_ids == context_stable_ids)
            query = query.filter(StableLabel.annotator_name == annotator_name)
            if query.count() == 0:
                session.add(StableLabel(
                    context_stable_ids=context_stable_ids,
                    annotator_name=annotator_name,
                    value=row['label']))

    # Commit session
    session.commit()

    # Reload annotator labels
    if reload:
        for split_id in (0, 1, 2):
            reload_annotator_labels(session, candidate_class, annotator_name,
                                    split=split_id,
                                    filter_label_split=filter_label_split,
                                    debug=debug)



#######################################################
### Load from pickle dictionary (on document level) ###
#######################################################
###  From snorkel/tutorials/cdr/load_external_annotations.py (v0.6.2)
# from six.moves.cPickle import load

# from snorkel.db_helpers import reload_annotator_labels
# from snorkel.models import StableLabel
# import bz2

# def load_external_labels(session, candidate_class, split, annotator='gold',
#     label_fname='data/cdr_relations_gold.pkl', id_fname='data/doc_ids.pkl'):
#     # Load document-level relation annotations
#     if label_fname.endswith('.bz2'):
#         with bz2.BZ2File(label_fname, 'rb') as f:
#             relations = load(f)
#     else:    
#         with open(label_fname, 'rb') as f:
#             relations = load(f)
#     # Get split candidates
#     candidates = session.query(candidate_class).filter(
#         candidate_class.split == split
#     ).all()
#     for c in candidates:
#         # Get the label by mapping document annotations to mentions
#         doc_relations = relations.get(c.get_parent().get_parent().name, set())
#         label = 2 * int(c.get_cids() in doc_relations) - 1        
#         # Get stable ids and check to see if label already exits
#         context_stable_ids = '~~'.join(x.get_stable_id() for x in c)
#         query = session.query(StableLabel).filter(
#             StableLabel.context_stable_ids == context_stable_ids
#         )
#         query = query.filter(StableLabel.annotator_name == annotator)
#         # If does not already exist, add label
#         if query.count() == 0:
#             session.add(StableLabel(
#                 context_stable_ids=context_stable_ids,
#                 annotator_name=annotator,
#                 value=label
#             ))

#     # Commit session
#     session.commit()

#     # Reload annotator labels
#     reload_annotator_labels(session, candidate_class, annotator, split=split, filter_label_split=False)