Example no. 1
def reload_external_labels(session: SnorkelSession,
                           input_file: Union[str, Path],
                           annotator_name: str = "gold"):
    Education = get_candidate_class()
    with open(str(input_file), "r") as f:
        lbls = ujson.load(f)

    for lbl in lbls:
        # we check if the label already exists, in case this cell was already executed
        context_stable_ids = "~~".join((lbl['person'], lbl['organization']))
        query = session.query(StableLabel).filter(
            StableLabel.context_stable_ids == context_stable_ids)
        query = query.filter(StableLabel.annotator_name == annotator_name)
        if query.count() == 0:
            session.add(
                StableLabel(context_stable_ids=context_stable_ids,
                            annotator_name=annotator_name,
                            value=lbl['value']))

    # commit session
    session.commit()

    # reload annotator labels
    reload_annotator_labels(session,
                            Education,
                            annotator_name,
                            split=1,
                            filter_label_split=False)
    reload_annotator_labels(session,
                            Education,
                            annotator_name,
                            split=2,
                            filter_label_split=False)
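
A minimal usage sketch for reload_external_labels, assuming the gold-label file is a JSON list of {"person": ..., "organization": ..., "value": ...} records (inferred from the loop above); the file path is hypothetical.

# Hypothetical invocation, not part of the original example.
if __name__ == "__main__":
    session = SnorkelSession()
    reload_external_labels(session,
                           input_file="data/gold_labels.json",
                           annotator_name="gold")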
def create_collection(predicate_resume, split):
    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]
    if split not in (1, 2):
        print("No split selected")
        logging.error("No split selected")
        return None
    cids_query = session.query(
        CandidateSubclass.id).filter(CandidateSubclass.split == split)
    brat = BratAnnotator(session, CandidateSubclass, encoding='utf-8')
    collection_name = get_collection_name(predicate_resume, split)
    brat.init_collection(collection_name, cid_query=cids_query)
    return collection_name
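
A hedged usage sketch for create_collection: predicate_resume is assumed to carry at least a "candidate_subclass" entry (as the filter above requires); the real dict likely holds more keys (e.g. the predicate name used by get_collection_name). The Education subclass and its fields are taken from Example no. 1 and are only illustrative.

# Hypothetical setup; candidate_subclass comes from snorkel.models.
from snorkel.models import candidate_subclass

Education = candidate_subclass('Education', ['person', 'organization'])
predicate_resume = {"candidate_subclass": Education}
dev_collection = create_collection(predicate_resume, split=1)   # dev split
test_collection = create_collection(predicate_resume, split=2)  # test split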
Example no. 3
def learn_generative(y_data):
    """
    Uses Snorkel to learn a generative model of the relative accuracies of LFs.
    It learns one generative model for each class, and combines them into a set of noisy labels
    """
    # One vote matrix per class: labels[class][example] -> votes from each LF
    labels = [[] for _ in range(13)]
    for ex in y_data:
        for i in range(13):
            label_i = [int(vote[i]) for vote in ex]
            labels[i].append(np.array(label_i))
    # map() is lazy in Python 3, so build the arrays explicitly
    labels = np.array([np.array(x) for x in labels])
    n_labels = []
    n_stats = []
    for i, class_lbl in enumerate(labels):
        print("learning generative model for label: {}".format(i))
        session = SnorkelSession()
        gen_model = GenerativeModel()
        gen_model.train(class_lbl,
                        epochs=100,
                        decay=0.95,
                        step_size=0.1 / class_lbl.shape[0],
                        reg_param=1e-6,
                        cardinality=2)
        train_marginals = gen_model.marginals(csr_matrix(class_lbl))
        n_labels.append(train_marginals)
        n_stats.append(gen_model.learned_lf_stats())
    for i, stats in enumerate(n_stats):
        stats.to_csv("./results/lf_stats/" + int_to_label[i],
                     sep=',',
                     encoding='utf-8')
    return np.array(n_labels).T
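
The expected shape of y_data is only implicit in the nested loops above: one entry per example, each entry holding one vote per labeling function, and each vote a sequence of 13 per-class values (e.g. in {-1, 0, 1}). A synthetic call, purely as a sketch; it assumes the module-level int_to_label mapping and the ./results/lf_stats/ directory exist.

# Synthetic input illustrating the assumed layout:
# y_data[example][lf][class_index], 13 classes, votes in {-1, 0, 1}.
import numpy as np

n_examples, n_lfs = 50, 5
y_data = np.random.choice([-1, 0, 1], size=(n_examples, n_lfs, 13))
noisy_labels = learn_generative(y_data)  # shape: (n_examples, 13)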
Example no. 4
def main():

    from snorkel import SnorkelSession
    session = SnorkelSession()

    import os
    from snorkel.parser import XMLMultiDocPreprocessor

    # The following line is for testing only. Feel free to ignore it.
    file_path = 'data/CDR.BioC.small.xml' if 'CI' in os.environ else 'data/CDR.BioC.xml'

    doc_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                               doc='.//document',
                                               text='.//passage/text/text()',
                                               id='.//id/text()')

    from snorkel.parser import CorpusParser
    from utils import TaggerOneTagger

    tagger_one = TaggerOneTagger()
    corpus_parser = CorpusParser(fn=tagger_one.tag)
    corpus_parser.apply(list(doc_preprocessor)[:100])
    # parsed result saved in session

    return doc_preprocessor, corpus_parser, session
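
A quick sanity check on the objects returned by main(); Document and Sentence are the standard Snorkel models, and the check itself is not part of the original example.

# Illustrative follow-up: count what the corpus parser stored in the session.
from snorkel.models import Document, Sentence

doc_preprocessor, corpus_parser, session = main()
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())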
Example no. 5
def main(args):

    session = SnorkelSession()

    # ---------------------------------------
    # 1: Split into blocks
    # ---------------------------------------
    split_pubtator_corpus(args.input_file, split_size=args.split_size)

    # ---------------------------------------
    # 2: Parse documents
    # ---------------------------------------
    filelist = glob.glob("{}.splits_{}/*".format(args.input_file,
                                                 args.split_size))

    # Iterate through the splits, timing each one
    start_ts = time()
    for fp in filelist:
        split_start_ts = time()
        doc_preprocessor = PubTatorDocPreprocessor(fp)
        parser = Spacy() if args.parser == "spacy" else StanfordCoreNLPServer()
        corpus_parser = CorpusParser(parser=parser)
        corpus_parser.apply(doc_preprocessor,
                            parallelism=args.num_procs,
                            clear=False)
        print("Split completed in [%s]" % (time() - split_start_ts,))

    # pubtator_tags = PubTatorTagProcessor()
    # for fp in filelist:
    #     # load entity tags
    #     pubtator_tags.load_data(session, fp)

    print "\nDONE in [%s]" % (time() - start_ts, )
Example no. 6
def __init__(self, name, version=0.1):
    """
    Create DB connection
    :param name: Database name
    :param version: Version tag for this run
    """
    self.session = SnorkelSession()
    self.name = name
    self.version = version
    # Recreate the SQLite output database from scratch on every run
    if os.path.isfile(self.name + '/output.db'):
        os.remove(self.name + '/output.db')
    self.conn = sqlite3.connect(self.name + '/output.db')
Example no. 7
def predicate_candidate_labelling(predicate_resume,
                                  parallelism=1,
                                  limit=None,
                                  replace_key_set=False):
    logging.info("Starting labeling ")
    session = SnorkelSession()
    try:
        candidate_subclass = predicate_resume["candidate_subclass"]
        key_group = predicate_resume["label_group"]

        cids_query = session.query(
            candidate_subclass.id).filter(candidate_subclass.split == 0)

        ##skip cands already extracted
        #alreadyExistsGroup=session.query(LabelKey).filter(LabelKey.group==key_group).count()>0
        #if alreadyExistsGroup:
        #    cids_query= get_train_cids_not_labeled(predicate_resume,session)

        #if limit !=None:
        #    cids_query=cids_query.filter(candidate_subclass.id<limit)

        LFs = get_labelling_functions(predicate_resume)

        labeler = LabelAnnotator(lfs=LFs)
        np.random.seed(1701)

        ## On the first run, or after adding new labeling functions,
        ## replace_key_set needs to be set to True
        #if not replace_key_set:
        #    replace_key_set = not alreadyExistsGroup
        L_train = labeler.apply(parallelism=parallelism,
                                cids_query=cids_query,
                                key_group=key_group,
                                clear=True,
                                replace_key_set=True)
        print(L_train.lf_stats(session))
        logging.info(L_train.lf_stats(session))

    finally:
        logging.info("Finished labeling ")
def train_disc_model(predicate_resume, parallelism=8):
    logging.info("Start training disc ")
    session = SnorkelSession()
    train_cids_query = get_train_cids_with_marginals_and_span(predicate_resume, session)
    logging.info("Loading marginals ")
    train_marginals = load_marginals(session, split=0, cids_query=train_cids_query)

    train_kwargs = {
        'lr':         0.01,
        'dim':        50,
        'n_epochs':   10,
        'dropout':    0.25,
        'print_freq': 1,
        'max_sentence_length': 100
    }

    logging.info("Querying train cands")
    candidate_subclass=predicate_resume["candidate_subclass"]
    train_cands = session.query(candidate_subclass).filter(candidate_subclass.split == 0).order_by(candidate_subclass.id).all()#get_train_cands_with_marginals_and_span(predicate_resume, session).all()
    logging.info("Querying dev cands")
    dev_cands = get_dev_cands_with_span(predicate_resume, session).all()
    logging.info("Querying gold labels")
    L_gold_dev = get_gold_dev_matrix(predicate_resume, session)
    logging.info("Training")
    lstm = reRNN(seed=1701, n_threads=int(parallelism))
    lstm.train(train_cands, train_marginals, **train_kwargs)
    logging.info("Saving")
    _save_model(predicate_resume, lstm)
    # Test the model
    test_cands = (session.query(candidate_subclass)
                  .filter(candidate_subclass.split == 2)
                  .order_by(candidate_subclass.id)
                  .all())
    L_gold_test = get_gold_test_matrix(predicate_resume, session)
    p, r, f1 = lstm.score(test_cands, L_gold_test)
    print("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))
    logging.info("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))
    lstm.save_marginals(session, test_cands)
Example no. 9
def extract_triples(predicate_resume, disc_model_name=None):
    date_time = strftime("%Y-%m-%d_%H_%M_%S", gmtime())
    session = SnorkelSession()
    if disc_model_name is None:
        disc_model_name = "D" + predicate_resume["predicate_name"] + "Latest"
    test_cands_query = get_test_cids_with_span(predicate_resume, session)

    test_cands = test_cands_query.all()
    lstm = reRNN()
    logging.info("Loading marginals ")
    lstm.load(disc_model_name)

    predictions = lstm.predictions(test_cands)
    dump_file_path3 = "./results/" + "triples_" + predicate_resume[
        "predicate_name"] + date_time + ".csv"

    subject_type = predicate_resume["subject_type"]
    object_type = predicate_resume["object_type"]
    subject_type_end = subject_type.split('/')[-1]
    object_type_end = object_type.split('/')[-1]
    with open(dump_file_path3, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["subject", "predicate", "object"])
        for i, c in enumerate(test_cands):
            if predictions[i] == 1:
                subject_span = getattr(c, "subject").get_span()
                object_span = getattr(c, "object").get_span()
                subject_uri = get_dbpedia_node(subject_span, subject_type_end)
                object_uri = get_dbpedia_node(object_span, object_type_end)
                predicate_uri = predicate_resume["predicate_URI"]
                if subject_uri is not None and object_uri is not None:
                    writer.writerow([
                        str(subject_uri),
                        str(predicate_uri),
                        str(object_uri)
                    ])
Example no. 10
def parse_wikipedia_dump(
        dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
        clear=False,
        parallelism=8):

    logging.info("Corpus parsing start")
    session = SnorkelSession()

    corpus_parser = CorpusParser(parser=Spacy())
    onlyfiles = [
        f for f in listdir(dumps_folder_path)
        if isfile(join(dumps_folder_path, f))
    ]

    i = 0
    for file in onlyfiles:
        if file.endswith(".xml"):
            print(file)
            doc_preprocessor = XMLMultiDocPreprocessor(path=dumps_folder_path +
                                                       file,
                                                       doc='.//doc',
                                                       text='./text()',
                                                       id='./@title')
            if i > 0:
                clear = False
            try:
                corpus_parser.apply(doc_preprocessor,
                                    clear=clear,
                                    parallelism=parallelism)
            except IntegrityError as e:
                print("Already parsed " + file)
                logging.error("Already parsed " + file)
            i = i + 1
    #logging.debug("Documents: %d", session.query(Document).count())
    #logging.debug("Sentences: %d", session.query(Sentence).count())
    logging.info("Corpus parsing end")
Example no. 11
# In[ ]:

#Set up the environment
import os

username = "******"
password = "******"
dbname = "pubmeddb"

#Connection path may need to change on a different OS
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession

session = SnorkelSession()

# In[ ]:

from snorkel.annotations import LabelAnnotator, load_marginals
from snorkel.annotations import load_gold_labels
from snorkel.learning.pytorch import LSTM
from snorkel.models import Candidate, FeatureKey, candidate_subclass

# In[ ]:

edge_type = "dg"

# In[ ]:

if edge_type == "dg":
Example no. 12
import os

import pandas as pd
import tqdm
from snorkel import SnorkelSession
from snorkel.candidates import PretaggedCandidateExtractor
from snorkel.models import Document, Sentence, candidate_subclass
from snorkel.parser import CorpusParser, XMLMultiDocPreprocessor
from snorkel.viewer import SentenceNgramViewer

# In[ ]:

#Set up the environment
database_str = "sqlite:///" + os.environ[
    'WORKINGPATH'] + "/Database/epilepsy.db"
os.environ['SNORKELDB'] = database_str

session = SnorkelSession()

# # Parse the Pubmed Abstracts

# The code below reads and parses data gathered from Pubtator. Pubtator outputs its annotated text in XML format, so that is the file format we use here.

# In[ ]:

working_path = os.environ['WORKINGPATH']
xml_parser = XMLMultiDocPreprocessor(path=working_path +
                                     '/Database/epilepsy_data.xml',
                                     doc='.//document',
                                     text='.//passage/text/text()',
                                     id='.//id/text()')

# In[ ]:
import tqdm

# In[ ]:

#Set up the environment
username = "******"
password = "******"
dbname = "pubmeddb"

#Connection path may need to change on a different OS
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession
session = SnorkelSession()

# In[ ]:

from snorkel.annotations import FeatureAnnotator, LabelAnnotator
from snorkel.features import get_span_feats
from snorkel.models import candidate_subclass
from snorkel.models import Candidate
from snorkel.viewer import SentenceNgramViewer

# In[ ]:

edge_type = "dg"
debug = False

# In[ ]:
Example no. 14
import os

import pandas as pd
import seaborn as sns

sns.set(rc={'figure.figsize': (12, 6), "font.size": 17})

# In[2]:

#Set up the environment
username = "******"
password = "******"
dbname = "pubmeddb"

#Connection path may need to change on a different OS
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession
session = SnorkelSession()

# In[3]:

from snorkel.models import candidate_subclass, Candidate
DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

# In[4]:

from utils.notebook_utils.dataframe_helper import write_candidates_to_excel, make_sentence_df

# ## Load and Merge DataFrames

# In[5]:

edge_level_df = pd.read_csv("input/disease_associates_gene.tsv.xz", sep="\t")
Example no. 15

# In[ ]:


#Set up the environment
import os

username = "******"
password = "******"
dbname = "pubmeddb"

#Connection path may need to change on a different OS
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(
    username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession
session = SnorkelSession()


# In[ ]:


from snorkel.annotations import LabelAnnotator
from snorkel.models import candidate_subclass
from snorkel.models import Candidate
from snorkel.viewer import SentenceNgramViewer


# In[ ]:


DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])
Example no. 16
import json
import os
import random

import numpy as np

with open('/dfs/scratch1/jdunnmon/data/memex-data/config/config.json') as fl:
    config = json.load(fl)

# Changing directory to code area
os.chdir(config['homedir'])

#For PostgreSQL; 'args' is assumed to hold parsed command-line arguments
postgres_db_name = os.path.split(args['file'])[-1].split('.')[0]
os.environ['SNORKELDB'] = os.path.join(config['postgres_location'],
                                       postgres_db_name)

print(f"Env: {os.environ['SNORKELDB']}")

# Start Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

# Setting parallelism
parallelism = config['parallelism']

# Setting random seed
seed = config['seed']
random.seed(seed)
np.random.seed(seed)

# Set data source: options are 'content.tsv', 'memex_jsons', 'es'
data_source = config['data_source']

# Setting max number of docs to ingest
max_docs = config['max_docs']