Beispiel #1
0
# In[ ]:

# Define the DiseaseGene candidate class over the 'Disease' and 'Gene' entity
# types; the intent is to capture a disease and a gene mentioned together in a
# given sentence.
DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

# In[ ]:

# Extractor that produces DiseaseGene candidates for the same two entity types.
# NOTE(review): presumably relies on entity tags already attached to the
# sentences by the custom tagger -- confirm against the parsing step.
ce = PretaggedCandidateExtractor(DiseaseGene, ['Disease', 'Gene'])

# In[ ]:

# Run the candidate extractor over each sentence collection and report how
# many DiseaseGene candidates were stored for it. The split index k follows
# the list order below: 0 = train, 1 = dev, 2 = test.
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    ce.apply(sents, split=k)
    # Single-argument call form: prints identical text under Python 2 and
    # Python 3 (the former print statement is a SyntaxError on Python 3).
    # The two spaces after the colon reproduce the separator the print
    # statement inserted between its two operands.
    print("Number of Candidates:  {}".format(
        session.query(DiseaseGene).filter(DiseaseGene.split == k).count()))

# # Look at the Potential Candidates

# A handy feature of Jupyter is that you can use the SentenceNgramViewer tool to browse the candidates interactively. Check it out once everything above has finished running.

# In[ ]:

# Query the DiseaseGene candidates stored under split 1 and hand the query to
# the notebook viewer.
candidates = (
    session
    .query(DiseaseGene)
    .filter(DiseaseGene.split == 1)
)
sv = SentenceNgramViewer(candidates, session)

# In[ ]:

# Bare expression: as the last statement of a notebook cell it displays the viewer.
sv
Beispiel #2
0
                     'articles/disease_gold_labels.tsv',
                     dev_cands,
                     annotator_name='gold')

# Load the labels stored under annotator name 'gold' for split 1.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
# Bare expression; presumably kept from a notebook cell that displayed the matrix.
L_gold_dev
# Apply the existing labeler to split 1.
L_dev = labeler.apply_existing(split=1)
# Score the generative model on split 1 against the gold labels; the return
# value is deliberately discarded.
_ = gen_model.score(session, L_dev, L_gold_dev)
# Labeling-function statistics against gold, passing along the 'Accuracy'
# column of the model's learned LF stats.
L_dev.lf_stats(session, L_gold_dev, gen_model.learned_lf_stats()['Accuracy'])

# Keep only the split-1 BiomarkerCondition candidates for which
# LF_markerDatabase returns 1, then browse them three per page.
labeled = [
    c
    for c in session.query(BiomarkerCondition).filter(
        BiomarkerCondition.split == 1).all()
    if LF_markerDatabase(c) == 1
]
SentenceNgramViewer(labeled, session, n_per_page=3)

# Load dev labels and convert to [0, 1] range:
# densify the gold matrix, flatten it to 1-D, then map each value x to
# (x + 1) / 2, so -1 becomes 0 and 1 becomes 1.
# NOTE(review): if the label matrix has an integer dtype and this runs on
# Python 2 without `from __future__ import division`, `/ 2` floors (an
# unlabeled 0 would map to 0 rather than 0.5) -- confirm the intended result.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
dev_labels = (np.ravel(L_gold_dev.todense()) + 1) / 2

# Feature Extraction
# apply() is used on split 0 and apply_existing() on splits 1 and 2.
# NOTE(review): presumably apply_existing() reuses the feature set produced
# from split 0 -- confirm against the FeatureAnnotator documentation.
featurizer = FeatureAnnotator()
F_train = featurizer.apply(split=0)
# Bare expression; presumably kept from a notebook cell that displayed the matrix.
F_train
F_dev = featurizer.apply_existing(split=1)
F_test = featurizer.apply_existing(split=2)

# Load the marginals for split 0 (keyed to F_train) and create the
# discriminative model with its default constructor arguments.
train_marginals = load_marginals(session, F_train, split=0)
disc_model = SparseLogisticRegression()
Beispiel #3
0

# Candidate ids taken from the training frame, cast to int for the
# membership filter below.
train_candidate_ids = train_df.candidate_id.astype(int)


# In[ ]:


# Build the query one step at a time: CompoundGene rows whose id is among the
# training ids, limited to 10 rows starting at offset 0.
candidates = session.query(CompoundGene)
candidates = candidates.filter(CompoundGene.id.in_(train_candidate_ids))
candidates = candidates.limit(10).offset(0)
sv = SentenceNgramViewer(candidates, session)


# In[ ]:


# Bare expression: as the last statement of a notebook cell it displays the viewer.
sv


# In[ ]:


# Fetch whichever candidate is currently selected in the viewer and show it.
c = sv.get_selected()
c

Beispiel #4
0
# Plot the precision-recall curve(s): pass the generative-model history frame,
# the dev data's curated_dsh values, the frame's column names, and the title.
plot_pr_curve(gen_model_history_df, dev_data.curated_dsh,
              gen_model_history_df.columns, "PR Curve of Generative Models")

# ## Individual Candidate Error Analysis

# The cells below show which candidate sentences were labeled incorrectly; which ones appear depends on which of the preceding code blocks was executed. Right now the false negatives (fn) are shown, but this could be changed to include the false positives (fp) as well.

# In[ ]:

import os

from snorkel.viewer import SentenceNgramViewer

# NOTE: The 'CI' environment check exists only to avoid opening the viewer
# during automated testing of this notebook. You should ignore this!
sv = (
    None
    if 'CI' in os.environ
    else SentenceNgramViewer(indep_results['CG_ALL'][1], session)
)

# In[ ]:

# Bare expression: as the last statement of a notebook cell it displays the
# viewer (or nothing when sv is None).
sv

# In[ ]:

# Use the candidate selected in the viewer when a viewer exists; otherwise
# fall back to the first element of the combined fp/fn collection.
if sv:
    c = sv.get_selected()
else:
    c = list(fp.union(fn))[0]
c

# In[ ]:

# Show the labels attached to the chosen candidate.
c.labels