# In[ ]:

# Candidate type for sentences that mention both a disease and a gene.
DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])


# In[ ]:

ce = PretaggedCandidateExtractor(DiseaseGene, ['Disease', 'Gene'])


# In[ ]:

# Extract candidates from the custom-tagged sentences one split at a time
# (0 = train, 1 = dev, 2 = test, following the order of the list below) and
# report how many candidates each split produced.
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    ce.apply(sents, split=k)
    num_candidates = (
        session.query(DiseaseGene).filter(DiseaseGene.split == k).count()
    )
    # A single parenthesized argument parses on both Python 2 and Python 3.
    # The "%s %s" separator reproduces the space the former print statement
    # inserted between its two items, so the output text is unchanged.
    print("%s %s" % ("Number of Candidates: ", num_candidates))


# # Look at the Potential Candidates
# Jupyter can render an interactive viewer for browsing the candidates.
# Run the cells below once everything above has finished.

# In[ ]:

# Split 1 is the dev split (see the extraction loop above).
candidates = session.query(DiseaseGene).filter(DiseaseGene.split == 1)
sv = SentenceNgramViewer(candidates, session)


# In[ ]:

# A bare expression at the end of a notebook cell displays the viewer.
sv
'articles/disease_gold_labels.tsv', dev_cands, annotator_name='gold') L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1) L_gold_dev L_dev = labeler.apply_existing(split=1) _ = gen_model.score(session, L_dev, L_gold_dev) L_dev.lf_stats(session, L_gold_dev, gen_model.learned_lf_stats()['Accuracy']) labeled = [] for c in session.query(BiomarkerCondition).filter( BiomarkerCondition.split == 1).all(): if LF_markerDatabase(c) == 1: labeled.append(c) SentenceNgramViewer(labeled, session, n_per_page=3) # Load dev labels and convert to [0, 1] range L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1) dev_labels = (np.ravel(L_gold_dev.todense()) + 1) / 2 # Feature Extraction featurizer = FeatureAnnotator() F_train = featurizer.apply(split=0) F_train F_dev = featurizer.apply_existing(split=1) F_test = featurizer.apply_existing(split=2) train_marginals = load_marginals(session, F_train, split=0) disc_model = SparseLogisticRegression()
# Candidate ids from the training dataframe, cast to int.
train_candidate_ids = train_df.candidate_id.astype(int)


# In[ ]:

# Query the first ten CompoundGene candidates whose id is among the training
# ids and hand them to the viewer.
id_in_training_set = CompoundGene.id.in_(train_candidate_ids)
candidates = (
    session.query(CompoundGene)
    .filter(id_in_training_set)
    .limit(10)
    .offset(0)
)
sv = SentenceNgramViewer(candidates, session)


# In[ ]:

# A bare expression at the end of a notebook cell displays the viewer.
sv


# In[ ]:

# Fetch the candidate currently selected in the viewer and display it.
c = sv.get_selected()
c
plot_pr_curve(
    gen_model_history_df,
    dev_data.curated_dsh,
    gen_model_history_df.columns,
    "PR Curve of Generative Models",
)


# ## Individual Candidate Error Analysis
# The cells below show candidate sentences that were labeled incorrectly.
# Which ones appear depends on which cells were executed: at the moment the
# false negatives (fn) are shown, but this could be changed to include the
# false positives (fp) as well.

# In[ ]:

from snorkel.viewer import SentenceNgramViewer

# NOTE: The CI check below exists only so the viewer is not opened during
# automated testing of this notebook. You should ignore it!
import os

sv = (
    None
    if 'CI' in os.environ
    else SentenceNgramViewer(indep_results['CG_ALL'][1], session)
)


# In[ ]:

# A bare expression at the end of a notebook cell displays the viewer.
sv


# In[ ]:

# With a viewer, take its selected candidate; otherwise fall back to the first
# element of the union of fp and fn.
if sv:
    c = sv.get_selected()
else:
    c = list(fp.union(fn))[0]
c


# In[ ]:

c.labels