Esempio n. 1
0
    {"entrez_gene_id": int})
candidate_sentence_df.head(2)

# In[10]:

# Combine the edge-level table with the per-sentence candidate table, pairing
# rows that agree on both key columns. The join type is written out
# explicitly; "inner" is what pandas applies when `how` is omitted
# (assumes both operands are pandas DataFrames -- TODO confirm).
total_candidates_df = edge_level_df.merge(
    right=candidate_sentence_df,
    how="inner",
    on=["doid_id", "entrez_gene_id"],
)
total_candidates_df.head(n=2)

# In[11]:

# Take a seeded sample of 10,000 candidate ids from the rows where split == 1
# (presumably the dev partition -- confirm against where `split` is assigned),
# load the matching DiseaseGene rows through the session, and turn them into a
# sentence-level frame.
# NOTE(review): sampling without replacement raises ValueError if the split
# holds fewer than 10,000 rows.
dev_candidates = (
    session
    .query(DiseaseGene)
    .filter(
        DiseaseGene.id.in_(
            total_candidates_df
            .query("split==1")
            .sample(n=10000, random_state=100)["candidate_id"]
            .tolist()
        )
    )
    .all()
)
dev_df = make_sentence_df(dev_candidates)
dev_df.head(n=2)

# In[12]:

# Take a seeded sample of 10,000 candidate ids from the rows where split == 2
# (presumably the test partition -- confirm against where `split` is assigned),
# load the matching DiseaseGene rows through the session, and turn them into a
# sentence-level frame.
# NOTE(review): sampling without replacement raises ValueError if the split
# holds fewer than 10,000 rows.
test_candidates = (
    session
    .query(DiseaseGene)
    .filter(
        DiseaseGene.id.in_(
            total_candidates_df
            .query("split==2")
            .sample(n=10000, random_state=120)["candidate_id"]
            .tolist()
        )
    )
    .all()
)
test_df = make_sentence_df(test_candidates)
test_df.head(n=2)

# In[13]:

#write_candidates_to_excel(dev_df, "../data/sentences/sentence_labels_dev.xlsx")
#write_candidates_to_excel(test_df, "../data/sentences/sentence_labels_test.xlsx")
Esempio n. 2
0
    'dev': 'data/compound_disease/sentence_labels_dev.xlsx'
}


# In[ ]:


# For every (SQL statement, spreadsheet path) pair: run the statement, keep the
# first column of each returned row as a candidate id, load the matching
# CandidateClass rows, and write their sentence frame to the spreadsheet.
# NOTE(review): the pairing is purely positional -- it relies on
# `sql_statements` being in the same order as the values of
# `spreadsheet_names`, and `zip` stops silently at the shorter of the two.
for sql, spreadsheet_name in zip(sql_statements, spreadsheet_names.values()):
    target_cids = [row[0] for row in session.execute(sql)]
    candidates = session.query(CandidateClass).filter(
        CandidateClass.id.in_(target_cids)
    ).all()
    candidate_df = make_sentence_df(candidates)
    write_candidates_to_excel(candidate_df, spreadsheet_name)


# # Develop Label Functions

# ## Look at potential Candidates

# Use this to look at loaded candidates from a given set. The constants represent the index used to retrieve the appropriate set. Ideally, this is where one can look at a subset of the candidates and develop label functions for candidate labeling.

# In[ ]:


# Load the spreadsheet registered under the 'train' key of `spreadsheet_names`
# and preview its first two rows.
train_df = pd.read_excel(io=spreadsheet_names["train"])
train_df.head(n=2)