def generate_embedded_df(candidates):
    """Build a dataframe of embedding indices, one row per candidate.

    Each candidate is tokenized with ``candidate_to_tokens`` and handed to
    ``mark_sentence`` together with the word spans of its two members
    (``cand[0]`` labelled 1, ``cand[1]`` labelled 2). The marked sentence
    is then converted with ``embed_word_to_index``.

    Args:
        candidates: iterable of candidate objects exposing ``cand[0]`` and
            ``cand[1]`` (each with ``get_word_start``/``get_word_end``)
            and ``cand.id``.

    Returns:
        A ``pd.DataFrame`` whose leading columns hold the output of
        ``embed_word_to_index`` for each candidate, followed by the
        ``candidate_id`` and ``sen_length`` columns (``sen_length`` is the
        length of the marked sentence).
    """
    marked_sentences = []
    candidate_ids = []
    for cand in tqdm_notebook(candidates):
        tokens = candidate_to_tokens(cand)
        spans = [
            (cand[0].get_word_start(), cand[0].get_word_end(), 1),
            (cand[1].get_word_start(), cand[1].get_word_end(), 2),
        ]
        marked_sentences.append(mark_sentence(tokens, spans))
        candidate_ids.append(cand.id)

    # Rows are built first so the two bookkeeping columns end up last.
    embed_df = pd.DataFrame(
        [embed_word_to_index(sentence) for sentence in marked_sentences])
    embed_df['candidate_id'] = candidate_ids
    embed_df['sen_length'] = [len(sentence) for sentence in marked_sentences]
    return embed_df
# Example no. 2
# 0
# In[15]:

words_to_embed = []

# Draw a reproducible sample (fixed random_state) of candidate ids, then
# fetch the matching GeneGene rows in a single query.
sampled_candidate_ids = (
    total_candidates_df
    .sample(2500000, random_state=100)
    .candidate_id
    .astype(int)
    .tolist()
)
candidates = (
    session.query(GeneGene)
    .filter(GeneGene.id.in_(sampled_candidate_ids))
    .all()
)

# In[ ]:

# Mark each candidate's sentence and collect the results for training.
for cand in tqdm_notebook(candidates):
    # (start, end, label) for each member: cand[0] -> 1, cand[1] -> 2.
    first_span = (cand[0].get_word_start(), cand[0].get_word_end(), 1)
    second_span = (cand[1].get_word_start(), cand[1].get_word_end(), 2)
    args = [first_span, second_span]
    marked_tokens = mark_sentence(candidate_to_tokens(cand), args)
    words_to_embed.append(marked_tokens)

# In[ ]:

# Training hyperparameters, gathered in one place and forwarded unchanged.
# NOTE(review): presumably this is gensim's FastText; if so, `iter` and
# `size` are the pre-4.0 keyword names -- confirm against the installed
# version before upgrading.
fasttext_params = dict(
    window=2,
    negative=10,
    iter=50,
    sg=1,
    workers=4,
    alpha=0.005,
    size=300,
    seed=100,
)
model = FastText(words_to_embed, **fasttext_params)

# In[19]:

# In[ ]:

# Sanity check: the rows produced by generate_embedded_df must line up with
# the candidates they were built from.

# Ids of the first five rows with split == 0 and exactly one disease mention
# and one gene mention.
single_mention_rows = total_candidates_df.query(
    "split==0&disease_mention_count==1&gene_mention_count==1")
candidate_query = single_mention_rows.head(5).candidate_id.astype(int).tolist()

# Fetch the corresponding candidate objects.
candidates = (
    session.query(DiseaseGene)
    .filter(DiseaseGene.id.in_(candidate_query))
    .all()
)

# Recompute, independently of generate_embedded_df, the marked sentence for
# every candidate, paired with the candidate id.
correct_words_to_embed = [
    (
        mark_sentence(
            candidate_to_tokens(cand),
            [
                (cand[0].get_word_start(), cand[0].get_word_end(), 1),
                (cand[1].get_word_start(), cand[1].get_word_end(), 2),
            ],
        ),
        cand.id,
    )
    for cand in tqdm_notebook(candidates)
]

# Expected (embedding indices, candidate id) pairs.
correct_embedded_words = [
    (embed_word_to_index(sentence), cand_id)
    for sentence, cand_id in correct_words_to_embed
]

# The dataframe under test.
embedded_train = generate_embedded_df(candidates)

# Confirm the dataframe contains the correct rows for each candidate.
# Each entry of correct_embedded_words is (expected indices, candidate id).
for words in correct_embedded_words:
    matching_rows = embedded_train.query("candidate_id==@words[1]")
    assert not matching_rows.empty, (
        "no embedded row found for candidate {}".format(words[1]))

    # Drop the two trailing bookkeeping columns (candidate_id, sen_length)
    # so only the embedding-index columns remain.
    row = matching_rows.values.tolist()[0][:-2]
    expected = list(words[0])

    # Compare the leading columns exactly instead of zipping: zip would stop
    # at the shorter sequence and hide missing or extra tokens, and dropping
    # zero-valued cells would discard any legitimate index of 0.
    assert row[:len(expected)] == expected, (
        "embedded indices differ for candidate {}".format(words[1]))

    # Whatever follows the sentence must be padding (NaN) introduced because
    # other rows in the dataframe are longer.
    assert all(pd.isna(value) for value in row[len(expected):]), (
        "unexpected trailing values for candidate {}".format(words[1]))