# Imports for the snippets below. The snorkel paths follow the v0.6-style
# utilities this project uses; adjust them to your installed version.
import numpy as np
import pandas as pd
from gensim.models import FastText
from tqdm import tqdm_notebook

from snorkel.learning.disc_models.rnn.utils import (
    SymbolTable, candidate_to_tokens, mark_sentence)

# `session`, the `DiseaseGene`/`GeneGene` candidate classes, and
# `total_candidates_df` are assumed to be defined earlier in the notebook.


def generate_embedded_df(candidates):
    """Build a dataframe of word indices, one row per candidate sentence."""
    # Wrap the two entity spans with positional markers (1 and 2) before
    # converting each token sequence to embedding indices.
    words_to_embed = [(mark_sentence(
        candidate_to_tokens(cand),
        [(cand[0].get_word_start(), cand[0].get_word_end(), 1),
         (cand[1].get_word_start(), cand[1].get_word_end(), 2)]), cand.id)
                      for cand in tqdm_notebook(candidates)]
    embed_df = pd.DataFrame(
        list(map(lambda x: embed_word_to_index(x[0]), words_to_embed)))
    embed_df['candidate_id'] = list(map(lambda x: x[1], words_to_embed))
    embed_df['sen_length'] = list(map(lambda x: len(x[0]), words_to_embed))
    return embed_df
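

# `embed_word_to_index` is defined elsewhere in the project. Below is a
# hypothetical fallback, so this script can run standalone, that indexes
# tokens with a SymbolTable; the project's real helper may differ.
try:
    embed_word_to_index
except NameError:
    _embed_table = SymbolTable()

    def embed_word_to_index(words):
        # Map each marked token to a vocabulary index, extending the table.
        return [_embed_table.get(w) for w in words]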


def tag_sentence(x):
    """Return the marked sentence string for each candidate referenced in `x`."""
    candidates = (session.query(DiseaseGene).filter(
        DiseaseGene.id.in_(x.candidate_id.astype(int).tolist())).all())
    tagged_sen = [
        " ".join(
            mark_sentence(
                candidate_to_tokens(cand),
                [(cand[0].get_word_start(), cand[0].get_word_end(), 1),
                 (cand[1].get_word_start(), cand[1].get_word_end(), 2)]))
        for cand in candidates
    ]

    return tagged_sen
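

# In[ ]:

# Usage sketch for `tag_sentence`: it accepts anything carrying a
# `candidate_id` column, e.g. a slice of the candidates dataframe.
# Illustrative only.
sample_tagged = tag_sentence(total_candidates_df.head(5))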


# A method from the project's RNN model class (hence `self`), dedented here so
# the script parses standalone.
def _preprocess_data(self, candidates, extend=False):
    """Convert candidate sentences to lookup sequences.

    :param candidates: candidates to process
    :param extend: extend symbol table for tokens (train), or lookup (test)?
    """
    if not hasattr(self, 'word_dict'):
        self.word_dict = SymbolTable()
    data = []
    for candidate in candidates:
        # Mark each argument span with its 1-based position index.
        args = [(candidate[i].get_word_start(),
                 candidate[i].get_word_end(), i + 1)
                for i in range(len(candidate))]
        s = mark_sentence(candidate_to_tokens(candidate), args)
        # Either extend the word table (train) or retrieve indices from it (test).
        f = self.word_dict.get if extend else self.word_dict.lookup
        data.append(np.array(list(map(f, s))))

    return data
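

# In[ ]:

# Minimal illustration of the symbol-table contract above (assuming snorkel's
# SymbolTable, where `get` extends the table and `lookup` falls back to the
# unknown-word index for unseen tokens).
table = SymbolTable()
train_ids = [table.get(w) for w in ["the", "gene", "binds"]]
test_ids = [table.lookup(w) for w in ["the", "protein"]]  # unseen -> unknown id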


# In[15]:

# Pull a reproducible sample of 2.5M gene-gene candidate sentences to embed.
words_to_embed = []
candidates = (session.query(GeneGene).filter(
    GeneGene.id.in_(
        total_candidates_df.sample(
            2500000,
            random_state=100).candidate_id.astype(int).tolist())).all())

# In[ ]:

for cand in tqdm_notebook(candidates):
    # Mark both gene mentions, then collect the token list for FastText.
    args = [(cand[0].get_word_start(), cand[0].get_word_end(), 1),
            (cand[1].get_word_start(), cand[1].get_word_end(), 2)]
    words_to_embed.append(mark_sentence(candidate_to_tokens(cand), args))
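
# In[ ]:

# Peek at one marked sentence to verify the entity tags were inserted.
print(" ".join(words_to_embed[0]))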

# In[ ]:

# Train 300-dimensional skip-gram FastText vectors. Note: gensim < 4.0
# parameter names (`iter` and `size` became `epochs` and `vector_size` in 4.0).
model = FastText(words_to_embed,
                 window=2,
                 negative=10,
                 iter=50,
                 sg=1,
                 workers=4,
                 alpha=0.005,
                 size=300,
                 seed=100)
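
# In[ ]:

# Quick sanity check on the trained vectors (standard gensim API): nearest
# neighbours of the first token of the first training sentence.
print(model.wv.most_similar(words_to_embed[0][0], topn=5))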



# In[ ]:

# Code to validate that the embedded rows align with the candidates
candidate_query = (total_candidates_df.query(
    "split==0&disease_mention_count==1&gene_mention_count==1").head(
        5).candidate_id.astype(int).tolist())
# Get the candidates
candidates = session.query(DiseaseGene).filter(
    DiseaseGene.id.in_(candidate_query)).all()

# Generate the reference marked sentence (with entity tags) for each candidate
correct_words_to_embed = [
    (mark_sentence(candidate_to_tokens(cand),
                   [(cand[0].get_word_start(), cand[0].get_word_end(), 1),
                    (cand[1].get_word_start(), cand[1].get_word_end(), 2)]),
     cand.id) for cand in tqdm_notebook(candidates)
]

correct_embedded_words = list(
    map(lambda x: (embed_word_to_index(x[0]), x[1]), correct_words_to_embed))
embedded_train = generate_embedded_df(candidates)

# Confirm the dataframe contains the correct indices for each candidate; the
# last two columns (candidate_id, sen_length) are dropped before comparing.
for words in correct_embedded_words:
    test_query = embedded_train.query("candidate_id==@words[1]").fillna(
        0).values.tolist()[0][:-2]
    for pair in zip(words[0], [col for col in test_query if col != 0]):
        assert pair[0] == pair[1]