Example #1
0
 def test_predict(self):
     """Check that predicting with a freshly fitted model writes the probability file.

     A model is fitted on ``self.train_path`` (uncompressed, one thread,
     ``-bucket`` set to 1000) and then used to predict ``self.test_path``.
     The test passes when a file exists at ``self.probability_path``
     afterwards.
     """
     model_path = fth.fasttext_fit(self.train_path, {'-bucket': 1000},
                                   self.ft_path,
                                   model_path=self.model_path,
                                   thread=1,
                                   compress_model=False)
     try:
         fth.fasttext_predict(model_path, self.test_path, self.ft_path,
                              self.probability_path)
         assert os.path.isfile(self.probability_path)
     finally:
         # Always clean up, even when prediction or the assertion fails:
         # a stale probability file left behind by a failed run would let
         # the isfile() check above pass spuriously on the next run.
         for artifact in (model_path, self.probability_path):
             # Guard the removal so a missing file does not raise here and
             # hide the original failure.
             if os.path.isfile(artifact):
                 os.remove(artifact)
Example #2
0
 def test_fasttext_class_probabilities(self):
     """Check the class probabilities loaded back from a prediction run.

     A model is fitted on ``self.train_path`` (uncompressed, one thread,
     ``-bucket`` set to 1000), ``self.test_path`` is predicted, and the
     resulting probability file is loaded. The test expects 40
     probabilities: the first 20 above 0.75 and the remaining 20 below
     0.25.
     """
     model_path = fth.fasttext_fit(self.train_path, {'-bucket': 1000},
                                   self.ft_path,
                                   model_path=self.model_path,
                                   thread=1,
                                   compress_model=False)
     try:
         fth.fasttext_predict(model_path, self.test_path, self.ft_path,
                              self.probability_path)
         probabilities = fth.load_fasttext_class_probabilities(
             self.probability_path)
         assert len(probabilities) == 40
         # Generator expressions avoid building throwaway lists.
         assert all(x > 0.75 for x in probabilities[:20])
         assert all(x < 0.25 for x in probabilities[20:])
     finally:
         # Always clean up, even when an assertion fails, so leftover
         # files at the shared model/probability paths cannot influence
         # other tests.
         for artifact in (model_path, self.probability_path):
             # Guard the removal so a missing file does not raise here and
             # hide the original failure.
             if os.path.isfile(artifact):
                 os.remove(artifact)
Example #3
0
    'distance', 'pairs', 'key'
]
del test_df['key']
del test_df['pairs']

perfomance = list()
for i in range(repeats):
    model_file = fasttext_fit(ft_train_path,
                              ft_params,
                              fasttext_path,
                              thread=ft_threads,
                              compress_model=True,
                              model_path=tmp_model_path + str(i),
                              pretrained_vectors_path=pretrained_embeddings)

    fth.fasttext_predict(tmp_model_path + str(i) + '.ftz', ft_test_path,
                         fasttext_path, prob_path)
    probabilities = fth.load_fasttext_class_probabilities(prob_path)
    test_df = test_df.assign(predicted=probabilities)

    _, tmp_file_path = tempfile.mkstemp(text=True, suffix='.gz')
    with gzip.open(tmp_file_path, 'wt') as test_out:
        test_df.to_csv(test_out,
                       sep='\t',
                       header=False,
                       index=False,
                       columns=[
                           'pmid', 'paragraph', 'sentence', 'entity1',
                           'entity2', 'predicted'
                       ])

    val_score_dict = co_occurrence_score(matches_file_path=None,
    val_df, ft_val_df = df.iloc[val_index], sentence_df.iloc[val_index]

    _, tmp_train_path = tempfile.mkstemp(text=True, suffix='.txt')
    with open(tmp_train_path, 'wt') as train_ft_out:
        ft_train_df.to_csv(train_ft_out, sep='\t', header=False, index=False)

    _, tmp_val_path = tempfile.mkstemp(text=True, suffix='.txt')
    with open(tmp_val_path, 'wt') as val_ft_out:
        ft_val_df.to_csv(val_ft_out, sep='\t', header=False, index=False)

    model_file = fasttext_fit(tmp_train_path, ft_params, fasttext_path, thread=ft_threads,
                              compress_model=True,
                              model_path=tmp_model_path,
                              pretrained_vectors_path=pretrained_embeddings)

    fth.fasttext_predict(model_file, tmp_val_path, fasttext_path, prob_path)
    probabilities = fth.load_fasttext_class_probabilities(prob_path)
    val_df = val_df.assign(predicted=probabilities)

    _, tmp_file_path = tempfile.mkstemp(text=True, suffix='.gz')
    with gzip.open(tmp_file_path, 'wt') as test_out:
        val_df.to_csv(test_out, sep='\t', header=False, index=False,
                      columns=['pmid', 'paragraph', 'sentence', 'entity1', 'entity2', 'predicted'])
    val_score_dict = co_occurrence_score(matches_file_path=None,
                                         score_file_path=tmp_file_path,
                                         entities_file=None,
                                         first_type=0,
                                         second_type=0,
                                         ignore_scores=False,
                                         silent=True,
                                         **cocoscore_params