def test_predict(self):
    """Check that ``fth.fasttext_predict`` writes its output file.

    An uncompressed model is fitted on ``self.train_path`` (single thread,
    ``-bucket`` option set to 1000) and then applied to ``self.test_path``.
    The test passes when a file exists at ``self.probability_path``
    afterwards.

    The model and probability files are removed in a ``finally`` block so
    that a failing step does not leave artifacts behind.
    """
    model_path = None
    try:
        model_path = fth.fasttext_fit(self.train_path, {'-bucket': 1000},
                                      self.ft_path,
                                      model_path=self.model_path, thread=1,
                                      compress_model=False)
        fth.fasttext_predict(model_path, self.test_path, self.ft_path,
                             self.probability_path)
        assert os.path.isfile(self.probability_path)
        # The unconditional os.remove(model_path) used previously failed when
        # the model file was missing; keep that check as an explicit assert.
        assert os.path.isfile(model_path)
    finally:
        # Guarded removal: a missing file must not mask the original failure.
        for artifact in (model_path, self.probability_path):
            if artifact is not None and os.path.isfile(artifact):
                os.remove(artifact)
def test_fasttext_class_probabilities(self):
    """Check the probabilities loaded from a fastText prediction file.

    An uncompressed model is fitted on ``self.train_path`` (single thread,
    ``-bucket`` option set to 1000), applied to ``self.test_path`` and the
    resulting file is read with ``fth.load_fasttext_class_probabilities``.
    The test expects 40 probabilities, the first 20 above 0.75 and the
    remaining ones below 0.25.
    NOTE(review): presumably the test file lists 20 positive examples
    followed by 20 negative ones - confirm against the fixture.

    The model and probability files are removed in a ``finally`` block so
    that a failing assertion does not leave artifacts behind.
    """
    model_path = None
    try:
        model_path = fth.fasttext_fit(self.train_path, {'-bucket': 1000},
                                      self.ft_path,
                                      model_path=self.model_path, thread=1,
                                      compress_model=False)
        fth.fasttext_predict(model_path, self.test_path, self.ft_path,
                             self.probability_path)
        probabilities = fth.load_fasttext_class_probabilities(
            self.probability_path)
        assert len(probabilities) == 40
        # Generator expressions let all() stop at the first violation
        # instead of building throwaway lists.
        assert all(x > 0.75 for x in probabilities[:20])
        assert all(x < 0.25 for x in probabilities[20:])
        # The unconditional os.remove(model_path) used previously failed when
        # the model file was missing; keep that check as an explicit assert.
        assert os.path.isfile(model_path)
    finally:
        # Guarded removal: a missing file must not mask the original failure.
        for artifact in (model_path, self.probability_path):
            if artifact is not None and os.path.isfile(artifact):
                os.remove(artifact)
'distance', 'pairs', 'key' ] del test_df['key'] del test_df['pairs'] perfomance = list() for i in range(repeats): model_file = fasttext_fit(ft_train_path, ft_params, fasttext_path, thread=ft_threads, compress_model=True, model_path=tmp_model_path + str(i), pretrained_vectors_path=pretrained_embeddings) fth.fasttext_predict(tmp_model_path + str(i) + '.ftz', ft_test_path, fasttext_path, prob_path) probabilities = fth.load_fasttext_class_probabilities(prob_path) test_df = test_df.assign(predicted=probabilities) _, tmp_file_path = tempfile.mkstemp(text=True, suffix='.gz') with gzip.open(tmp_file_path, 'wt') as test_out: test_df.to_csv(test_out, sep='\t', header=False, index=False, columns=[ 'pmid', 'paragraph', 'sentence', 'entity1', 'entity2', 'predicted' ]) val_score_dict = co_occurrence_score(matches_file_path=None,
val_df, ft_val_df = df.iloc[val_index], sentence_df.iloc[val_index] _, tmp_train_path = tempfile.mkstemp(text=True, suffix='.txt') with open(tmp_train_path, 'wt') as train_ft_out: ft_train_df.to_csv(train_ft_out, sep='\t', header=False, index=False) _, tmp_val_path = tempfile.mkstemp(text=True, suffix='.txt') with open(tmp_val_path, 'wt') as val_ft_out: ft_val_df.to_csv(val_ft_out, sep='\t', header=False, index=False) model_file = fasttext_fit(tmp_train_path, ft_params, fasttext_path, thread=ft_threads, compress_model=True, model_path=tmp_model_path, pretrained_vectors_path=pretrained_embeddings) fth.fasttext_predict(model_file, tmp_val_path, fasttext_path, prob_path) probabilities = fth.load_fasttext_class_probabilities(prob_path) val_df = val_df.assign(predicted=probabilities) _, tmp_file_path = tempfile.mkstemp(text=True, suffix='.gz') with gzip.open(tmp_file_path, 'wt') as test_out: val_df.to_csv(test_out, sep='\t', header=False, index=False, columns=['pmid', 'paragraph', 'sentence', 'entity1', 'entity2', 'predicted']) val_score_dict = co_occurrence_score(matches_file_path=None, score_file_path=tmp_file_path, entities_file=None, first_type=0, second_type=0, ignore_scores=False, silent=True, **cocoscore_params