def sentence_no_ammend_data_preprocessor(data_dict):
    """Convert raw article HTML into annotated sentence records with feature vectors.

    The input is passed through a sentence-segmenting, flattening
    ``transformations.Transformer``, then through the citation,
    punctuation-repair, MetaMap and NER annotation steps (in that order), and
    finally every resulting record receives a ``'feature_vec'`` entry.

    Args:
        data_dict: the article data handed to ``Transformer.apply``.

    Returns:
        The transformed and annotated data, with ``'feature_vec'`` set on
        every entry.
    """
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    from autodiscern import transformations, annotations, model

    sentence_splitter = transformations.Transformer(
        leave_some_html=True,
        html_to_plain_text=True,
        segment_into='sentences',
        flatten=True,
        remove_newlines=False,
        annotate_html=True,
        parallelism=False,
    )
    sentences = sentence_splitter.apply(data_dict)

    # Citations come first: link detection needs the punctuation that is
    # rewritten below, and running it before MetaMap keeps MetaMap from
    # picking up medical terms that only occur inside links.
    sentences = annotations.add_inline_citations_annotations(sentences)

    # Badly encoded punctuation is replaced before MetaMap because it throws
    # off MetaMap's character indexing.
    sentences = annotations.ammed_content_replace_bad_punctuation_encoding(sentences)
    sentences = annotations.add_metamap_annotations(sentences)
    sentences = annotations.add_ner_annotations(sentences)

    sentiment_analyzer = SentimentIntensityAnalyzer()
    for sentence in sentences.values():
        sentence['feature_vec'] = model.build_remaining_feature_vector(sentence, sentiment_analyzer)

    return sentences
def test_html_to_limited_html(self):
    """``leave_some_html=True`` should yield the limited-HTML form of the fixture.

    The expected 'content' keeps h1/h2/h3 tags and anchors whose href is the
    bare domain.
    """
    # Adjacent literals concatenate to exactly the expected content string.
    limited_html = (
        '<h1>Antidepressants</h1> '
        '<h3>Antidepressants are medications primarily used for treating depression.</h3> '
        '<a href="emedtv"></a><h2>What Are Antidepressants?</h2> '
        'Antidepressants are medications used to treat <a href="emedtv">depression</a>. '
        'Some of these medications are blue. '
        '(Click <a href="emedtv">Antidepressant Uses</a> for more information on what they are used for, '
        'including possible <a href="emedtv">off-label</a> uses.) '
        '<a href="emedtv"></a><h2>Types of Antidepressants</h2> '
        'There are several types of antidepressants available to treat depression.'
    )
    self.expected_output[0]['content'] = limited_html

    result = adt.Transformer(leave_some_html=True).apply(self.test_input_1)

    self.assertEqual(result, self.expected_output)
def test_html_to_limited_html_plain_text(self):
    """Limited HTML converted to plain text should use the placeholder tokens.

    In the expected 'content', headings and links appear as
    ``thisisah1tag`` / ``thisisah2tag`` / ``thisisah3tag`` /
    ``thisisalinktagemedtv`` markers instead of markup.
    """
    # Adjacent literals concatenate to exactly the expected content string.
    plain_text = (
        "thisisah1tag Antidepressants. "
        "thisisah3tag Antidepressants are medications primarily used for treating depression. "
        "thisisalinktagemedtv thisisah2tag What Are Antidepressants? "
        "Antidepressants are medications used to treat thisisalinktagemedtv depression . "
        "Some of these medications are blue. "
        "(Click thisisalinktagemedtv Antidepressant Uses for more information on what they are used for, "
        "including possible thisisalinktagemedtv off-label uses.). "
        "thisisalinktagemedtv thisisah2tag Types of Antidepressants. "
        "There are several types of antidepressants available to treat depression."
    )
    self.expected_output[0]['content'] = plain_text

    converter = adt.Transformer(leave_some_html=True, html_to_plain_text=True)
    result = converter.apply(self.test_input_1)

    self.assertEqual(result, self.expected_output)
def test_html_to_text(self):
    """``leave_some_html=False`` should yield content with no markup or placeholders."""
    # Adjacent literals concatenate to exactly the expected content string.
    text_only = (
        "Antidepressants. "
        "Antidepressants are medications primarily used for treating depression. "
        "What Are Antidepressants? "
        "Antidepressants are medications used to treat depression. "
        "Some of these medications are blue. "
        "(Click Antidepressant Uses for more information on what they are used for, "
        "including possible off-label uses.). "
        "Types of Antidepressants. "
        "There are several types of antidepressants available to treat depression."
    )
    self.expected_output[0]['content'] = text_only

    result = adt.Transformer(leave_some_html=False).apply(self.test_input_1)

    self.assertEqual(result, self.expected_output)
def test_html_to_text_to_paragraphs(self):
    """Plain text segmented into paragraphs should give 'content' as a list of strings."""
    paragraphs = [
        "Antidepressants. ",
        "Antidepressants are medications primarily used for treating depression. ",
        "What Are Antidepressants? ",
        "Antidepressants are medications used to treat depression. Some of these medications are blue. ",
        "(Click Antidepressant Uses for more information on what they are used for, including possible off-label uses.). ",
        "Types of Antidepressants. ",
        "There are several types of antidepressants available to treat depression.",
    ]
    self.expected_output[0]['content'] = paragraphs

    paragraph_splitter = adt.Transformer(
        leave_some_html=False,
        segment_into='paragraphs',
        remove_newlines=False,
    )
    result = paragraph_splitter.apply(self.test_input_1)

    self.assertEqual(result, self.expected_output)
def test_replace_html_replaces_link_with_domain(self):
    """An ``<a>`` tag replaced with ``include_link_domains=True`` should carry the link's domain.

    No tags are kept; ``a`` is mapped to the prefix ``'thisisalinktag '`` with
    an empty suffix, and the expected text contains ``thisisalinktaggoogle``.
    """
    soup = BeautifulSoup(
        '<html><body>There is a <a href="google.com">link here</a>.</body></html>',
        features="html.parser",
    )
    replacements = {'a': ('thisisalinktag ', '')}

    # Positional arguments after the soup: tags to keep, tags to keep with
    # attributes, tag -> (prefix, suffix) replacements, default replacement.
    result = adt.Transformer().replace_html(
        soup,
        set(),
        set(),
        replacements,
        '',
        include_link_domains=True,
    )

    self.assertEqual(result, 'There is a thisisalinktaggoogle link here.')
def test_replace_html_replaces_tag(self):
    """A tag listed in the replacement map should be swapped for its prefix/suffix strings."""
    soup = BeautifulSoup(
        '<html><body><h1>Heading1</h1></body></html>',
        features="html.parser",
    )
    replacements = {'h1': ('thisisah1tag ', '. ')}

    # Positional arguments after the soup: tags to keep, tags to keep with
    # attributes, tag -> (prefix, suffix) replacements, default replacement.
    result = adt.Transformer().replace_html(
        soup,
        set(),
        set(),
        replacements,
        '',
        include_link_domains=True,
    )

    self.assertEqual(result, 'thisisah1tag Heading1. ')
def test_replace_html(self):
    """Exercise kept, kept-with-attributes, replaced and default-handled tags together.

    Per the expected string: ``h1`` is kept without its ``font`` attribute,
    ``a`` is kept with its ``href``, ``h2`` is replaced by its prefix/suffix
    strings, and the remaining tags leave no markup behind.
    """
    soup = BeautifulSoup(
        '<html><body><h1 font="Blue">Heading1</h1><h2><i>Heading2</i></h2><a href="google.com">link here</a></p></body></html>',
        features="html.parser",
    )
    keep_bare = {'h1'}
    keep_with_attributes = {'a'}
    replacements = {'h2': ('thisisanh2tag ', '.')}
    fallback = ''

    result = adt.Transformer().replace_html(
        soup,
        keep_bare,
        keep_with_attributes,
        replacements,
        fallback,
        include_link_domains=True,
    )

    self.assertEqual(
        result,
        '<h1>Heading1</h1>thisisanh2tag Heading2.<a href="google.com">link here</a>',
    )
def make_prediction(predictors: Dict, url: str) -> Dict:
    """Fetch the page at ``url``, preprocess it, and run every predictor on it.

    The page is downloaded, wrapped as a single-entry data dict (key ``0``),
    transformed to annotated plain text, given inline-citation and MetaMap
    annotations, and assigned a ``'feature_vec'``. Each predictor is then
    called on that processed record.

    Args:
        predictors: mapping whose values expose ``predict``; its keys become
            the keys of the returned dict. (The previous annotation
            ``Dict[Predictor]`` gave ``typing.Dict`` a single type argument,
            which raises ``TypeError`` when the function is defined.)
        url: address of the page to score.

    Returns:
        dict mapping each key of ``predictors`` to that predictor's output.
    """
    res = requests.get(url)
    html_page = res.content.decode("utf-8")
    data_dict = {0: {'entity_id': 0, 'content': html_page, 'url': url}}

    html_transformer = adt.Transformer(
        leave_some_html=True,
        html_to_plain_text=True,
        annotate_html=True,
        parallelism=False,
    )
    transformed_data = html_transformer.apply(data_dict)
    transformed_data = ada.add_inline_citations_annotations(transformed_data)
    # NOTE(review): `dm` is not defined inside this function; presumably a
    # module-level object - confirm.
    transformed_data = ada.add_metamap_annotations(transformed_data, dm)

    sid = SentimentIntensityAnalyzer()
    for key in data_dict:
        transformed_data[key]['feature_vec'] = adm.build_remaining_feature_vector(transformed_data[key], sid)

    # Predict on the processed record: it is the one that received the
    # annotations and the feature vector above, not the raw input entry.
    document = transformed_data[0]
    return {q: predictor.predict(document) for q, predictor in predictors.items()}
def test_html_to_limited_html_plain_text_to_sentences_flattened_annotated(self):
    """Sentence-level, flattened, HTML-annotated output should match per-sentence records.

    Every expected record carries the article id and url, its position
    (``sub_id``), the sentence text, and the ``html_tags`` / ``domains`` /
    ``link_type`` annotations. Records are compared one key at a time.
    """
    article_url = 'http://depression.emedtv.com/antidepressants/antidepressants.html'

    # One row per expected sentence: (content, html_tags, domains, link_type).
    # The row position is both the sub_id and the suffix of the '0-<n>' key.
    rows = [
        ("Antidepressants.", ['h1'], [], []),
        ("Antidepressants are medications primarily used for treating depression.", ['h3'], [], []),
        ("What Are Antidepressants?", ['h2', 'a'], ['emedtv'], ['internal']),
        ("Antidepressants are medications used to treat depression .", ['a'], ['emedtv'], ['internal']),
        ("Some of these medications are blue.", [], [], []),
        ("(Click Antidepressant Uses for more information on what they are used for, including possible off-label uses.).",
         ['a'], ['emedtv', 'emedtv'], ['internal', 'internal']),
        ("Types of Antidepressants.", ['h2', 'a'], ['emedtv'], ['internal']),
        ("There are several types of antidepressants available to treat depression.", [], [], []),
    ]
    expected_output = {
        '0-{}'.format(position): {
            'id': 0,
            'sub_id': position,
            'url': article_url,
            'content': content,
            'html_tags': html_tags,
            'domains': domains,
            'link_type': link_type,
        }
        for position, (content, html_tags, domains, link_type) in enumerate(rows)
    }

    sentence_annotator = adt.Transformer(
        leave_some_html=True,
        html_to_plain_text=True,
        segment_into='sentences',
        remove_newlines=False,
        flatten=True,
        annotate_html=True,
    )
    output = sentence_annotator.apply(self.test_input_1)

    for key, expected_record in expected_output.items():
        self.assertDictEqual(output[key], expected_record)
def biobert_predict(data_dict: dict, questions, experiment_dir, question_fold_map, to_gpu, gpu_index) -> Dict:
    """
    Make an autoDiscern prediction for an article data_dict using the HEA BioBERT model.
    Includes all of the data preprocessing steps as were applied for the training of the HEA BioBERT model.

    Args:
        data_dict: dictionary of {id: sub-dict}, with sub-dictionary with keys ['url', 'content', 'id', 'responses']
        questions: iterable of question identifiers to predict. It is iterated several times, and each
            identifier is formatted into the config and state-dict paths.
        experiment_dir: directory name under 'package_data/predictors/' in the autodiscern package resources
            that holds the saved configs and state dicts.
        question_fold_map: mapping of question -> fold, used to select the state-dict directory per question.
        to_gpu: forwarded to embed_sentences and run_predict.
        gpu_index: forwarded to embed_sentences and run_predict.

    Returns:
        autodiscern predictions for the article: the dict returned by run_predict, with an
        'attended_sentences' entry added for every question in it.
    """
    check_for_non_git_files(check_metamap=False, check_biobert=True)

    working_dir = 'predict'
    model_path_within_pkg_resources = 'package_data/predictors/{}'.format(experiment_dir)
    experiment_model_dir = pkg_resources.resource_filename('autodiscern', model_path_within_pkg_resources)
    vocab_path_within_pkg_resources = 'package_data/pytorch_biobert/bert-base-cased-vocab.txt'
    vocab_path = pkg_resources.resource_filename('autodiscern', vocab_path_within_pkg_resources)
    processor_config = {
        'tokenizer_max_sent_len': 300,
        'label_cutoff': 3,
        'label_avgmethod': 'round_mean',
    }
    # TODO: change this to a tempdir
    sents_embed_dir = pkg_resources.resource_filename('autodiscern', 'package_data/pytorch_biobert')
    bert_config = {'bert_train_flag': False, 'bert_all_output': False}
    state_dict_path_form = 'train_validation/question_{}/fold_{}/model_statedict/'
    config_path_form = 'test/question_{}/fold_0/config/'
    # The BioBERT model is loaded with to_gpu=False regardless of the to_gpu argument.
    default_device = get_device(to_gpu=False)
    # ---
    q_partitions = create_prediction_qdoc_partitions(questions, question_fold_map)

    # run data processing
    # USED "2019-05-02_15-49-09_a0745f9_sent_level_MM.pkl"
    html_to_sentence_transformer = adt.Transformer(
        leave_some_html=True,
        html_to_plain_text=True,
        segment_into='sentences',
        flatten=True,
        remove_newlines=False,  # in newer version
        annotate_html=True,
        parallelism=False)
    transformed_data = html_to_sentence_transformer.apply(data_dict)

    # load BERT model
    pytorch_dump_path = pkg_resources.resource_filename('autodiscern', 'package_data/pytorch_biobert')
    bert_for_pretrain = load_biobert_model(pytorch_dump_path, default_device)
    bertmodel = bert_for_pretrain.bert

    processor = build_DataDictProcessor(transformed_data, vocab_path, processor_config)
    tokenizer = BertTokenizer.from_pretrained(vocab_path, do_lower_case=False)
    # generate docs data tensor from the articles i.e. instance of class DocDataTensor
    docs_data_tensor = processor.generate_doctensor_from_articles(tokenizer)

    # create q_docpartitions
    q_docpartitions = {}
    for question in questions:
        q_docpartitions.update(
            generate_docpartition_per_question(docs_data_tensor, q_partitions, question))

    # embed sentences
    print("Embedding sentences...")
    embed_sentences(docs_data_tensor, sents_embed_dir, bertmodel, bert_config, to_gpu, gpu_index)
    print(" ... Finished embedding sentences")

    # load model configs
    q_fold_config_map = {}
    for q in questions:
        config_path = os.path.join(experiment_model_dir, config_path_form.format(q))
        mconfig, options = get_saved_config(config_path)
        argmax_indx = -1
        q_fold_config_map[q] = (mconfig, options, argmax_indx)

    # load model state_dicts
    q_state_dict_path_map = {}
    for q in questions:
        state_dict_path = os.path.join(
            experiment_model_dir,
            state_dict_path_form.format(q, question_fold_map[q]))
        q_state_dict_path_map[q] = state_dict_path

    print("Running predict")
    results = run_predict(q_docpartitions, q_fold_config_map, bertmodel, q_state_dict_path_map, working_dir,
                          sents_embed_dir, question_fold_map, to_gpu, gpu_index, num_epochs=1)

    proc_articles_repr = processor.articles_repr
    # If a model ran with no attention, its 'attention_weight_map' will be {} (i.e. empty dict), so the
    # attended-sentence lookup is skipped for that question.
    for q in results:
        # Look up the config loaded for THIS question. Reading the `mconfig` variable left over from the
        # loading loop would apply the last question's config to every question (and fail with NameError
        # when `questions` is empty).
        # NOTE(review): assumes run_predict keys its results by the same question ids as q_fold_config_map
        # - confirm.
        question_mconfig = q_fold_config_map[q][0]
        if not question_mconfig['attnmodel_config']:
            results[q]['attended_sentences'] = {}
        else:
            results[q]['attended_sentences'] = identify_attended_senteces(
                results[q]['attention_weight_map'], proc_articles_repr)
    return results