Example #1
0
def sentence_no_ammend_data_preprocessor(data_dict):
    """Turn article HTML into annotated sentence entries, each with a feature vector.

    Steps, in order: HTML -> flattened sentence entries (with HTML annotations),
    inline-citation annotations, replacement of badly encoded punctuation,
    MetaMap annotations, NER annotations, and finally a 'feature_vec' per entry.

    Args:
        data_dict: article data passed straight to `Transformer.apply`.

    Returns:
        The transformed data, with 'feature_vec' set on every entry.
    """
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    from autodiscern import transformations, annotations, model

    transformer_settings = dict(
        leave_some_html=True,
        html_to_plain_text=True,
        segment_into='sentences',
        flatten=True,
        remove_newlines=False,
        annotate_html=True,
        parallelism=False,
    )
    sentence_transformer = transformations.Transformer(**transformer_settings)
    sentences = sentence_transformer.apply(data_dict)

    # Links come first: link identification needs the punctuation that is
    # replaced below, and running MetaMap earlier would pick up medical terms
    # that only appear inside links.
    with_citations = annotations.add_inline_citations_annotations(sentences)

    # Badly encoded punctuation throws off MetaMap's character indexing, so it
    # is replaced before MetaMap runs.
    cleaned = annotations.ammed_content_replace_bad_punctuation_encoding(with_citations)
    with_metamap = annotations.add_metamap_annotations(cleaned)

    annotated = annotations.add_ner_annotations(with_metamap)

    sentiment_analyzer = SentimentIntensityAnalyzer()

    for entry in annotated.values():
        entry['feature_vec'] = model.build_remaining_feature_vector(entry, sentiment_analyzer)

    return annotated
Example #2
0
    def test_html_to_limited_html(self):
        """Transformer(leave_some_html=True) should yield the limited-HTML content below."""
        limited_html = """<h1>Antidepressants</h1> <h3>Antidepressants are medications primarily used for treating depression.</h3> <a href="emedtv"></a><h2>What Are Antidepressants?</h2> Antidepressants are medications used to treat <a href="emedtv">depression</a>. Some of these medications are blue. (Click <a href="emedtv">Antidepressant Uses</a> for more information on what they are used for, including possible <a href="emedtv">off-label</a> uses.) <a href="emedtv"></a><h2>Types of Antidepressants</h2> There are several types of antidepressants available to treat depression."""
        limited_html_transformer = adt.Transformer(leave_some_html=True)
        article = self.test_input_1

        # Only the 'content' of the shared expected fixture differs per test.
        self.expected_output[0]['content'] = limited_html

        actual = limited_html_transformer.apply(article)
        self.assertEqual(actual, self.expected_output)
Example #3
0
    def test_html_to_limited_html_plain_text(self):
        """With html_to_plain_text=True the expected content carries placeholder tokens, not tags."""
        tokenized_text = """thisisah1tag Antidepressants. thisisah3tag Antidepressants are medications primarily used for treating depression. thisisalinktagemedtv thisisah2tag What Are Antidepressants? Antidepressants are medications used to treat thisisalinktagemedtv depression . Some of these medications are blue. (Click thisisalinktagemedtv Antidepressant Uses for more information on what they are used for, including possible thisisalinktagemedtv off-label uses.). thisisalinktagemedtv thisisah2tag Types of Antidepressants. There are several types of antidepressants available to treat depression."""
        plain_text_transformer = adt.Transformer(
            leave_some_html=True,
            html_to_plain_text=True,
        )
        article = self.test_input_1

        # Only the 'content' of the shared expected fixture differs per test.
        self.expected_output[0]['content'] = tokenized_text

        actual = plain_text_transformer.apply(article)
        self.assertEqual(actual, self.expected_output)
Example #4
0
    def test_html_to_text(self):
        """Transformer(leave_some_html=False) should yield the tag-free text below."""
        plain_text = """Antidepressants. Antidepressants are medications primarily used for treating depression. What Are Antidepressants? Antidepressants are medications used to treat depression. Some of these medications are blue. (Click Antidepressant Uses for more information on what they are used for, including possible off-label uses.). Types of Antidepressants. There are several types of antidepressants available to treat depression."""
        text_transformer = adt.Transformer(leave_some_html=False)
        article = self.test_input_1

        # Only the 'content' of the shared expected fixture differs per test.
        self.expected_output[0]['content'] = plain_text

        actual = text_transformer.apply(article)
        self.assertEqual(actual, self.expected_output)
Example #5
0
    def test_html_to_text_to_paragraphs(self):
        """Segmenting into paragraphs should turn 'content' into the list of strings below."""
        paragraphs = [
            "Antidepressants. ",
            "Antidepressants are medications primarily used for treating depression. ",
            "What Are Antidepressants? ",
            "Antidepressants are medications used to treat depression. Some of these medications are blue. ",
            "(Click Antidepressant Uses for more information on what they are used for, including possible off-label uses.). ",
            "Types of Antidepressants. ",
            "There are several types of antidepressants available to treat depression.",
        ]
        paragraph_transformer = adt.Transformer(
            leave_some_html=False,
            segment_into='paragraphs',
            remove_newlines=False,
        )
        article = self.test_input_1

        # Only the 'content' of the shared expected fixture differs per test.
        self.expected_output[0]['content'] = paragraphs

        actual = paragraph_transformer.apply(article)
        self.assertEqual(actual, self.expected_output)
Example #6
0
    def test_replace_html_replaces_link_with_domain(self):
        """replace_html with include_link_domains=True should yield the string asserted below."""
        soup = BeautifulSoup(
            '<html><body>There is a <a href="google.com">link here</a>.</body></html>',
            features="html.parser",
        )

        actual = adt.Transformer().replace_html(
            soup,
            set(),  # tags_to_keep
            set(),  # tags_to_keep_with_attr
            {'a': ('thisisalinktag ', '')},  # tags_to_replace_with_str
            '',  # default_tag_replacement_str
            include_link_domains=True,
        )

        self.assertEqual(actual, 'There is a thisisalinktaggoogle link here.')
Example #7
0
    def test_replace_html_replaces_tag(self):
        """An <h1> mapped to a (prefix, suffix) pair should be wrapped by those strings."""
        soup = BeautifulSoup(
            '<html><body><h1>Heading1</h1></body></html>',
            features="html.parser",
        )

        actual = adt.Transformer().replace_html(
            soup,
            set(),  # tags_to_keep
            set(),  # tags_to_keep_with_attr
            {'h1': ('thisisah1tag ', '. ')},  # tags_to_replace_with_str
            '',  # default_tag_replacement_str
            include_link_domains=True,
        )

        self.assertEqual(actual, 'thisisah1tag Heading1. ')
Example #8
0
    def test_replace_html(self):
        """Exercise kept tags, kept-with-attributes tags and replaced tags in one document."""
        soup = BeautifulSoup(
            '<html><body><h1 font="Blue">Heading1</h1><h2><i>Heading2</i></h2><a href="google.com">link here</a></p></body></html>',
            features="html.parser",
        )
        # h1 is kept without its attributes, the link keeps its href, and h2
        # is swapped for the prefix/suffix strings.
        wanted = '<h1>Heading1</h1>thisisanh2tag Heading2.<a href="google.com">link here</a>'

        actual = adt.Transformer().replace_html(
            soup,
            {'h1'},  # tags_to_keep
            {'a'},  # tags_to_keep_with_attr
            {'h2': ('thisisanh2tag ', '.')},  # tags_to_replace_with_str
            '',  # default_tag_replacement_str
            include_link_domains=True,
        )

        self.assertEqual(actual, wanted)
Example #9
0
def make_prediction(predictors: Dict, url: str) -> Dict:
    """Download the page at `url`, preprocess it, and run every predictor on it.

    Args:
        predictors: mapping whose values expose a `predict` method (Predictor
            instances); the keys are reused as keys of the returned dict.
            Annotated as a bare `Dict` because `Dict[Predictor]` is not a
            valid subscription (Dict takes a key type and a value type) and
            raises TypeError when the function is defined.
        url: address of the HTML page to fetch.

    Returns:
        Dict mapping each key of `predictors` to that predictor's result.
    """
    res = requests.get(url)
    html_page = res.content.decode("utf-8")
    data_dict = {0: {'entity_id': 0, 'content': html_page, 'url': url}}

    html_transformer = adt.Transformer(leave_some_html=True,
                                       html_to_plain_text=True,
                                       annotate_html=True,
                                       parallelism=False
                                       )
    transformed_data = html_transformer.apply(data_dict)
    transformed_data = ada.add_inline_citations_annotations(transformed_data)
    transformed_data = ada.add_metamap_annotations(transformed_data, dm)

    sid = SentimentIntensityAnalyzer()

    # Iterate the transformed entries themselves rather than the raw input's keys.
    for key in transformed_data:
        transformed_data[key]['feature_vec'] = adm.build_remaining_feature_vector(transformed_data[key], sid)

    # Predict on the preprocessed entry: the raw `data_dict[0]` is not
    # guaranteed to carry the annotations and 'feature_vec' computed above.
    article = transformed_data[0]

    predictions = {}
    for q in predictors:
        predictions[q] = predictors[q].predict(article)

    return predictions
Example #10
0
    def test_html_to_limited_html_plain_text_to_sentences_flattened_annotated(self):
        """Flattened, annotated sentence output should match the per-sentence table below."""
        sentence_transformer = adt.Transformer(
            leave_some_html=True,
            html_to_plain_text=True,
            segment_into='sentences',
            remove_newlines=False,
            flatten=True,
            annotate_html=True,
        )
        article = self.test_input_1

        page_url = 'http://depression.emedtv.com/antidepressants/antidepressants.html'
        # One row per expected sentence: (content, html_tags, domains, link_type).
        sentence_rows = [
            ("Antidepressants.", ['h1'], [], []),
            ("Antidepressants are medications primarily used for treating depression.",
             ['h3'], [], []),
            ("What Are Antidepressants?", ['h2', 'a'], ['emedtv'], ['internal']),
            ("Antidepressants are medications used to treat depression .",
             ['a'], ['emedtv'], ['internal']),
            ("Some of these medications are blue.", [], [], []),
            ("(Click Antidepressant Uses for more information on what they are used for, including possible off-label uses.).",
             ['a'], ['emedtv', 'emedtv'], ['internal', 'internal']),
            ("Types of Antidepressants.", ['h2', 'a'], ['emedtv'], ['internal']),
            ("There are several types of antidepressants available to treat depression.",
             [], [], []),
        ]
        expected_output = {
            '0-{}'.format(sub_id): {
                'id': 0,
                'sub_id': sub_id,
                'url': page_url,
                'content': content,
                'html_tags': html_tags,
                'domains': domains,
                'link_type': link_type,
            }
            for sub_id, (content, html_tags, domains, link_type) in enumerate(sentence_rows)
        }

        actual = sentence_transformer.apply(article)
        # Compare entry by entry so a failure points at a single sentence.
        for key, expected_entry in expected_output.items():
            self.assertDictEqual(actual[key], expected_entry)
def biobert_predict(data_dict: dict, questions, experiment_dir,
                    question_fold_map, to_gpu, gpu_index) -> Dict:
    """
    Make an autoDiscern prediction for an article data_dict using the HEA BioBERT model. Includes all of the data
    preprocessing steps as were applied for the training of the HEA BioBERT model.

    Args:
        data_dict: dictionary of {id: sub-dict}, with sub-dictionary with keys ['url', 'content', 'id', 'responses']
        questions: iterable of question identifiers; each one is used to locate that question's saved config
            and state dict, and as a key into `question_fold_map`.
        experiment_dir: name of the experiment directory under 'package_data/predictors/' in the autodiscern
            package resources.
        question_fold_map: mapping of question identifier to the fold whose model state dict is loaded.
        to_gpu: forwarded to `embed_sentences` and `run_predict`.
        gpu_index: forwarded to `embed_sentences` and `run_predict`.

    Returns: autodiscern predictions for the article, as returned by `run_predict`, with an
        'attended_sentences' entry added for every question in the results.

    """
    check_for_non_git_files(check_metamap=False, check_biobert=True)

    working_dir = 'predict'
    model_path_within_pkg_resources = 'package_data/predictors/{}'.format(
        experiment_dir)
    experiment_model_dir = pkg_resources.resource_filename(
        'autodiscern', model_path_within_pkg_resources)

    vocab_path_within_pkg_resources = 'package_data/pytorch_biobert/bert-base-cased-vocab.txt'
    vocab_path = pkg_resources.resource_filename(
        'autodiscern', vocab_path_within_pkg_resources)
    processor_config = {
        'tokenizer_max_sent_len': 300,
        'label_cutoff': 3,
        'label_avgmethod': 'round_mean'
    }

    # TODO: change this to a tempdir
    sents_embed_dir = pkg_resources.resource_filename(
        'autodiscern', 'package_data/pytorch_biobert')
    bert_config = {'bert_train_flag': False, 'bert_all_output': False}

    state_dict_path_form = 'train_validation/question_{}/fold_{}/model_statedict/'
    config_path_form = 'test/question_{}/fold_0/config/'

    # The BioBERT model is always loaded on the non-GPU default device here;
    # `to_gpu` / `gpu_index` are only handed to the embedding and predict steps.
    default_device = get_device(to_gpu=False)

    # ---

    q_partitions = create_prediction_qdoc_partitions(questions,
                                                     question_fold_map)

    # run data processing
    # USED "2019-05-02_15-49-09_a0745f9_sent_level_MM.pkl"
    html_to_sentence_transformer = adt.Transformer(
        leave_some_html=True,
        html_to_plain_text=True,
        segment_into='sentences',
        flatten=True,
        remove_newlines=False,  # in newer version
        annotate_html=True,
        parallelism=False)
    transformed_data = html_to_sentence_transformer.apply(data_dict)

    # load BERT model
    pytorch_dump_path = pkg_resources.resource_filename(
        'autodiscern', 'package_data/pytorch_biobert')
    bert_for_pretrain = load_biobert_model(pytorch_dump_path, default_device)
    bertmodel = bert_for_pretrain.bert

    processor = build_DataDictProcessor(transformed_data, vocab_path,
                                        processor_config)
    tokenizer = BertTokenizer.from_pretrained(vocab_path, do_lower_case=False)

    # generate docs data tensor from the articles i.e. instance of class DocDataTensor
    docs_data_tensor = processor.generate_doctensor_from_articles(tokenizer)

    # create q_docpartitions
    q_docpartitions = {}
    for question in questions:
        q_docpartitions.update(
            generate_docpartition_per_question(docs_data_tensor, q_partitions,
                                               question))

    # embed sentences
    print("Embedding sentences...")
    embed_sentences(docs_data_tensor, sents_embed_dir, bertmodel, bert_config,
                    to_gpu, gpu_index)
    print(" ... Finished embedding sentences")

    # load model configs
    q_fold_config_map = {}
    for q in questions:
        config_path = os.path.join(experiment_model_dir,
                                   config_path_form.format(q))
        mconfig, options = get_saved_config(config_path)
        argmax_indx = -1
        q_fold_config_map[q] = (mconfig, options, argmax_indx)

    # load model state_dicts
    q_state_dict_path_map = {}
    for q in questions:
        state_dict_path = os.path.join(
            experiment_model_dir,
            state_dict_path_form.format(q, question_fold_map[q]))
        q_state_dict_path_map[q] = state_dict_path

    print("Running predict")
    results = run_predict(q_docpartitions,
                          q_fold_config_map,
                          bertmodel,
                          q_state_dict_path_map,
                          working_dir,
                          sents_embed_dir,
                          question_fold_map,
                          to_gpu,
                          gpu_index,
                          num_epochs=1)

    proc_articles_repr = processor.articles_repr
    # If a model ran with no attention, its 'attnmodel_config' is an empty dict
    # and 'attention_weight_map' will be {} as well, so there is nothing to attend to.
    for q, q_result in results.items():
        # Use this question's own saved config, not whichever config the
        # config-loading loop above happened to load last.
        q_mconfig = q_fold_config_map[q][0]
        if not q_mconfig['attnmodel_config']:
            q_result['attended_sentences'] = {}
        else:
            q_result['attended_sentences'] = identify_attended_senteces(
                q_result['attention_weight_map'], proc_articles_repr)
    return results