コード例 #1
0
    def preprocess_test(self):
        """Check replacement with ``preprocessed=True`` on a pre-split document.

        The acronym counter is built from the raw sentence text, while the
        replacer is applied to a newline-separated version of the same text.
        """
        raw_text = (
            "Environmental Protection Agency (EPA). government"
            " organization (GO). Health and Human Services (HHS). While"
            " EPA HHS GO different agencies")

        presplit_text = (
            "Environmental Protection Agency \nEPA \ngovernment"
            " organization \nGO \nHealth and Human Services \nHHS \nWhile"
            " EPA HHS GO different agencies")

        # Every acronym, defined or bare, should become its full phrase.
        expected = (
            "Environmental Protection Agency\nEnvironmental "
            "Protection Agency\ngovernment organization"
            "\ngovernment organization\nHealth and Human "
            "Services\nHealth and Human Services\nWhile "
            "Environmental Protection Agency Health and Human"
            " Services government organization different agencies")

        acronym_counts = self.parser(raw_text)
        expand = replace_acronyms(
            acronym_counts, preprocessed=True, underscore=False)
        actual = expand(presplit_text, acronym_counts)

        assert_equal(actual, expected)
コード例 #2
0
    def lowercase_first_letter_match_test(self):
        """Run the replacer on an acronym that begins with a lowercase letter.

        Original note: this test currently fails as siRNA isn't parsed
        correctly. No assertion is made on the replaced text here.
        """
        text = 'Small interfering RNA (siRNA) mediated depletion of EZH2.'
        acronym_counts = self.parser(text)
        expand = replace_acronyms(
            acronym_counts, prefix='ABBR', underscore=True)
        doc_new = expand(text)
コード例 #3
0
 def acronym_without_counter_test(self):
     """Check that a replacer built without a counter still expands acronyms.

     ``replace_acronyms`` is constructed with no counter argument and the
     document is passed alone, so the acronym definition has to come from
     the document itself.
     """
     doc = ("The Environmental Protection Agency (EPA) was created by "
            "Nixon. The EPA helps the environment")
     replacer = replace_acronyms(underscore=False)
     doc_new = replacer(doc)
     doc_right = (
         "The Environmental Protection Agency ( Environmental "
         "Protection Agency ) was created by Nixon .\n"
         "The Environmental Protection Agency helps the environment")

     # Without this comparison the test could never fail.
     assert_equal(doc_right, doc_new)
コード例 #4
0
    def use_most_common_test(self):
        """Expand a bare acronym using a counter built from another document.

        With ``use_most_common=True`` the acronym in the second document is
        expected to be replaced by the phrase defined in the first one.
        """
        defining_doc = (
            "The Environmental Protection Agency (EPA) was created by Nixon")
        usage_doc = "The EPA helps the environment"
        expected = "The Environmental_Protection_Agency helps the environment"

        known_acronyms = self.parser(defining_doc)
        usage_counts = self.parser(usage_doc)

        expand = replace_acronyms(known_acronyms, use_most_common=True)
        actual = expand(usage_doc, usage_counts)

        assert_equal(actual, expected)
コード例 #5
0
    def dash_test(self):
        """Check that a hyphenated phrase is expanded with the ABBR prefix.

        The expected text joins "non-Hodgkins lymphoma" into one
        underscore-separated token and leaves the later bare "Hodgkins" as is.
        """
        text = ('Im not saying its a great non-Hodgkins lymphoma (NHL). Its a '
                'good Hodgkins')
        expected = ('Im not saying its a great ABBR_non_Hodgkins_lymphoma '
                    '( ABBR_non_Hodgkins_lymphoma ) .\nIts a good Hodgkins')

        acronym_counts = self.parser(text)
        expand = replace_acronyms(
            acronym_counts, prefix='ABBR', underscore=True)
        actual = expand(text)

        assert_equal(expected, actual)
コード例 #6
0
    def different_docs_test(self):
        """Expand an acronym whose definition lives in a different document.

        The replacer is built from the first document's counter and applied
        to the second document with ``underscore=False``.
        """
        defining_doc = (
            'The Environmental Protection Agency (EPA) was created by Nixon')
        usage_doc = 'The EPA helps the environment'
        expected = 'The Environmental Protection Agency helps the environment'

        known_acronyms = self.parser(defining_doc)
        usage_counts = self.parser(usage_doc)

        expand = replace_acronyms(known_acronyms, underscore=False)
        actual = expand(usage_doc, usage_counts)

        assert_equal(actual, expected)
コード例 #7
0
    def ignore_most_common_test(self):
        """Leave a bare acronym untouched when ``use_most_common=False``.

        The acronym is only defined in the first document, so the second
        document is expected to come back unchanged.
        """
        defining_doc = (
            'The Environmental Protection Agency (EPA) was created by Nixon')
        usage_doc = 'The EPA helps the environment'
        expected = 'The EPA helps the environment'

        known_acronyms = self.parser(defining_doc)
        usage_counts = self.parser(usage_doc)

        expand = replace_acronyms(known_acronyms, use_most_common=False)
        actual = expand(usage_doc, usage_counts)

        assert_equal(actual, expected)
コード例 #8
0
    def dash_test(self):
        """Check prefixed expansion of a hyphenated phrase inside one sentence.

        "non-Hodgkins lymphoma" is expected to become a single ABBR-prefixed,
        underscore-separated token; the trailing bare "Hodgkins" is unchanged.
        """
        text = (
            "It is not a great non-Hodgkins lymphoma (NHL), but it is "
            "a good Hodgkins."
        )
        expected = (
            "It is not a great ABBR_non_Hodgkins_lymphoma "
            "( ABBR_non_Hodgkins_lymphoma ) , but it is a good Hodgkins ."
        )

        acronym_counts = self.parser(text)
        expand = replace_acronyms(
            acronym_counts, prefix="ABBR", underscore=True)
        actual = expand(text)

        assert_equal(expected, actual)
コード例 #9
0
    def tokenize_phrase_test(self):
        """Check that ``underscore=True`` joins the expanded phrase with underscores."""
        text = ("The Environmental Protection Agency (EPA) was created by "
                "Nixon. The EPA helps the environment")
        expected = (
            "The Environmental_Protection_Agency ( Environmental_"
            "Protection_Agency ) was created by Nixon .\n"
            "The Environmental_Protection_Agency helps the environment")

        acronym_counts = self.parser(text)
        expand = replace_acronyms(acronym_counts, underscore=True)
        actual = expand(text)

        assert_equal(expected, actual)
コード例 #10
0
    def almost_acronym_but_lowercase_test(self):
        """Check that a lowercase look-alike ("epa") is not expanded.

        The parenthesised "EPA" is replaced, while the lowercase "epa" in the
        second sentence is expected to remain as written.
        """
        text = ("The Environmental Protection Agency (EPA) was created by "
                "Nixon. The epa helps the environment")
        expected = (
            "The Environmental Protection Agency ( Environmental "
            "Protection Agency ) was created by Nixon .\nThe epa "
            "helps the environment")

        acronym_counts = self.parser(text)
        expand = replace_acronyms(acronym_counts, underscore=False)
        actual = expand(text, acronym_counts)

        assert_equal(expected, actual)
コード例 #11
0
    def parsing_parenthesis_test(self):
        """Check replacement when an unrelated parenthetical sits inside the phrase.

        The acronym table is supplied by hand instead of coming from the
        parser; only "(BDCPs)" is expected to be replaced.
        """
        text = ('BEACH (beige and Chediak Higashi) domain containing proteins (BDCPs) '
                'are a highly conserved protein family in eukaryotes.')
        expected = ('BEACH ( beige and Chediak Higashi ) domain containing proteins '
                    '( BEACH_domain_containing_proteins ) are a highly conserved '
                    'protein family in eukaryotes .')

        # Hand-built counter: (phrase tokens, acronym) -> occurrence count.
        phrase_tokens = ('BEACH', 'domain', 'containing', 'proteins')
        acronym_table = {(phrase_tokens, 'BDCPs'): 1}

        expand = replace_acronyms(acronym_table)
        actual = expand(text)

        assert_equal(actual, expected)
コード例 #12
0
    def counter_must_be_infered_test(self):
        """Check replacement when no per-document counter is passed at call time.

        The replacer is built from the parsed counter but invoked with the
        document only.
        """
        text = ("The Environmental Protection Agency (EPA) was created by "
                "Nixon. The EPA helps the environment")
        expected = (
            "The Environmental Protection Agency ( Environmental "
            "Protection Agency ) was created by Nixon .\n"
            "The Environmental Protection Agency helps the environment")

        acronym_counts = self.parser(text)
        expand = replace_acronyms(acronym_counts, underscore=False)
        actual = expand(text)

        assert_equal(expected, actual)
コード例 #13
0
    def acronym_in_same_doc_additional_words_test(self):
        """Check expansion when extra words precede the defining phrase.

        The expected text replaces "ADA" with "Americans with Disabilities
        Act" only; the leading "I love the" is not part of the expansion.
        """
        text = ('I love the Americans with Disabilities Act (ADA). The ADA '
                'saved my life')
        expected = ('I love the Americans with Disabilities Act ( Americans '
                    'with Disabilities Act ) .\nThe Americans with'
                    ' Disabilities Act saved my life')

        acronym_counts = self.parser(text)
        expand = replace_acronyms(acronym_counts, underscore=False)
        actual = expand(text, acronym_counts)

        assert_equal(actual, expected)
コード例 #14
0
    def tokenize_phrase_suffix_test(self):
        """Check that ``suffix="ABBR"`` is appended to each underscored expansion."""
        text = (
            "The Environmental Protection Agency (EPA) was created by "
            "Nixon. The EPA helps the environment"
        )
        expected = (
            "The Environmental_Protection_Agency_ABBR ( "
            "Environmental_Protection_Agency_ABBR ) was created by "
            "Nixon .\nThe Environmental_Protection_Agency_ABBR "
            "helps the environment"
        )

        acronym_counts = self.parser(text)
        expand = replace_acronyms(
            acronym_counts, suffix="ABBR", underscore=True)
        actual = expand(text)

        assert_equal(expected, actual)
コード例 #15
0
    def acronym_in_same_doc_test(self):
        """Check expansion when definition and later use share one document.

        The same parsed counter is used both to build the replacer and as the
        per-document counter at call time.
        """
        text = (
            "The Environmental Protection Agency (EPA) was created by "
            "Nixon. The EPA helps the environment"
        )
        expected = (
            "The Environmental Protection Agency ( Environmental "
            "Protection Agency ) was created by Nixon .\n"
            "The Environmental Protection Agency helps the environment"
        )

        acronym_counts = self.parser(text)
        expand = replace_acronyms(acronym_counts, underscore=False)
        actual = expand(text, acronym_counts)

        assert_equal(expected, actual)
コード例 #16
0
    def acronym_in_same_doc_underscore_default_test(self):
        """Check the default output format when no ``underscore`` option is given.

        The expected text uses underscore-joined phrases, which is what the
        replacer is expected to produce with its default settings.
        """
        text = ("The Environmental Protection Agency (EPA) was created by "
                "Nixon. The EPA helps the environment")
        expected = (
            "The Environmental_Protection_Agency "
            "( Environmental_Protection_Agency ) was created by "
            "Nixon .\nThe Environmental_Protection_Agency helps the "
            "environment")

        acronym_counts = self.parser(text)
        expand = replace_acronyms(acronym_counts)
        actual = expand(text, acronym_counts)

        assert_equal(expected, actual)
コード例 #17
0
    def parsing_parenthesis_test(self):
        """Check replacement when an unrelated parenthetical splits the phrase.

        A hand-built acronym table is passed to the replacer; only "(BDCPs)"
        is expected to be replaced, the first parenthetical is kept.
        """
        # Hand-built counter: (phrase tokens, acronym) -> occurrence count.
        phrase_tokens = ("BEACH", "domain", "containing", "proteins")
        acronym_table = {(phrase_tokens, "BDCPs"): 1}

        text = (
            "BEACH (beige and Chediak Higashi) domain containing proteins (BDCPs) "
            "are a highly conserved protein family in eukaryotes."
        )
        expected = (
            "BEACH ( beige and Chediak Higashi ) domain containing proteins "
            "( BEACH_domain_containing_proteins ) are a highly conserved "
            "protein family in eukaryotes ."
        )

        expand = replace_acronyms(acronym_table)
        actual = expand(text)

        assert_equal(actual, expected)
コード例 #18
0
    def duplicate_acronyms_test(self):
        """Expand "EPA" using a counter built from a document with repeated definitions.

        The first document defines "(EPA)" twice for the Environmental
        Protection Agency and mentions an Elephant Protection Agency without
        a parenthesised acronym. With ``use_most_common=True`` the second
        document is expected to get the Environmental expansion.
        """
        defining_doc = (
            'The Environmental Protection Agency (EPA) was created '
            'by Nixon. The Environmental Protection Agency (EPA) loves '
            'the tress. The less well known Elephant Protection Agency, called '
            ' the EPA, does important work as well.')

        usage_doc = 'The EPA helps the environment'
        expected = 'The Environmental_Protection_Agency helps the environment'

        known_acronyms = self.parser(defining_doc)
        usage_counts = self.parser(usage_doc)

        expand = replace_acronyms(known_acronyms, use_most_common=True)
        actual = expand(usage_doc, usage_counts)

        assert_equal(actual, expected)
コード例 #19
0
ファイル: full_test.py プロジェクト: ssameerr/NLPre
    def __init__(self):
        """Build every preprocessing stage and load the two sample documents.

        All paths are resolved relative to the current working directory.
        """
        # os.path.dirname('doc1') is the empty string, so this is simply the
        # real path of the current working directory.
        self.location = os.path.realpath(
            os.path.join(os.getcwd(), os.path.dirname('doc1')))

        # Names handed to pos_tokenizer below.
        pos_blacklist = {
            "connector",
            "cardinal",
            "pronoun",
            "adverb",
            "symbol",
            "verb",
            "punctuation",
            "modal_verb",
            "w_word",
        }

        # dirname(abspath('nlpre/dictionaries')) is <cwd>/nlpre; the
        # "dictionaries/" component is appended again when joining.
        mesh_subdir = "dictionaries/"
        package_dir = os.path.dirname(os.path.abspath('nlpre/dictionaries'))
        mesh_lexicon_path = os.path.join(
            package_dir, mesh_subdir, 'MeSH_two_word_lexicon.csv')

        self.unidecoder = unidecoder()

        self.dedash = dedash()
        self.titlecaps = titlecaps()
        self.replace_from_dict = replace_from_dictionary(mesh_lexicon_path)

        self.parenthetical = identify_parenthetical_phrases()

        self.separated_parenthesis = separated_parenthesis()
        self.token_replacement = token_replacement()
        self.decaps = decaps_text()
        self.separate_reference = separate_reference()
        self.pos_tokenizer = pos_tokenizer(pos_blacklist)

        # Load tests/doc1 and tests/doc2 into self.doc1 and self.doc2.
        for doc_name in ('doc1', 'doc2'):
            doc_path = self.location + '/tests/' + doc_name
            with codecs.open(doc_path, 'r', 'utf-8') as handle:
                setattr(self, doc_name, handle.read())

        # acronym_counter() runs only after both documents are loaded.
        self.big_counter = self.acronym_counter()

        self.replace_abbreviation = replace_acronyms(
            self.big_counter, prefix='ABBR', preprocessed=False)
コード例 #20
0
    def count_multiple_docs_test(self):
        """Expand "EPA" using the sum of counters parsed from three documents.

        Two documents define EPA as the Environmental Protection Agency and
        one as the Elephant Protection Agency; with ``use_most_common=True``
        the Environmental expansion is expected.
        """
        defining_docs = (
            "The Environmental Protection Agency (EPA) was created by Nixon",
            'The Environmental Protection Agency (EPA) loves the tress. ',
            'The less well known Elephant Protection Agency (EPA) does '
            'important work as well.',
        )
        usage_doc = 'The EPA helps the environment'
        expected = 'The Environmental_Protection_Agency helps the environment'

        first, second, third = [self.parser(doc) for doc in defining_docs]
        usage_counts = self.parser(usage_doc)

        combined = first + second + third
        expand = replace_acronyms(combined, use_most_common=True)
        actual = expand(usage_doc, usage_counts)

        assert_equal(actual, expected)
コード例 #21
0
    def parsing_misidentifed_test(self):
        """Check that an unrelated table entry for "OA" is not applied.

        Original note: we don't replace a common phrase in the ABBR set since
        use_most_common=False by default. The supplied table maps "OA" to
        "ocean acidification", and the expected text keeps "OA" as written.
        """
        text = ("The etiology of osteoarthritis (OA) is at present unknown. "
                "Primary OA utilizing the over 35 families.")

        expected = (
            "The etiology of osteoarthritis ( OA ) is at present unknown .\n"
            "Primary OA utilizing the over 35 families .")

        # NOTE(review): the parsed counter is never used below; the replacer
        # is built from the hand-written table instead.
        counter = self.parser(text)

        acronym_table = {(('ocean', 'acidification',), 'OA'): 1}
        expand = replace_acronyms(acronym_table)

        actual = expand(text)

        assert_equal(actual, expected)
コード例 #22
0
    def duplicate_acronyms_included_test(self):
        """Check that a definition inside the target document is the one used.

        The replacer's counter comes from a document defining EPA as the
        Environmental (twice) and Elephant Protection Agency, yet the target
        document defines it as the Ent Protection Agency; the expected text
        uses the Ent expansion.
        """
        background_doc = (
            'The Environmental Protection Agency (EPA) was created '
            'by Nixon. The Environmental Protection Agency (EPA) loves '
            'the trees. The less well known Elephant Protection Agency '
            '(EPA) does important work as well.')

        target_doc = ('The Ent Protection Agency (EPA) stopped Sauromon. '
                      'The EPA helps the environment')

        expected = ('The Ent Protection Agency ( Ent Protection Agency ) '
                    'stopped Sauromon .\nThe Ent Protection Agency helps '
                    'the environment')

        background_counts = self.parser(background_doc)
        target_counts = self.parser(target_doc)

        expand = replace_acronyms(background_counts, underscore=False)
        actual = expand(target_doc, target_counts)

        assert_equal(actual, expected)
コード例 #23
0
    def multiple_acronyms_same_doc_test(self):
        """Check that three acronyms defined in one document are all expanded.

        EPA, GO and HHS are each defined in the first sentence and reused in
        the second; every occurrence is expected to become its full phrase.
        """
        text = (
            "The Environmental Protection Agency (EPA) is not a government"
            " organization (GO) of Health and Human Services (HHS). While"
            " the EPA and HHS are both a GO, they are different agencies"
        )
        expected = (
            "The Environmental Protection Agency ( Environmental "
            "Protection Agency ) is not a government organization "
            "( government organization ) of Health and Human "
            "Services ( Health and Human Services ) .\nWhile the "
            "Environmental Protection Agency and Health and Human"
            " Services are both a government organization , they"
            " are different agencies"
        )

        acronym_counts = self.parser(text)
        expand = replace_acronyms(acronym_counts, underscore=False)
        actual = expand(text, acronym_counts)

        assert_equal(actual, expected)