from nlpre import (
    dedash, titlecaps, decaps_text, unidecoder, separate_reference,
    url_replacement, replace_acronyms, pos_tokenizer, token_replacement,
    replace_from_dictionary, identify_parenthetical_phrases,
)

# pre_pos_blacklist, post_pos_blacklist, remove_stopwords, and lemmatize
# are assumed to be defined elsewhere in this module.
def clean_text(text):
    if not text:
        return ''

    # Count (phrase, acronym) pairs before the text is transformed
    abbreviations = identify_parenthetical_phrases()(text)
    parsers = [
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=abbreviations, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
    ]

    # Each nlpre parser is a callable; apply them to the text in order
    for parser in parsers:
        text = parser(text)

    text = remove_stopwords(text)
    text = lemmatize(text)

    return text
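Applied end to end, the pipeline can be called directly on raw text. A minimal usage sketch, assuming the blacklists and helper functions noted above are defined; the sample sentence is purely illustrative:

doc = "The EGFR (epidermal growth factor receptor) pathway was studied."
print(clean_text(doc))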
Example #2
    def setup_class(cls):

        POS_Blacklist = set((
            "connector",
            "cardinal",
            "pronoun",
            "adverb",
            "symbol",
            "verb",
            "punctuation",
            "modal_verb",
            "w_word",
        ))

        cls.parser = pos_tokenizer(POS_Blacklist)
Example #3
    def setup_class(cls):

        POS_blacklist = set(
            (
                "connector",
                "cardinal",
                "pronoun",
                "adverb",
                "symbol",
                "verb",
                "adjective",
                "punctuation",
                "possessive",
                "unknown",
            )
        )

        cls.parser = pos_tokenizer(POS_blacklist)
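This blacklist differs from the previous example: it additionally filters adjectives, possessives, and unknown tags, while leaving modal verbs and w-words in place.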
Example #4
    def cardinal_word_test(self):
        doc = "There are two phases."
        doc_right = "there be phase ."
        doc_new = pos_tokenizer(["cardinal"])(doc)

        assert_equal(doc_right, doc_new)
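The same pattern recurs in the examples below: pos_tokenizer takes a blacklist of part-of-speech classes and returns a callable that removes matching tokens from the lemmatized text, so here the cardinal "two" is dropped and "are" becomes "be".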
Example #5
    def possessive_word_test(self):
        doc = "I am Jack's complete lack of surprise"
        doc_right = "i be Jack complete lack of surprise"
        doc_new = pos_tokenizer(["possessive"])(doc)

        assert_equal(doc_right, doc_new)
Example #6
    def implied_verb_test(self):
        # snarfed is not a real word, but we are using it like a verb
        doc = "The boy snarfed the ball into the yard"
        doc_right = "the boy the ball into the yard"
        doc_new = pos_tokenizer(["verb"])(doc)

        assert_equal(doc_right, doc_new)
Example #7
    def symbol_test(self):
        doc = """I am #1."""
        doc_right = "i be 1 ."
        doc_new = pos_tokenizer(["symbol"])(doc)

        assert_equal(doc_right, doc_new)
Example #8
POS_Blacklist = ["connector","cardinal",
                 "pronoun","adverb",
                 "symbol","verb",
                 "punctuation","modal_verb","w_word"]

ABR = nlpre.identify_parenthetical_phrases()(doc2)
key0 = (('systemic', 'lupus', 'erythematosus'), 'SLE')
for n in range(50000):
    ABR[(key0[0],key0[1]+str(n))] += 1

n = 50
data=[]
for key in keys:
    if key =='pos_tokenizer':
        parser = nlpre.pos_tokenizer(POS_Blacklist)
    elif key == "replace_acronyms":
        parser = nlpre.replace_acronyms(ABR)
    else:
        parser = getattr(nlpre, key)()

    if key=='unidecoder':
        func = lambda : [parser(unicode(x)) for x in [doc2]]
    else:
        func = lambda : [parser(x) for x in [doc2]]
    cost = timeit.timeit(func, number=n) / n
    item = {'function':key, "time":cost}
    print item
    data.append(item)
df = pd.DataFrame(data)
df = df.set_index('function').sort_values('time')
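A quick way to inspect the timings once the loop has run (a sketch; sort_values ascending puts the cheapest parser first):

print(df)                   # average seconds per call, cheapest parser first
print(df['time'].idxmax())  # name of the slowest parser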
Example #9
    def w_word_test(self):
        doc = "Transcriptions that are observed."
        doc_right = "Transcription be observe ."
        doc_new = pos_tokenizer(["w_word"])(doc)

        assert_equal(doc_right, doc_new.text)
Example #10
    def pronoun_word_test(self):
        doc = "I am Jack's complete lack of surprise"
        doc_right = "be jack complete lack of surprise"
        doc_new = pos_tokenizer(["pronoun"])(doc)

        assert_equal(doc_right, doc_new.text)
Example #11
    def symbol_test(self):
        doc = '''I am #1.'''
        doc_right = "I be 1 ."
        doc_new = pos_tokenizer(["symbol"])(doc)

        assert_equal(doc_right, doc_new.text)
Example #12
    def quoted_word_test(self):
        doc = '''We find the answer is "not quite".'''
        doc_right = "We find the answer be not quite ."
        doc_new = pos_tokenizer(["quote"])(doc)

        assert_equal(doc_right, doc_new.text)
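Note that the "quote" filter strips the quotation marks themselves while keeping the quoted words: "not quite" survives in the output, but the double quotes do not.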