def clean_text(text):
    """Run ``text`` through the cleaning pipeline and return the result.

    Falsy input (``None``, an empty string, ...) short-circuits to ``''``
    without touching any parser. Otherwise the parenthetical phrases are
    identified on the raw input, the parser stages are applied in sequence,
    and the result is passed through ``remove_stopwords`` and ``lemmatize``.
    """
    if not text:
        return ''

    # Collected from the untouched input, before any stage rewrites it;
    # the result is handed to replace_acronyms as its ``counter``.
    phrase_counter = identify_parenthetical_phrases()(text)

    # Every stage is instantiated up front and applied in this order.
    pipeline = (
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=phrase_counter, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
    )

    cleaned = text
    for stage in pipeline:
        cleaned = stage(cleaned)

    return lemmatize(remove_stopwords(cleaned))
Example #2
0
def process_list(docList):
    """Apply every dictionary in ``dict_loc`` to each document in ``docList``.

    For each document key, every (prefix, path) pair in ``dict_loc`` is used
    to run ``replace_from_dictionary`` on the document, followed by
    ``update_term_index``. The first element of the returned pair replaces
    the document text in ``docList`` (in place) and the second is merged into
    the module-level ``mainIndex``. A timing line is printed per document.

    Returns ``None``.
    """
    for position, (doc_key, _) in enumerate(docList.items(), start=1):
        began = time.time()

        # One empty bucket per dictionary prefix, nested under this document.
        inDict = {doc_key: {vocab: [] for vocab in dict_loc}}

        for prefix, dict_path in dict_loc.items():
            replacer = replace_from_dictionary(prefix=prefix, f_dict=dict_path)
            docList[doc_key] = replacer(docList[doc_key])

            indexer = update_term_index(
                vocab_key=prefix, dict_in=inDict, doc_key=doc_key
            )
            new_text, index_delta = indexer(doc=docList[doc_key])
            docList[doc_key] = new_text
            mainIndex.update(index_delta)

        elapsed = time.time() - began
        print("Record", str(position), " of", len(docList),
              " - took ", round(elapsed, 3), "seconds")
 def setup_class(cls):
     """Create the MeSH replacer and store it on the class.

     The lexicon path is derived from the relative path
     ``nlpre/dictionaries``, so it depends on the current working directory.
     """
     # dirname(abspath('nlpre/dictionaries')) is the absolute 'nlpre' folder.
     package_root = os.path.dirname(os.path.abspath('nlpre/dictionaries'))
     lexicon_path = os.path.join(
         package_root, "dictionaries/", 'MeSH_two_word_lexicon.csv'
     )
     cls.replace_MeSH = replace_from_dictionary(lexicon_path, prefix='MeSH_')
Example #4
0
    def custom_dictionary_test(self):
        """ Replace phrases using a dictionary file given by path. """
        replacer = replace_from_dictionary("tests/custom_dict.csv")
        source = "That person was two faced."
        expected = "That person was two_faced."

        assert_equal(expected, replacer(source))
    def default_dictionary_test(self):
        ''' Use the default dictionary if one is missing. '''
        # No dictionary file is passed, only the prefix.
        MeSH = replace_from_dictionary(prefix='MeSH_')

        doc = '0-beta-Hydroxyethylrutoside is great'
        doc_right = 'MeSH_Hydroxyethylrutoside is great'
        # Call the replacer built above rather than a class-level fixture,
        # so this test actually covers the no-dictionary construction.
        doc_new = MeSH(doc)

        assert_equal(doc_right, doc_new)
Example #6
0
    def default_dictionary_test(self):
        """ Use the default dictionary if one is missing. """
        # No dictionary file is passed, only the prefix.
        MeSH = replace_from_dictionary(prefix="MeSH_")

        doc = "0-beta-Hydroxyethylrutoside is great"
        doc_right = "MeSH_Hydroxyethylrutoside is great"
        # Call the replacer built above rather than a class-level fixture,
        # so this test actually covers the no-dictionary construction.
        doc_new = MeSH(doc)

        assert_equal(doc_right, doc_new)
Example #7
0
 def setup_class(cls):
     """Build a ``MeSH_``-prefixed replacer from ``f_MeSH`` and keep it on the class."""
     mesh_replacer = replace_from_dictionary(f_MeSH, prefix="MeSH_")
     cls.replace_MeSH = mesh_replacer