def clean_text(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    A falsy ``text`` (``None``, empty string, ...) short-circuits to ``''``
    without building any parser.

    Otherwise the steps are, in order:

    1. ``identify_parenthetical_phrases`` is run on the raw text; its result
       is handed to ``replace_acronyms`` as ``counter``.
    2. Every parser in the pipeline is constructed up front and then applied
       one after another, each receiving the previous parser's output.
    3. ``remove_stopwords`` and then ``lemmatize`` are applied to the result.
    """
    if not text:
        return ''

    # Computed on the untouched input, before any parser rewrites it.
    phrase_counter = identify_parenthetical_phrases()(text)

    # Order matters: each stage consumes the output of the one before it.
    pipeline = [
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=phrase_counter, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
    ]

    cleaned = text
    for stage in pipeline:
        cleaned = stage(cleaned)

    return lemmatize(remove_stopwords(cleaned))
def process_list(docList):
    """Apply every dictionary in ``dict_loc`` to every document in ``docList``.

    For each ``(key, text)`` entry, and for each ``(prefix, path)`` pair in the
    module-level ``dict_loc``:

    * the text is rewritten with ``replace_from_dictionary(prefix, path)``;
    * ``update_term_index`` is called on the rewritten text; element ``0`` of
      its result replaces the stored text and element ``1`` is merged into the
      module-level ``mainIndex``.

    ``docList`` is modified in place, a progress line with the elapsed time is
    printed per document, and nothing is returned.
    """
    # NOTE: an abbreviation pre-pass (identify_parenthetical_phrases, dedash,
    # titlecaps, replace_acronyms) was commented out in the original code and
    # remains disabled here.
    for position, (doc_key, _) in enumerate(docList.items(), start=1):
        started = time.time()

        # One empty hit list per dictionary prefix, nested under this document.
        term_hits = {doc_key: {vocab: [] for vocab in dict_loc.keys()}}

        for vocab, dict_path in dict_loc.items():
            replacer = replace_from_dictionary(prefix=vocab, f_dict=dict_path)
            docList[doc_key] = replacer(docList[doc_key])

            indexer = update_term_index(
                vocab_key=vocab, dict_in=term_hits, doc_key=doc_key
            )
            outcome = indexer(doc=docList[doc_key])
            docList[doc_key] = outcome[0]
            mainIndex.update(outcome[1])

        elapsed = time.time() - started
        print(
            "Record", str(position), " of", len(docList),
            " - took ", round(elapsed, 3), "seconds",
        )
def setup_class(cls):
    """Attach a MeSH dictionary replacer to the class as ``replace_MeSH``.

    The lexicon path is ``<cwd>/nlpre/dictionaries/MeSH_two_word_lexicon.csv``:
    ``os.path.abspath`` resolves the relative path against the current working
    directory, so the result depends on where the process was started.
    """
    # dirname() strips the trailing "dictionaries" component, leaving <cwd>/nlpre.
    package_dir = os.path.dirname(os.path.abspath('nlpre/dictionaries'))
    lexicon_path = os.path.join(
        package_dir, "dictionaries/", 'MeSH_two_word_lexicon.csv'
    )
    cls.replace_MeSH = replace_from_dictionary(lexicon_path, prefix='MeSH_')
def custom_dictionary_test(self):
    """Check phrase replacement with the dictionary at tests/custom_dict.csv.

    No prefix is passed, and the expected output joins "two faced" into
    "two_faced".
    """
    replacer = replace_from_dictionary("tests/custom_dict.csv")

    source_text = "That person was two faced."
    expected = "That person was two_faced."

    actual = replacer(source_text)
    assert_equal(expected, actual)
def default_dictionary_test(self):
    ''' Use the default dictionary if one is missing. '''
    # No dictionary path is given, so the replacer must pick its own default.
    MeSH = replace_from_dictionary(prefix='MeSH_')

    doc = '0-beta-Hydroxyethylrutoside is great'
    doc_right = 'MeSH_Hydroxyethylrutoside is great'

    # Call the locally built replacer. The previous code called
    # self.replace_MeSH (built from an explicit path), which left MeSH unused
    # and never exercised the default-dictionary behaviour under test.
    doc_new = MeSH(doc)

    assert_equal(doc_right, doc_new)
def default_dictionary_test(self):
    """ Use the default dictionary if one is missing. """
    # No dictionary path is given, so the replacer must pick its own default.
    MeSH = replace_from_dictionary(prefix="MeSH_")

    doc = "0-beta-Hydroxyethylrutoside is great"
    doc_right = "MeSH_Hydroxyethylrutoside is great"

    # Call the locally built replacer. The previous code called
    # self.replace_MeSH (built from an explicit path), which left MeSH unused
    # and never exercised the default-dictionary behaviour under test.
    doc_new = MeSH(doc)

    assert_equal(doc_right, doc_new)
def setup_class(cls):
    """Attach a dictionary replacer with prefix "MeSH_" as ``cls.replace_MeSH``.

    The dictionary location comes from ``f_MeSH``, which is defined outside
    this function.
    """
    mesh_replacer = replace_from_dictionary(f_MeSH, prefix="MeSH_")
    cls.replace_MeSH = mesh_replacer