def regex_or_list_maker(verb_list):
    """Expand verbs into all their inflected forms, as a list or a regex.

    Every verb in ``verb_list`` -- together with its alternative spelling,
    where the ``usa_convert`` table (or its inverse) has an entry for it -- is
    inflected with ``pattern.en.lexeme``. The substrings ``n't`` and `` not``
    are stripped from each form, contracted forms are added for ``be`` and
    ``have``, and alternative spellings of the resulting forms are added too.

    Args:
        verb_list: iterable of verb strings.

    Returns:
        The sorted list of unique forms as unicode text when the name
        ``regex`` is falsy; otherwise a case-insensitive, word-bounded
        regular expression string that matches any of the forms.
    """
    from dictionaries.word_transforms import usa_convert
    from pattern.en import lexeme

    uk_convert = {v: k for k, v in usa_convert.items()}

    def _with_both_spellings(words):
        """Return the sorted unique words plus their alternative spellings."""
        words = list(words)
        # Test membership on the dicts themselves: on Python 2 ``d.keys()``
        # builds a new list, which makes every lookup a linear scan.
        variants = [usa_convert[w] for w in words if w in usa_convert]
        variants += [uk_convert[w] for w in words if w in uk_convert]
        return sorted(set(words + variants))

    verbforms = []
    for verb in _with_both_spellings(verb_list):
        verbforms.extend(
            form.replace("n't", "").replace(" not", "")
            for form in lexeme(verb)
        )
        # add the contracted forms explicitly
        if verb == 'be':
            verbforms.extend([r"'m", r"'re", r"'s"])
        if verb == 'have':
            verbforms.extend([r"'d", r"'s", r"'ve"])

    verbforms = _with_both_spellings(verbforms)

    # Ensure every form is text. ``unicode`` only exists on Python 2; on
    # Python 3 ``str`` accepts the same (bytes, encoding, errors) arguments.
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3
    verbforms = [
        w if isinstance(w, text_type)
        else text_type(w, 'utf-8', errors='ignore')
        for w in verbforms
    ]

    # NOTE(review): ``regex`` is not a parameter or local of this function;
    # it must be supplied by an enclosing or module scope -- confirm that it
    # is defined wherever this function lives.
    if not regex:
        return verbforms
    return r'(?i)\b(' + r'|'.join(verbforms) + r')\b'
def get_both_spellings(verb_list):
    """Return ``verb_list`` extended with alternative spellings.

    Each word that is a key of ``usa_convert`` contributes the value it maps
    to, and each word that is a value of ``usa_convert`` contributes the key
    that maps to it (via the inverted table).

    Args:
        verb_list: iterable of word strings.

    Returns:
        A new sorted list of the unique original words and their variants.
    """
    from dictionaries.word_transforms import usa_convert

    uk_convert = {v: k for k, v in usa_convert.items()}
    words = list(verb_list)

    # Test membership on the dicts themselves: on Python 2 ``d.keys()``
    # builds a new list, which makes every lookup a linear scan.
    variants = [usa_convert[w] for w in words if w in usa_convert]
    variants += [uk_convert[w] for w in words if w in uk_convert]

    return sorted(set(words + variants))
def convert_spell(df, convert_to='US', print_info=print_info):
    """Rewrite a dataframe's column labels in US or UK spelling.

    Args:
        df: dataframe whose ``columns`` are relabelled in place.
        convert_to: ``'UK'`` looks labels up in the inverted ``usa_convert``
            table; any other value uses ``usa_convert`` as-is.
        print_info: when truthy, print a progress message first. The default
            is the value of the module-level ``print_info`` name at the time
            this function is defined.

    Returns:
        The same ``df`` object, with labels that have an entry in the table
        replaced and all other labels left untouched.
    """
    from dictionaries.word_transforms import usa_convert

    if print_info:
        # The parenthesised single-argument form prints the same text under
        # both the Python 2 print statement and the Python 3 print function.
        print('Converting spelling ... \n')

    if convert_to == 'UK':
        mapping = {v: k for k, v in usa_convert.items()}
    else:
        mapping = usa_convert

    # ``dict.get`` keeps labels without an entry, replacing the bare
    # ``except:`` that would also have swallowed unrelated errors.
    df.columns = [mapping.get(label, label) for label in list(df.columns)]
    return df
def convert_spell(df, convert_to='US', print_info=print_info):
    """Rewrite a dataframe's column labels in US or UK spelling.

    Args:
        df: dataframe whose ``columns`` are relabelled in place.
        convert_to: ``'UK'`` looks labels up in the inverted ``usa_convert``
            table; any other value uses ``usa_convert`` as-is.
        print_info: when truthy, print a progress message first. The default
            is the value of the module-level ``print_info`` name at the time
            this function is defined.

    Returns:
        The same ``df`` object, with labels that have an entry in the table
        replaced and all other labels left untouched.
    """
    from dictionaries.word_transforms import usa_convert

    if print_info:
        print('Converting spelling ... \n')

    if convert_to == 'UK':
        mapping = {v: k for k, v in usa_convert.items()}
    else:
        mapping = usa_convert

    # ``dict.get`` keeps labels without an entry, replacing the bare
    # ``except:`` that would also have swallowed unrelated errors.
    df.columns = [mapping.get(label, label) for label in list(df.columns)]
    return df
def correct_spelling(a_string):
    """Convert each ``/``-separated part of ``a_string`` to the target spelling.

    Reads two names from the surrounding scope: ``spelling`` (falsy disables
    conversion; ``'uk'`` in any letter case selects the inverted
    ``usa_convert`` table; anything else uses ``usa_convert`` as-is) and
    ``preserve_case``.

    Each part is looked up by its lower-cased form; parts without an entry
    are kept. The result is then re-cased:

    * lower-cased if the part was all lower case or ``preserve_case is False``;
    * upper-cased / title-cased if the part was upper / title case and
      ``preserve_case`` is truthy;
    * otherwise left exactly as the lookup returned it.

    Args:
        a_string: text whose parts are separated by ``/``.

    Returns:
        ``a_string`` itself when ``spelling`` is falsy, otherwise the
        converted parts re-joined with ``/``.
    """
    if not spelling:
        return a_string

    # imported only once conversion is known to be needed
    from dictionaries.word_transforms import usa_convert

    if spelling.lower() == 'uk':
        mapping = {v: k for k, v in usa_convert.items()}
    else:
        mapping = usa_convert

    converted_bits = []
    for bit in a_string.split('/'):
        converted = mapping.get(bit.lower(), bit)
        if bit.islower() or preserve_case is False:
            converted = converted.lower()
        elif bit.isupper() and preserve_case:
            converted = converted.upper()
        elif bit.istitle() and preserve_case:
            converted = converted.title()
        converted_bits.append(converted)

    return '/'.join(converted_bits)
def add_verb_inflections(verb_list):
    """Return ``verb_list`` expanded with inflections and spelling variants.

    For every verb, the inflections are taken from the table returned by
    ``load_verb_data()``. A verb missing from that table is kept but not
    expanded if it is itself one of the table's inflected forms or contains
    an apostrophe; otherwise its inflections are guessed with
    ``find_lexeme``. The substrings ``n't`` and `` not`` are stripped from
    each form, contracted forms are added for ``be`` and ``have``, and
    alternative spellings from ``usa_convert`` (both directions) are added.

    Args:
        verb_list: iterable of verb strings.

    Returns:
        A sorted list of the unique forms, as unicode text.
    """
    from dictionaries.word_transforms import usa_convert
    from dictionaries.process_types import find_lexeme

    uk_convert = {v: k for k, v in usa_convert.items()}
    lexemes = load_verb_data()

    # Every non-empty inflected form in the table, as a set so that the
    # membership test in the loop below is not a linear scan per verb.
    known_forms = {
        form for forms in lexemes.values() for form in forms if form
    }

    verbforms = []
    for verb in verb_list:
        verbforms.append(verb)
        try:
            wforms = lexemes[verb]
        except KeyError:
            # not a table entry: an inflected form or a contraction is kept
            # as-is, anything else is treated as a coinage and guessed
            if verb in known_forms or "'" in verb:
                continue
            wforms = find_lexeme(verb)
        verbforms.extend(
            form.replace("n't", "").replace(" not", "")
            for form in wforms if form
        )
        # add the contracted forms explicitly
        if verb == 'be':
            verbforms.extend([r"'m", r"'re", r"'s"])
        if verb == 'have':
            verbforms.extend([r"'d", r"'s", r"'ve"])

    # Add both possible spellings. Membership is tested on the dicts
    # themselves: on Python 2 ``d.keys()`` builds a list for every lookup.
    variants = [usa_convert[w] for w in verbforms if w in usa_convert]
    variants += [uk_convert[w] for w in verbforms if w in uk_convert]
    verbforms = sorted(set(verbforms + variants))

    # Ensure every form is text. ``unicode`` only exists on Python 2; on
    # Python 3 ``str`` accepts the same (bytes, encoding, errors) arguments.
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3
    return [
        w if isinstance(w, text_type)
        else text_type(w, 'utf-8', errors='ignore')
        for w in verbforms
    ]
def add_verb_inflections(verb_list):
    """Return ``verb_list`` expanded with inflections and spelling variants.

    For every verb, the inflections are taken from the table returned by
    ``load_verb_data()``. A verb missing from that table is kept but not
    expanded if it is itself one of the table's inflected forms or contains
    an apostrophe; otherwise its inflections are guessed with
    ``find_lexeme``. The substrings ``n't`` and `` not`` are stripped from
    each form, contracted forms are added for ``be`` and ``have``, and
    alternative spellings from ``usa_convert`` (both directions) are added.

    Args:
        verb_list: iterable of verb strings.

    Returns:
        A new sorted list of the unique forms.
    """
    from dictionaries.word_transforms import usa_convert
    from dictionaries.process_types import find_lexeme

    uk_convert = {v: k for k, v in usa_convert.items()}
    lexemes = load_verb_data()

    # Every non-empty inflected form in the table, as a set so that the
    # membership test in the loop below is not a linear scan per verb.
    known_forms = {
        form for forms in lexemes.values() for form in forms if form
    }

    verbforms = []
    for verb in verb_list:
        verbforms.append(verb)
        try:
            wforms = lexemes[verb]
        except KeyError:
            # not a table entry: an inflected form or a contraction is kept
            # as-is, anything else is treated as a coinage and guessed
            if verb in known_forms or "'" in verb:
                continue
            wforms = find_lexeme(verb)
        verbforms.extend(
            form.replace("n't", "").replace(" not", "")
            for form in wforms if form
        )
        # add the contracted forms explicitly
        if verb == 'be':
            verbforms.extend([r"'m", r"'re", r"'s"])
        if verb == 'have':
            verbforms.extend([r"'d", r"'s", r"'ve"])

    # Add both possible spellings, testing membership on the dicts directly
    # rather than on ``d.keys()``.
    variants = [usa_convert[w] for w in verbforms if w in usa_convert]
    variants += [uk_convert[w] for w in verbforms if w in uk_convert]

    # ``sorted`` already returns a fresh list, so no extra copy is needed.
    return sorted(set(verbforms + variants))