def regex_or_list_maker(verb_list, regex=True):
    """Make a regex (or a plain list) from the list of words passed in.

    Every word in ``verb_list`` is paired with its alternative spelling
    from ``usa_convert`` (looked up in both directions) and then expanded
    with ``pattern.en.lexeme``.  ``n't`` and `` not`` are stripped from the
    returned forms, contractions are added for ``be`` and ``have``, and the
    alternative spellings of all resulting forms are added as well.

    :param verb_list: iterable of words to expand
    :param regex: when true (the default) return one case-insensitive,
        word-bounded alternation pattern; when false return the list of
        forms instead.  The original body tested a ``regex`` name that was
        never defined in the function, so it is now an explicit keyword.
    :returns: a regex string, or a sorted list of unique text strings
    """
    from dictionaries.word_transforms import usa_convert
    from pattern.en import lexeme

    # Inverse of usa_convert, so spellings can be looked up either way.
    # NOTE(review): presumably usa_convert maps non-US -> US spellings;
    # confirm against dictionaries.word_transforms.
    uk_convert = {v: k for k, v in usa_convert.items()}

    def with_both_spellings(words):
        """Return the sorted unique words plus their alternative spellings."""
        words = list(words)
        extras = [usa_convert[w] for w in words if w in usa_convert]
        extras += [uk_convert[w] for w in words if w in uk_convert]
        return sorted(set(words + extras))

    verbforms = []
    for verb in with_both_spellings(verb_list):
        verbforms.extend(
            form.replace("n't", "").replace(" not", "")
            for form in lexeme(verb)
        )
        # deal with contractions
        if verb == 'be':
            verbforms.extend([r"'m", r"'re", r"'s"])
        if verb == "have":
            verbforms.extend([r"'d", r"'s", r"'ve"])

    # The inflected forms may themselves have alternative spellings.
    verbforms = with_both_spellings(verbforms)

    # Ensure text strings.  Testing for bytes instead of calling the
    # ``unicode`` builtin gives the same result on Python 2 (where str is
    # bytes) and does not raise NameError on Python 3.
    verbforms = [
        form.decode('utf-8', 'ignore') if isinstance(form, bytes) else form
        for form in verbforms
    ]

    if not regex:
        return verbforms
    return r'(?i)\b(' + r'|'.join(verbforms) + r')\b'
def get_both_spellings(verb_list):
    """Return verb_list extended with alternative spellings.

    Each word is looked up in ``usa_convert`` and in its inverse mapping;
    any counterpart found is added.  The result is a new, sorted list of
    unique words; the list passed in is not modified.

    :param verb_list: list of words
    :returns: sorted list of the unique words and their spelling variants
    """
    from dictionaries.word_transforms import usa_convert

    # Inverse mapping, so a word can be matched whichever variant it is.
    reverse_convert = dict(
        (variant, word) for word, variant in usa_convert.items()
    )

    variants = [usa_convert[word] for word in verb_list
                if word in usa_convert]
    variants.extend(reverse_convert[word] for word in verb_list
                    if word in reverse_convert)

    return sorted(set(verb_list + variants))
def add_verb_inflections(verb_list):
    """Add verb inflections (and both spellings) to verb_list.

    For each word, the inflections are taken from the mapping returned by
    ``load_verb_data()``.  A word missing from that mapping is kept as-is
    when it is already a listed inflection of some verb or contains an
    apostrophe; otherwise its forms are guessed with ``find_lexeme``.
    ``n't`` and `` not`` are stripped from the forms, contractions are
    added for ``be`` and ``have``, and finally the alternative spelling of
    every collected form (via ``usa_convert``, in both directions) is added.

    :param verb_list: iterable of words
    :returns: sorted list of unique text strings
    """
    from dictionaries.word_transforms import usa_convert
    from dictionaries.process_types import find_lexeme

    # Inverse of usa_convert, so spellings can be looked up either way.
    uk_convert = {v: k for k, v in usa_convert.items()}

    # get lexemes
    lexemes = load_verb_data()

    # Every non-empty inflection of every known verb.  A set keeps the
    # membership test in the loop below constant-time.
    allverbs = {
        form
        for form_list in lexemes.values()
        for form in form_list
        if form
    }

    verbforms = []
    # use dict first
    for w in verb_list:
        verbforms.append(w)
        try:
            wforms = lexemes[w]
        except KeyError:
            # if not in dict, if it's an inflection, forget it
            if w in allverbs or "'" in w:
                continue
            # if it's a coinage, guess
            wforms = find_lexeme(w)

        # unique forms, with negation markers removed
        verbforms.extend({
            form.replace("n't", "").replace(" not", "")
            for form in wforms
            if form
        })

        # deal with contractions
        if w == 'be':
            verbforms.extend([r"'m", r"'re", r"'s"])
        if w == "have":
            verbforms.extend([r"'d", r"'s", r"'ve"])

    # go over again, and add both possible spellings
    to_add = [usa_convert[w] for w in verbforms if w in usa_convert]
    to_add += [uk_convert[w] for w in verbforms if w in uk_convert]
    verbforms = sorted(set(verbforms + to_add))

    # Ensure text strings.  Testing for bytes instead of calling the
    # ``unicode`` builtin gives the same result on Python 2 (where str is
    # bytes) and does not raise NameError on Python 3.
    return [
        w.decode('utf-8', 'ignore') if isinstance(w, bytes) else w
        for w in verbforms
    ]
def add_verb_inflections(verb_list):
    """Expand verb_list with inflected forms and alternative spellings.

    Inflections come from the mapping returned by ``load_verb_data()``.
    Words absent from that mapping are left uninflected if they are
    already a listed inflection or contain an apostrophe, and are passed
    to ``find_lexeme`` otherwise.  ``n't`` / `` not`` are removed from the
    forms, contractions are appended for ``be`` and ``have``, and every
    collected form is looked up in ``usa_convert`` (both directions) to add
    its other spelling.

    :param verb_list: iterable of words
    :returns: new sorted list of the unique forms
    """
    from dictionaries.word_transforms import usa_convert
    reverse_convert = dict(
        (variant, word) for word, variant in usa_convert.items()
    )
    from dictionaries.process_types import find_lexeme

    lexemes = load_verb_data()

    # All non-empty forms listed anywhere in the verb data.
    known_forms = set()
    for form_list in lexemes.values():
        known_forms.update(form for form in form_list if form)

    collected = []
    for verb in verb_list:
        collected.append(verb)
        try:
            raw_forms = lexemes[verb]
        except KeyError:
            # Unknown as a base verb: skip inflections and contractions,
            # otherwise treat it as a coinage and guess its forms.
            if verb in known_forms:
                continue
            if "'" in verb:
                continue
            raw_forms = find_lexeme(verb)

        cleaned = set()
        for form in raw_forms:
            if form:
                cleaned.add(form.replace("n't", "").replace(" not", ""))
        collected += list(cleaned)

        # Contracted forms for the two auxiliaries.
        if verb == 'be':
            collected += [r"'m", r"'re", r"'s"]
        if verb == "have":
            collected += [r"'d", r"'s", r"'ve"]

    # Second pass: add the other spelling of every collected form.
    respelled = [usa_convert[form] for form in collected
                 if form in usa_convert]
    respelled.extend(reverse_convert[form] for form in collected
                     if form in reverse_convert)

    return sorted(set(collected + respelled))