Ejemplo n.º 1
0
    def regex_or_list_maker(verb_list):
        """Expand verbs into every inflected form and spelling variant.

        Processing steps:

        1. Add alternative spellings of each word in ``verb_list`` by looking
           it up in ``usa_convert`` in both directions (key -> value and
           value -> key).
        2. Inflect every resulting word with ``pattern.en.lexeme``, stripping
           "n't" and " not" from each returned form.
        3. Append contracted forms for 'be' and 'have'.
        4. Add alternative spellings again, this time for the inflected forms.
        5. Decode any non-unicode form as UTF-8, ignoring undecodable bytes.

        Returns the sorted, de-duplicated list of forms when ``regex`` is
        falsy; otherwise returns a case-insensitive, word-bounded alternation
        pattern string built from that list.

        NOTE: ``regex`` is not a parameter of this function; it is read from
        an enclosing scope that must define it.
        """
        from dictionaries.word_transforms import usa_convert
        from pattern.en import lexeme
        # reverse lookup table, so variants are found in both directions
        uk_convert = {v: k for k, v in usa_convert.items()}

        def both_spellings(words):
            """Return ``words`` plus their spelling variants, sorted and unique."""
            words = list(words)
            unique = set(words)
            for word in words:
                # dict membership is a hash lookup; ``in d.keys()`` builds and
                # scans a list on every iteration under Python 2
                if word in usa_convert:
                    unique.add(usa_convert[word])
                if word in uk_convert:
                    unique.add(uk_convert[word])
            return sorted(unique)

        # contracted forms that lexeme() does not supply
        contractions = {
            'be': [r"'m", r"'re", r"'s"],
            "have": [r"'d", r"'s", r"'ve"],
        }

        verbforms = []
        for verb in both_spellings(verb_list):
            # negated forms ("isn't", "do not") collapse onto the positive form
            verbforms.extend(
                form.replace("n't", "").replace(" not", "")
                for form in lexeme(verb))
            verbforms.extend(contractions.get(verb, ()))

        # inflection can produce forms that themselves have spelling variants
        verbforms = both_spellings(verbforms)

        # ensure unicode; isinstance() also accepts unicode subclasses, which
        # unicode(w, 'utf-8') would reject with a TypeError
        verbforms = [
            form if isinstance(form, unicode)
            else unicode(form, 'utf-8', errors='ignore')
            for form in verbforms
        ]

        if not regex:
            return verbforms
        return r'(?i)\b(' + r'|'.join(verbforms) + r')\b'
Ejemplo n.º 2
0
def get_both_spellings(verb_list, usa_convert=None):
    """Return verb_list extended with alternative spellings.

    Each word is looked up in ``usa_convert`` in both directions: as a key
    (adding the mapped value) and as a value (adding the corresponding key).
    Only the words originally in ``verb_list`` are looked up; variants that
    get added are not expanded further.

    Args:
        verb_list: iterable of words (any iterable, not only a list).
        usa_convert: optional mapping between spelling variants. When
            omitted, ``dictionaries.word_transforms.usa_convert`` is used.

    Returns:
        A new sorted list containing the unique original words and their
        variants. ``verb_list`` itself is not modified.
    """
    if usa_convert is None:
        # imported lazily so the project table is only needed for the default
        from dictionaries.word_transforms import usa_convert
    # reverse lookup table; if several keys share a value, the last one wins
    uk_convert = {v: k for k, v in usa_convert.items()}

    words = list(verb_list)
    spellings = set(words)
    for word in words:
        # dict membership is a hash lookup; ``in d.keys()`` builds and scans
        # a list on every iteration under Python 2
        if word in usa_convert:
            spellings.add(usa_convert[word])
        if word in uk_convert:
            spellings.add(uk_convert[word])
    return sorted(spellings)
Ejemplo n.º 3
0
def get_both_spellings(verb_list, usa_convert=None):
    """Add alternative spellings to verb_list and return the result.

    Words are matched against ``usa_convert`` both as keys and as values, so
    whichever spelling variant is supplied, its counterpart is added. Only
    the supplied words are looked up; added variants are not re-expanded.

    Args:
        verb_list: iterable of words (lists, tuples and sets all work).
        usa_convert: optional mapping between spelling variants; defaults to
            ``dictionaries.word_transforms.usa_convert``.

    Returns:
        A new sorted list of the unique words plus their variants. The input
        is left untouched.
    """
    if usa_convert is None:
        # lazy import: the project table is only required for the default
        from dictionaries.word_transforms import usa_convert
    # inverted table for value -> key lookups (last key wins on duplicates)
    uk_convert = {v: k for k, v in usa_convert.items()}

    originals = list(verb_list)
    result = set(originals)
    for table in (usa_convert, uk_convert):
        # ``w in table`` is an O(1) dict lookup, unlike ``w in table.keys()``
        # which scans a freshly built list under Python 2
        result.update(table[w] for w in originals if w in table)
    return sorted(result)
Ejemplo n.º 4
0
def add_verb_inflections(verb_list):
    """Return verb_list expanded with inflections and spelling variants.

    For every word in ``verb_list``:

    * the word itself is always kept;
    * if it is a key of the table returned by ``load_verb_data()``, the forms
      stored there are added;
    * otherwise, if it already appears as a form somewhere in that table, or
      contains an apostrophe, nothing more is added for it;
    * otherwise it is treated as a coinage and ``find_lexeme`` guesses its
      forms.

    "n't" and " not" are stripped from every form, and contracted forms are
    appended for 'be' and 'have'. Finally both spellings of every form are
    added via ``usa_convert`` (looked up in both directions).

    Returns:
        A sorted, de-duplicated list of unicode strings; byte strings are
        decoded as UTF-8 with undecodable bytes ignored.
    """
    from dictionaries.word_transforms import usa_convert
    from dictionaries.process_types import find_lexeme
    # reverse lookup table, so variants are found in both directions
    uk_convert = {v: k for k, v in usa_convert.items()}

    # get lexemes; indexed by verb below, each value iterated as its forms
    lexemes = load_verb_data()

    # every non-empty form of every known verb, kept as a set so the
    # "is this already an inflection?" test below is a hash lookup instead
    # of a scan over the whole vocabulary
    known_forms = set()
    for forms in lexemes.values():
        known_forms.update(form for form in forms if form)

    # contracted forms that the lexeme data does not supply
    contractions = {
        'be': [r"'m", r"'re", r"'s"],
        "have": [r"'d", r"'s", r"'ve"],
    }

    verbforms = []
    for verb in verb_list:
        verbforms.append(verb)
        # use dict first
        try:
            wforms = lexemes[verb]
        except KeyError:
            # not a base form: if it is an inflection or a contraction,
            # keep only the word itself
            if verb in known_forms or "'" in verb:
                continue
            # if it's a coinage, guess
            wforms = find_lexeme(verb)
        # unique forms, with negation markers stripped
        verbforms.extend(
            {form.replace("n't", "").replace(" not", "")
             for form in wforms if form})
        # deal with contractions
        verbforms.extend(contractions.get(verb, ()))

    # go over again, and add both possible spellings
    unique_forms = set(verbforms)
    for form in verbforms:
        if form in usa_convert:
            unique_forms.add(usa_convert[form])
        if form in uk_convert:
            unique_forms.add(uk_convert[form])

    # ensure unicode; isinstance() also accepts unicode subclasses, which
    # unicode(w, 'utf-8') would reject with a TypeError
    return [
        form if isinstance(form, unicode)
        else unicode(form, 'utf-8', errors='ignore')
        for form in sorted(unique_forms)
    ]
Ejemplo n.º 5
0
def add_verb_inflections(verb_list):
    """Return verb_list expanded with inflections and spelling variants.

    For every word in ``verb_list``:

    * the word itself is always kept;
    * if it is a key of the table returned by ``load_verb_data()``, the forms
      stored there are added;
    * otherwise, if it already appears as a form somewhere in that table, or
      contains an apostrophe, nothing more is added for it;
    * otherwise it is treated as a coinage and ``find_lexeme`` guesses its
      forms.

    "n't" and " not" are stripped from every form, and contracted forms are
    appended for 'be' and 'have'. Finally both spellings of every form are
    added via ``usa_convert`` (looked up in both directions).

    Returns:
        A new sorted list of the unique resulting forms.
    """
    from dictionaries.word_transforms import usa_convert
    from dictionaries.process_types import find_lexeme
    # reverse lookup table, so variants are found in both directions
    uk_convert = {v: k for k, v in usa_convert.items()}

    # get lexemes; indexed by verb below, each value iterated as its forms
    lexemes = load_verb_data()

    # every non-empty form of every known verb, kept as a set so the
    # "is this already an inflection?" test below is a hash lookup instead
    # of a scan over the whole vocabulary
    known_forms = set()
    for forms in lexemes.values():
        known_forms.update(form for form in forms if form)

    # contracted forms that the lexeme data does not supply
    contractions = {
        'be': [r"'m", r"'re", r"'s"],
        "have": [r"'d", r"'s", r"'ve"],
    }

    verbforms = []
    for verb in verb_list:
        verbforms.append(verb)
        # use dict first
        try:
            wforms = lexemes[verb]
        except KeyError:
            # not a base form: if it is an inflection or a contraction,
            # keep only the word itself
            if verb in known_forms or "'" in verb:
                continue
            # if it's a coinage, guess
            wforms = find_lexeme(verb)
        # unique forms, with negation markers stripped
        verbforms.extend(
            {form.replace("n't", "").replace(" not", "")
             for form in wforms if form})
        # deal with contractions
        verbforms.extend(contractions.get(verb, ()))

    # go over again, and add both possible spellings
    unique_forms = set(verbforms)
    for form in verbforms:
        if form in usa_convert:
            unique_forms.add(usa_convert[form])
        if form in uk_convert:
            unique_forms.add(uk_convert[form])

    # sorted() already builds a fresh list, so no further copy is needed
    return sorted(unique_forms)