def extract_rule(conn, corpus_id, letter_sound_map, raw_pair):
    """Build the dogma rule object described by the (key, value) pair.

    `raw_pair` supplies the rule kind (first element) and its spec
    (second element). Returns None when the kind is not recognized.
    """
    kind = first(raw_pair)
    spec = second(raw_pair)
    # Lazy builders: only the matching rule is ever constructed.
    builders = {
        'rhyme': lambda: dogma.RhymeRule(letter_sound_map.get(spec)),
        'blank': lambda: dogma.BlankRule(),
        'alliteration': lambda: dogma.AlliterationRule(spec),
        'keyword': lambda: dogma.KeywordRule(spec, conn, corpus_id),
        'fuzzy': lambda: dogma.FuzzyKeywordRule(spec, conn, corpus_id),
        'syllables': lambda: dogma.SyllableCountRule(spec),
    }
    builder = builders.get(kind)
    return builder() if builder is not None else None
def words(sentence):
    """Return the tokens of `sentence` whose POS tag matches `word_tag_re`."""
    return [
        first(pair)
        for pair in tag(sentence)
        if match(word_tag_re, second(pair))
    ]
@lru_cache(maxsize=2056)  # NOTE(review): 2056 looks like a typo for 2048 — confirm
def words(sentence):
    """Return the tokens of `sentence` whose POS tag matches `word_tag_re`.

    Results are memoized per sentence string via lru_cache.
    """
    tagged_sentence = tag(sentence)
    # keep only (token, tag) pairs whose tag looks like a word tag
    tagged_words = filter(lambda tu: match(word_tag_re, second(tu)), tagged_sentence)
    ws = map(first, tagged_words)
    return list(ws)


def stem_sentence(sentence):
    """Return the stemmed word tokens of `sentence` as a list."""
    stemmed = map(stem_word, words(sentence))
    return list(stemmed)


# True when a (token, tag) pair carries the clause-divider tag.
is_divider = lambda tu: DIVIDER_TAG == second(tu)


def split_multiclause(sentence, tagged_sentence):
    """Split `sentence` into [first_clause, second_clause] at the first
    divider-tagged token of `tagged_sentence`; return [sentence] when no
    divider is present.
    """
    # extract the text the divider tag represents
    # NOTE(review): assumes first(find_first(...)) yields None when no pair
    # matches — confirm find_first's no-match return value.
    divider = first(find_first(is_divider, tagged_sentence))
    if divider is not None:
        # slice around the first occurrence of the divider text in the raw string
        first_clause = sentence[0:sentence.index(divider)].rstrip()
        second_clause = sentence[sentence.index(divider) + 1:].lstrip()
        return [first_clause, second_clause]
    else:
        return [sentence]


def expand_multiclauses(sentences):
    # TODO consider itertools
    # (definition continues beyond this chunk)
# NOTE(review): the two statements below are the tail of a tokenize/POS-tag
# helper whose `def` line sits above this chunk; 4-space body indent assumed.
    tokenized_words = nltk.word_tokenize(sentence_string)
    return nltk.pos_tag(tokenized_words)


# Matches tags made only of capital letters (e.g. NN, VBZ) — i.e. word-like tokens.
word_tag_re = re.compile("^[A-Z]+$")


@lru_cache(maxsize=2056)  # NOTE(review): 2056 looks like a typo for 2048 — confirm
def words(sentence):
    """Return the tokens of `sentence` whose POS tag matches `word_tag_re`.

    Results are memoized per sentence string via lru_cache.
    """
    tagged_sentence = tag(sentence)
    # keep only (token, tag) pairs whose tag looks like a word tag
    tagged_words = filter(lambda tu: match(word_tag_re, second(tu)), tagged_sentence)
    ws = map(first, tagged_words)
    return list(ws)


def stem_sentence(sentence):
    """Return the stemmed word tokens of `sentence` as a list."""
    stemmed = map(stem_word, words(sentence))
    return list(stemmed)


# True when a (token, tag) pair carries the clause-divider tag.
is_divider = lambda tu: DIVIDER_TAG == second(tu)


def split_multiclause(sentence, tagged_sentence):
    """Split `sentence` into [first_clause, second_clause] at the first
    divider-tagged token of `tagged_sentence`; return [sentence] when no
    divider is present.
    """
    # extract the text the divider tag represents
    # NOTE(review): assumes first(find_first(...)) yields None when no pair
    # matches — confirm find_first's no-match return value.
    divider = first(find_first(is_divider, tagged_sentence))
    if divider is not None:
        # slice around the first occurrence of the divider text in the raw string
        first_clause = sentence[0:sentence.index(divider)].rstrip()
        second_clause = sentence[sentence.index(divider)+1:].lstrip()
        return [first_clause, second_clause]
    else:
        return [sentence]


def expand_multiclauses(sentences):
    # TODO consider itertools
    split = []
    for sentence in sentences:
        # (loop body continues beyond this chunk)