import math
import random

import nltk

# GramNode, Cleaner, GeneratedSentence, sentence_splitter, the token
# constants (BEGIN, END, EARLY_END, CITATION), the phrase types
# (QUESTION, FACT, DECLARATION), FACT_WORDS, and the lower()/islower()
# helpers are assumed to be defined elsewhere in this module/package.


class Corpus(object):
    """An n-gram model that counts token sequences per phrase type and
    generates sentences by walking those counts."""

    def __init__(self, gram_length=5):
        self.counts = GramNode(None)
        self.gram_length = gram_length
        self.cleaner = Cleaner()

    @staticmethod
    def tokenize_sentence(sentence):
        return BEGIN + nltk.word_tokenize(sentence) + END

    @staticmethod
    def deduce_phrase_type(tokens):
        if tokens[-2] == "?" or tokens[-1] == "?":  # -1 may be END
            return QUESTION
        if len(tokens) < 15 and FACT_WORDS.intersection({lower(t) for t in tokens}):
            return FACT
        return DECLARATION

    def add_document(self, doc):
        sentences = self.cleaner.clean_sentences(sentence_splitter.tokenize(doc))
        for s in sentences:
            tokens = self.cleaner.clean_phrase(self.tokenize_sentence(s))
            if len(tokens) < 2:
                continue
            self.add_sentence(tokens)

    def add_sentence(self, tokens, phrase_type=None):
        if isinstance(tokens, str):
            tokens = self.tokenize_sentence(tokens)
        phrase_type = phrase_type or self.deduce_phrase_type(tokens)
        # right_pad_symbol is the NLTK 3 keyword; older releases called it pad_symbol
        grams = list(nltk.ngrams(tokens, self.gram_length,
                                 pad_right=True, right_pad_symbol=None))
        for g in grams:
            if None in g:
                g = g[:g.index(None)]  # we don't need the padding!
            self.counts.get(g).add_occurrence(phrase_type)

    def add_sentences(self, sentences, phrase_type=None):
        for s in sentences:
            self.add_sentence(s, phrase_type)

    def add_prefixes(self, prefixes, phrase_type):
        for p in prefixes:
            tokens = self.tokenize_sentence(p)[:-1]  # remove END token
            self.add_sentence(tokens, phrase_type)

    def add_suffixes(self, suffixes, phrase_type):
        for s in suffixes:
            tokens = self.tokenize_sentence(s)[1:]  # remove BEGIN token
            self.add_sentence(tokens, phrase_type)

    def pick_next_token(self, previous, phrase_type):
        return self.counts.pick_best(previous, phrase_type)

    def generate_sentence(self, phrase_type, citation_name="Socrates"):
        words = BEGIN + BEGIN
        while words[-1] != END[0] and words[-1] != EARLY_END[0]:
            # choose how many previous tokens to condition on, trending
            # towards more context as the sentence grows
            context = self.gram_length - 1 - math.floor(
                random.random() ** math.log(len(words)) * (self.gram_length - 1))
            token, node = self.pick_next_token(words[-context:], phrase_type)
            words.append(token)
        words = self.replace_citation_special(words, citation_name)
        return GeneratedSentence.for_tokens(words[1:])

    def replace_citation_special(self, phrase, name):
        # expand each CITATION marker into a "(Name Year)" reference
        year = random.randrange(1600, 2016)
        while CITATION[0] in phrase:
            at = phrase.index(CITATION[0])
            phrase = phrase[:at] + ["(", name, str(year), ")"] + phrase[at + 1:]
        return phrase

    def word_set(self, node=None):
        node = node or self.counts
        all_words = set(node.children.keys()) - {BEGIN[0], END[0]}  # strings only
        for child in node.children.values():
            all_words.update(self.word_set(child))
        return all_words

    def fix_casing(self):
        all_words = self.word_set()
        # if a word only shows up title-cased, it is probably a name, e.g. Socrates
        no_lower = {lower(w) for w in all_words} - {w for w in all_words if islower(w)}
        self.to_lower(self.counts, no_lower)

    def to_lower(self, node, exempt):
        for child in node.children.values():
            self.to_lower(child, exempt)
        # make a list so that we don't get tripped up while deleting/adding keys
        for key in list(node.children.keys()):
            low_key = lower(key)
            if low_key in exempt or low_key == key:
                continue
            node.children[key].merge_into(node.children[low_key])
            del node.children[key]

    def show(self):
        self.counts.show("")
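# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of how Corpus might be driven end to end, assuming the
# helper classes and constants above are importable and NLTK's "punkt"
# tokenizer data has been downloaded. The sample text is made up.
if __name__ == "__main__":
    corpus = Corpus(gram_length=5)
    corpus.add_document(
        "Socrates was a philosopher. Was he wise? "
        "He taught that the unexamined life is not worth living."
    )
    corpus.fix_casing()  # fold title-cased variants into their lowercase keys
    print(corpus.generate_sentence(DECLARATION))
    print(corpus.generate_sentence(QUESTION, citation_name="Plato"))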