Example 1
def __init__(self, gram_length=5):
    self.counts = GramNode(None)
    self.gram_length = gram_length
    self.cleaner = Cleaner()
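
This constructor leans on two project classes, GramNode and Cleaner, that these listings never define. Below is a minimal sketch of plausible stand-ins, assuming GramNode is a trie node that counts n-gram occurrences per phrase type and Cleaner is a simple text normalizer; both bodies are assumptions, not the project's actual code.

from collections import defaultdict

class GramNode(object):
    # Hypothetical stand-in; the real implementation (including pick_best,
    # merge_into and show, used later) is not shown in these examples.
    def __init__(self, parent):
        self.parent = parent
        self.children = {}                   # token -> GramNode
        self.occurrences = defaultdict(int)  # phrase_type -> count

    def get(self, tokens):
        # walk (creating nodes as needed) one trie level per token
        if not tokens:
            return self
        child = self.children.setdefault(tokens[0], GramNode(self))
        return child.get(tokens[1:])

    def add_occurrence(self, phrase_type):
        self.occurrences[phrase_type] += 1

class Cleaner(object):
    # Hypothetical stand-in: strip whitespace and drop empty entries.
    def clean_sentences(self, sentences):
        return [s.strip() for s in sentences if s.strip()]

    def clean_phrase(self, tokens):
        return [t for t in tokens if t.strip()]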
Example 2
class Corpus(object):
    def __init__(self, gram_length=5):
        self.counts = GramNode(None)
        self.gram_length = gram_length
        self.cleaner = Cleaner()

    @staticmethod
    def tokenize_sentence(sentence):
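        # BEGIN and END are module-level lists of sentinel tokens (not shown
        # here), so this concatenation brackets the token list with markers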
        return BEGIN + nltk.word_tokenize(sentence) + END

    @staticmethod
    def deduce_phrase_type(tokens):
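        # crude heuristic: a trailing "?" means QUESTION, a short sentence
        # containing a fact-signal word means FACT, anything else DECLARATION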
        if tokens[-2] == "?" or tokens[-1] == "?": # -1 may be END
            return QUESTION
        if len(tokens) < 15 and FACT_WORDS.intersection(lower(t) for t in tokens):
            return FACT
        return DECLARATION

    def add_document(self, doc):
        sentences = self.cleaner.clean_sentences(sentence_splitter.tokenize(doc))

        for s in sentences:
            tokens = self.cleaner.clean_phrase(self.tokenize_sentence(s))

            if len(tokens) < 2:
                continue
            self.add_sentence(tokens)

    def add_sentence(self, tokens, phrase_type=None):
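        # accepts either a raw sentence string or a pre-tokenized list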
        if isinstance(tokens, str):
            tokens = self.tokenize_sentence(tokens)
        phrase_type = phrase_type or self.deduce_phrase_type(tokens)

        # pad_right=True makes every token start a gram; current NLTK spells
        # the keyword right_pad_symbol (older releases accepted pad_symbol)
        grams = list(nltk.ngrams(tokens, self.gram_length, pad_right=True, right_pad_symbol=None))
        for g in grams:
            if None in g:
                # trim off the None padding; only the real tokens count
                g = g[:g.index(None)]
            self.counts.get(g).add_occurrence(phrase_type)

    def add_sentences(self, sentences, phrase_type=None):
        for s in sentences:
            self.add_sentence(s, phrase_type)

    def add_prefixes(self, prefixes, phrase_type):
        for p in prefixes:
            tokens = self.tokenize_sentence(p)[:-1] # remove END token
            self.add_sentence(tokens, phrase_type)

    def add_suffixes(self, suffixes, phrase_type):
        for s in suffixes:
            tokens = self.tokenize_sentence(s)[1:] # remove BEGIN token
            self.add_sentence(tokens, phrase_type)

    def pick_next_token(self, previous, phrase_type):
        return self.counts.pick_best(previous, phrase_type)

    def generate_sentence(self, phrase_type, citation_name="Socrates"):
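        # seed with two BEGIN sentinels so the first real token is picked
        # with some left context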
        words = BEGIN + BEGIN
        while words[-1] != END[0] and words[-1] != EARLY_END[0]:
            # choose number of previous tokens to consider, trending towards more as our sentence grows
            context = self.gram_length - 1 - int(math.floor(random.random() ** math.log(len(words)) * (self.gram_length - 1)))
            token, node = self.pick_next_token(words[-context:], phrase_type)
            words.append(token)
        words = self.replace_citation_special(words, citation_name)
        return GeneratedSentence.for_tokens(words[1:])

    def replace_citation_special(self, phrase, name):
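        # expand each CITATION placeholder into "( name year )" tokens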
        year = random.randrange(1600, 2016)
        while CITATION[0] in phrase:
            at = phrase.index(CITATION[0])
            phrase = phrase[0:at] + ["(", name, str(year), ")"] + phrase[at + 1:]
        return phrase

    def word_set(self, node=None):
        node = node or self.counts
        all_words = set(node.children.keys()) - {BEGIN[0], END[0]} # real words only, no sentinel tokens

        for child in node.children.values():
            all_words.update(self.word_set(child))
        return all_words

    def fix_casing(self):
        all_words = self.word_set()

        # if a word only ever shows up title-cased, it is probably a name, e.g. Socrates
        no_lower = {lower(w) for w in all_words} - {w for w in all_words if islower(w)}
        self.to_lower(self.counts, no_lower)

    def to_lower(self, node, exempt):
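        # fold cased variants into their lowercase nodes, skipping words in
        # exempt (probable proper names) and keys that are already lowercase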
        for child in node.children.values():
            self.to_lower(child, exempt)

        # make a list so that we don't get tripped up while deleting/adding keys
        for key in list(node.children.keys()):
            low_key = lower(key)
            if low_key in exempt or low_key == key:
                continue

            node.children[key].merge_into(node.children[low_key])
            del node.children[key]

    def show(self):
        self.counts.show("")
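
A minimal usage sketch, assuming the collaborators referenced above (GramNode, Cleaner, GeneratedSentence, sentence_splitter and the BEGIN/END/QUESTION-style constants) are importable from the surrounding project and that NLTK's punkt tokenizer data is installed:

# Hypothetical driver code: Corpus is the only class defined on this page, so
# its collaborators must come from the surrounding project.
corpus = Corpus(gram_length=4)
corpus.add_document("Socrates was a philosopher. Was he wise? "
                    "Knowledge begins in wonder.")
corpus.fix_casing()

# QUESTION is one of the phrase-type constants used by deduce_phrase_type
print(corpus.generate_sentence(QUESTION))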