from nltk.corpus import inaugural  # requires the NLTK 'inaugural' corpus


def run():
    # Dataset generation
    cls = ['Obama', 'Trump']
    obama_sentences = inaugural.sents('2009-Obama.txt')
    trump_sentences = inaugural.sents('2017-Trump.txt')
    labelled_obama = [(s, cls[0]) for s in obama_sentences]
    labelled_trump = [(s, cls[1]) for s in trump_sentences]
    labelled_data = labelled_obama + labelled_trump

    # Pre-tokenized sample sentences from the two speeches
    trump_test = ['We', ',', 'the', 'citizens', 'of', 'America', ',', 'are', 'now',
                  'joined', 'in', 'a', 'great', 'national', 'effort', 'to', 'rebuild',
                  'our', 'country', 'and', 'restore', 'its', 'promise', 'for', 'all',
                  'of', 'our', 'people', '.']
    obama_test = ['I', 'stand', 'here', 'today', 'humbled', 'by', 'the', 'task',
                  'before', 'us', ',', 'grateful', 'for', 'the', 'trust', 'you',
                  'have', 'bestowed', ',', 'mindful', 'of', 'the', 'sacrifices',
                  'borne', 'by', 'our', 'ancestors', '.']

    # Model and tokenize are assumed to be defined elsewhere in this module
    model = Model(labelled_data, cls)
    model.train()

    # Classify user input interactively
    while True:
        inp = input("Input a string to test: ")
        doc = tokenize(inp)
        print(model.test_doc(doc))
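# The trump_test / obama_test lists above are built but never used in run();
# a hedged usage sketch that feeds them to the trained model (assuming
# Model.test_doc accepts a list of tokens, as the interactive loop suggests):
#
#     model = Model(labelled_data, cls)
#     model.train()
#     print(model.test_doc(trump_test))   # should lean 'Trump'
#     print(model.test_doc(obama_test))   # should lean 'Obama'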
def words_per_user_mapper(status_update):
    """For each word in a status update, emit (user, (word, 1))."""
    user = status_update["username"]
    for word in tokenize(status_update["text"]):
        yield (user, (word, 1))
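# A hedged sketch of a reducer that could pair with words_per_user_mapper:
# it takes one user's stream of (word, 1) pairs and yields that user's most
# frequent word. The name most_popular_word_reducer and the output shape
# (user, (word, count)) are assumptions, not taken from the snippets here.
from collections import Counter

def most_popular_word_reducer(user, words_and_counts):
    """Given (word, count) pairs for one user, yield the user's
    most frequent word together with its count."""
    word_counts = Counter()
    for word, count in words_and_counts:
        word_counts[word] += count
    word, count = word_counts.most_common(1)[0]
    yield (user, (word, count))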
from collections import Counter


def word_count_old(documents):
    """Word count not using MapReduce."""
    return Counter(word
                   for document in documents
                   for word in tokenize(document))
def wc_mapper(document): """for each word in the document, emit (word,1)""" for word in tokenize(document): yield (word, 1)
import naive_bayes


def wc_mapper(document):
    """for each word in the doc, emit (word, 1)"""
    for word in naive_bayes.tokenize(document):
        yield (word, 1)
def test_tokenize(self):
    self.assertEqual({'hello', "it's", 'time', 'for', '1000s', 'of'},
                     naive_bayes.tokenize("Hello it's time for 1000s of HELLO"))
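# The test above pins down the behavior expected of naive_bayes.tokenize:
# lowercase the text and return the *set* of word-like tokens, keeping
# digits and apostrophes. A minimal sketch consistent with that test;
# the exact regex is an assumption:
import re

def tokenize(message):
    """Lowercase, extract word-like tokens, and de-duplicate."""
    message = message.lower()
    all_words = re.findall("[a-z0-9']+", message)  # letters, digits, apostrophes
    return set(all_words)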
def word_count_mapper(document):
    """For each word in the document, emit (word, 1)."""
    for word in tokenize(document):
        yield (word, 1)
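# word_count_mapper fits the usual generic MapReduce driver; a hedged sketch
# (the map_reduce name and signature are assumptions), reusing a reducer like
# the wc_reducer sketched earlier:
from collections import defaultdict

def map_reduce(inputs, mapper, reducer):
    """Run MapReduce on the inputs using the given mapper and reducer."""
    collector = defaultdict(list)
    for input_ in inputs:
        for key, value in mapper(input_):
            collector[key].append(value)
    return [output
            for key, values in collector.items()
            for output in reducer(key, values)]

# e.g. word_counts = map_reduce(documents, word_count_mapper, wc_reducer)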