Ejemplo n.º 1
0
def run():
    """Train a ``Model`` on two inaugural addresses and classify typed input.

    Every item returned by ``inaugural.sents`` for ``2009-Obama.txt`` is
    labelled ``'Obama'`` and every item for ``2017-Trump.txt`` is labelled
    ``'Trump'``.  A ``Model`` is built from the combined labelled data and
    trained, after which the function repeatedly prompts for a string,
    tokenizes it and prints whatever ``model.test_doc`` returns for it.

    The prompt loop ends cleanly when standard input is exhausted (EOF)
    instead of escaping with an ``EOFError`` traceback.
    """
    # Dataset generation: pair every sentence with its speaker's label.
    cls = ['Obama', 'Trump']
    obama_sentences = inaugural.sents('2009-Obama.txt')
    trump_sentences = inaugural.sents('2017-Trump.txt')
    labelled_obama = [(s, cls[0]) for s in obama_sentences]
    labelled_trump = [(s, cls[1]) for s in trump_sentences]
    labelled_data = labelled_obama + labelled_trump

    model = Model(labelled_data, cls)
    model.train()

    while True:
        try:
            inp = input("Input a string to test: ")
        except EOFError:
            # Ctrl-D or a closed/piped stdin: nothing more to classify.
            break
        doc = tokenize(inp)
        print(model.test_doc(doc))
Ejemplo n.º 2
0
def words_per_user_mapper(status_update):
    """Yield ``(username, (word, 1))`` for each word in a status update.

    ``status_update`` is indexed by the keys ``"username"`` and ``"text"``;
    the words are whatever ``tokenize`` produces for the text.
    """
    author = status_update["username"]
    text = status_update["text"]
    yield from ((author, (token, 1)) for token in tokenize(text))
Ejemplo n.º 3
0
def word_count_old(documents):
    """Count words across all documents without using MapReduce.

    Returns a ``Counter`` keyed by each word that ``tokenize`` produces,
    accumulated over every document in ``documents``.
    """
    counts = Counter()
    for doc in documents:
        for token in tokenize(doc):
            counts[token] += 1
    return counts
Ejemplo n.º 4
0
def wc_mapper(document):
    """Yield a ``(word, 1)`` pair for each word tokenize() finds in the document."""
    yield from ((token, 1) for token in tokenize(document))
Ejemplo n.º 5
0
def wc_mapper(document):
    """Yield ``(word, 1)`` for each word naive_bayes.tokenize() gives for the doc."""
    tokens = naive_bayes.tokenize(document)
    for token in tokens:
        pair = (token, 1)
        yield pair
def words_per_user_mapper(status_update):
    """Emit one ``(username, (word, 1))`` pair per word of the update's text.

    Reads ``status_update["username"]`` and ``status_update["text"]``; the
    text is split into words by ``tokenize``.
    """
    username = status_update["username"]
    tokens = tokenize(status_update["text"])
    for token in tokens:
        tally = (token, 1)
        yield (username, tally)
def word_count_old(documents):
    """Tally words over every document directly, without MapReduce.

    Each document is passed through ``tokenize`` and every resulting word
    increments its entry in the returned ``Counter``.
    """
    tally = Counter()
    for document in documents:
        tokens = tokenize(document)
        for token in tokens:
            tally[token] += 1
    return tally
def wc_mapper(document):
    """Emit ``(word, 1)`` once for each word tokenize() returns for the document."""
    tokens = tokenize(document)
    yield from ((token, 1) for token in tokens)
 def test_tokenize(self):
     """Check that tokenize lower-cases the text and returns its distinct words as a set."""
     expected = {'hello', 'it\'s', 'time', 'for', '1000s', 'of'}
     actual = naive_bayes.tokenize("Hello it's time for 1000s of HELLO")
     self.assertEqual(expected, actual)
def word_count_mapper(document):
    """Yield a ``(word, 1)`` pair for each word tokenize() produces for the document."""
    yield from ((token, 1) for token in tokenize(document))