Code example #1
def exercise0():
	# part a
	

	# part b
	# reference https://github.com/mikeholler/CSC499-NLP/blob/master/ch_6/exercises/improved_movie_review_classifier.py
	# by mjholler
	
	# retrieve all movie reviews in the form of (wordlist, category)
	documents = [(list(nltk.corpus.movie_reviews.words(fileid)), category)
		for category in nltk.corpus.movie_reviews.categories()
		for fileid in nltk.corpus.movie_reviews.fileids(category)
	]
	

	#all_words = nltk.FreqDist(w.lower() for w in nltk.corpus.movie_reviews.words())
	#word_features = all_words.keys()[:2000]

	# Get the top synsets in the document from the top 2000 words
	#synset_features = synsets(word_features)

	# shuffle so the held-out slice below is not drawn from a single class
	# (requires "import random" alongside nltk)
	random.shuffle(documents)

	train_set = apply_features(document_features, documents[100:])
	test_set = apply_features(document_features, documents[:100])

	print 'training classifier'
	classifier = nltk.NaiveBayesClassifier.train(train_set)

	print 'accuracy', nltk.classify.accuracy(classifier, test_set)
	print '10 most informative features:'
	classifier.show_most_informative_features(10)
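Example #1 relies on a document_features extractor that the excerpt never shows. A minimal sketch of what such a function typically looks like (modeled on chapter 6 of the NLTK book; the word_features list from the commented-out lines above is an assumption here):

def document_features(document):
    # Sketch only: word_features is assumed to hold the top corpus words,
    # as in the commented-out lines of the example above.
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features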
Code example #2
def get_labeled_examples():
    for f in filenames:
        print f,
        print labels[f]

    words = []
    for f in filenames:
        for word in words_in_lines(
                open(f).readlines()):  # sacrifice performance for memory
            words.append(word)
    all_words = FreqDist(w.lower() for w in words)
    word_features = all_words.keys()[:2000]  # 2000 most frequent words
    get_features = get_get_features(word_features)  # create feature extractor

    #pair sentences and labels
    labeled_examples = []
    for f in filenames:
        labeled_examples.extend([(ex, labels[f])
                                 for ex in open(f).readlines()])
    shuffle(labeled_examples)
    #extract features
    labeled_instances = apply_features(get_features,
                                       labeled_examples,
                                       labeled=True)  #lazy map
    return (labeled_instances, word_features)
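The get_get_features factory is not shown in this excerpt. A plausible sketch, assuming it simply returns an extractor closed over word_features (the helper's body is a guess from the call site):

def get_get_features(word_features):
    # Hypothetical reconstruction: bind word_features so apply_features can
    # call get_features(example) on each raw line.
    def get_features(example):
        example_words = set(words_in_lines([example]))
        return dict(('contains(%s)' % w, w in example_words) for w in word_features)
    return get_features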
Code example #3
def entry_reduce(key, raw_values):

    values = []
    for raw_value in raw_values:
        logging.error(raw_value)
        logging.error(type(raw_value))
        value = ast.literal_eval(raw_value)  # safer than eval for parsing literal tuples (requires "import ast")
        values.append(value)

    tweets = []
    for (words, sentiment) in values:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        logging.error(document)
        document_words = set(document)
        features = {}
        for word in word_features:
            logging.error(word)
            features['contains(%s)' % word] = (word in document_words)
        return features

    training_set = apply_features(extract_features, tweets)
    logging.error(training_set)
    classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
    setattr(classifier, 'category', key)
    yield pickle.dumps(classifier)
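The reducer yields one pickled classifier per key. A hypothetical consumer sketch (serialized_classifier stands in for one yielded value):

import pickle

classifier = pickle.loads(serialized_classifier)
print(classifier.category)  # the key attached with setattr above
# Classification still needs a feature dict shaped like extract_features built;
# NaiveBayesClassifier simply ignores feature names it never saw in training.
print(classifier.classify({'contains(happy)': True}))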
Code example #4
File: sa.py  Project: corne12345/FoDS1
def get_trained_classifier():
    df = pd.read_csv("sentiment.csv")

    clean_texts = []  # renamed from cleanTestDict: it is a list of cleaned texts, not a dict
    for text in df['text']:
        clean_texts.append(tweet_cleaner_updated(text))
    df['text_clean'] = clean_texts

    lemmer(df)
    stemmer(df)

    df_final = []
    text_type = 'lemmed'  # renamed from `type` to avoid shadowing the built-in
    global word_features
    word_features = get_word_features(get_words_in_tweets(df, text_type))
    for index, row in df.iterrows():
        if row.sentiment == 0:
            df_final.append((row['text_' + text_type], 'negative'))
        elif row.sentiment == 4:
            df_final.append((row['text_' + text_type], 'positive'))

    training_set = apply_features(extract_features, df_final)
    classifier = NaiveBayesClassifier.train(training_set)
    classifier.show_most_informative_features()
    return classifier
Code example #5
def avaliar_Sentimento(message):
    training_set = apply_features(extract_features, lista_feature_fell)
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    print("\n\tPossibilidades: %s " % classifier.labels())
    print("\n\tSentimento Provavel: %s \n" % (classifier.classify(extract_features(message))))
    #print("Accuracy : %s" % nltk.classify.util.accuracy(classifier, training_set))
    #print(extract_features(message))
    classifier.show_most_informative_features(32)  # prints its table directly; wrapping it in print would show None
Code example #6
    def train(raw_classifier, training_sets, feature_selection, transformer):
        training = []

        # Since we have a rather large amount of training data, build features
        # lazily to avoid running out of memory.
        tuple_set = [(transformer.transform(x), cl)
                     for cl in [POS, NEG]
                     for x in training_sets[cl]]
        train_set = apply_features(feature_selection.select_features, tuple_set)

        return Classifier(raw_classifier.train(train_set), feature_selection,
                transformer, len(tuple_set))
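The laziness that the comment above relies on comes from apply_features returning an NLTK LazyMap rather than a materialized list, so featuresets are only built as the trainer iterates. A minimal sketch with a toy extractor:

from nltk.classify.util import apply_features

docs = [(['good', 'film'], 'pos'), (['bad', 'plot'], 'neg')]
lazy = apply_features(lambda words: {'n_words': len(words)}, docs, labeled=True)
print(type(lazy))  # a LazyMap; extraction happens on access, not up front
print(lazy[0])     # ({'n_words': 2}, 'pos')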
Code example #7
    def analisar_frase(self, widget):
        """Função: analisar a frase que o usuário"""
        print ("analisar_frase")
        global featureList
        frase = self.text_area.get_text()
        if ( frase != ""):
            frase_proc= normalizar(frase)
            self.text_area.set_text(frase)
            if (self.opcao == 'dilma' or self.opcao == 'copa' or self.opcao == 'palmeiras' or self.opcao == 'fatec'):
                print("Opcao depois: %s "%self.opcao)

                # Build the feature list used by the extract_features method
                featureList = gera_lista_features(self.opcao)
                #print ("\n\tCaracteristicas conhecidas:\n\t%s "%(featureList))
                lista_feature_fell = get_lista_feature_fell()
                print("\n\tCaracteristica / Sentimento:\n\t %s"%lista_feature_fell)

              #  features = get_feature_list(self.opcao)
              #  print("\n\tFeatureListtt: %s "%(features))
           
                frase = self.text_area.get_text()
                frase_Normal = normalizar(frase)
                features_msg = getFeatureVector(frase_Normal)
                training_set = apply_features(extract_features,lista_feature_fell)
                self.fell = avaliar_Sentimento(features_msg,training_set)
                print ("\nFrase analisada: %s "%frase)
                print ("\n\tCaracteristicas da Msg - %s\n "%features_msg)

                language = detect_language(frase)
                print ("\n\tLingua: %s "%language)        
                if ( language == 'portuguese'):
                    print ("Sentimento: %s "%self.fell)
                    # text color
                    self.text_area.modify_text(gtk.STATE_NORMAL, gtk.gdk.color_parse("#FF0000"))
                    self.text_area2.modify_text(gtk.STATE_NORMAL, gtk.gdk.color_parse("#FF0000"))
                    # text font
                    # entry.modify_font(pango.FontDescription("monospace 16"))
                    self.text_area2.modify_font(pango.FontDescription("sans bold  16"))
                    if ( self.fell == 'negativo'):
                        frase = self.text_area.get_text()
                        self.text_area.set_text(frase)
                        self.text_area2.set_text("			 Sentimento: Negativo")
                        self.image_happy.set_from_file('../imgs/black.jpg')
                        self.image_happy.show()
                     
                    else:
                        self.text_area2.set_text("			 Sentimento: Positivo")
                        self.image_sad.set_from_file('../imgs/black.jpg')
                        self.image_sad.show()
                else:
                    self.text_area2.set_text(" Ce ta de Brincaxxon comigo ?????")     
Code example #8
def classifier_for_training_set(positive, negative, blacklist=()):  # tuple default avoids the mutable-default pitfall
    """
    Returns a Bayesian classifier for the given positive and negative sentences.
    """
    positive_feedback \
        = map(lambda s: (FreqDist(tokenize(s, blacklist)).keys(), 'positive'),
              positive)
    negative_feedback \
        = map(lambda s: (FreqDist(tokenize(s, blacklist)).keys(), 'negative'),
              negative)

    training_set = apply_features(_classifier_features_for_document,
                                  positive_feedback + negative_feedback)
    return nltk.classify.NaiveBayesClassifier.train(training_set)
Code example #9
    def train(self, text, labels):
        """
        Returns trained model and set of unique words in training data
        """
        #assert corpus

        self.corpus = self.extract_tokens(text, labels)

        self.word_features = self.get_features(self.corpus)

        train_set = apply_features(self.extract_features, self.corpus)

        self.classifier = NaiveBayesClassifier.train(train_set)

        return self.classifier, self.word_features
Code example #10
    def apply_features(self, documents, labeled=None):
        """
        Apply all feature extractor functions to the documents. This is a wrapper
        around `nltk.classify.util.apply_features`.

        If `labeled=False`, return featuresets as:
            [feature_func(doc) for doc in documents]
        If `labeled=True`, return featuresets as:
            [(feature_func(tok), label) for (tok, label) in toks]

        :param documents: a list of documents. If `labeled=True`, the method
            expects a list of (words, label) tuples.
        :rtype: LazyMap
        """
        return apply_features(self.extract_features, documents, labeled)
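A short usage sketch of the wrapper above with hypothetical documents, showing what each setting of labeled yields (analyzer stands in for an instance of the surrounding class, with extract_features already configured):

unlabeled_docs = [['great', 'movie'], ['awful', 'plot']]
labeled_docs = [(['great', 'movie'], 'pos'), (['awful', 'plot'], 'neg')]

feats = analyzer.apply_features(unlabeled_docs, labeled=False)
# -> lazy sequence of featuresets: [extract_features(doc) for doc in unlabeled_docs]
train = analyzer.apply_features(labeled_docs, labeled=True)
# -> lazy sequence of (featureset, label) pairs, ready to hand to a trainer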
Code example #11
File: sentiment_analyzer.py  Project: sahitpj/nltk
    def apply_features(self, documents, labeled=None):
        """
        Apply all feature extractor functions to the documents. This is a wrapper
        around `nltk.classify.util.apply_features`.

        If `labeled=False`, return featuresets as:
            [feature_func(doc) for doc in documents]
        If `labeled=True`, return featuresets as:
            [(feature_func(tok), label) for (tok, label) in toks]

        :param documents: a list of documents. If `labeled=True`, the method
            expects a list of (words, label) tuples.
        :rtype: LazyMap
        """
        return apply_features(self.extract_features, documents, labeled)
Code example #12
    def train(self, text, labels):
        """
        Returns trained model and set of unique words in training data
        """
        #call extract_tokens
        self.corpus = self.extract_tokens(text, labels)

        #call get_features
        self.word_features = self.get_features(self.corpus)

        #Extracting training set
        train_set = apply_features(self.extract_features, self.corpus)

        #Now train the NaiveBayesClassifier with train_set
        self.classifier = NaiveBayesClassifier.train(train_set)

        return self.classifier, self.word_features
Code example #13
    def train(self, train, test_ratio):
        """
            trains the chosen algorithm
        :param train: classification algorithm
        :param test_ratio: ratio of test documents from dataset
        :return:
        """
        dataset_dict = self.process(self.dataset_dict)

        # label each document with the corresponding class
        for class_label, documents in dataset_dict.items():
            dataset_dict[class_label] = [(document, class_label)
                                         for document in documents]

        # split the dataset into training and test data, keyed by instance type
        instances = defaultdict(list)
        for class_label, documents in dataset_dict.items():
            train_data, test_data = split_data(documents, test_ratio)
            instances[InstanceType.Train] += train_data
            instances[InstanceType.Test] += test_data

        tokenized_train_documents = list(instances[InstanceType.Train])

        training_words = list(find_all_words(tokenized_train_documents))

        unigram_feats = unigram_word_feats(training_words, top_n=2000)
        self.add_feat_extractor(
            self.feature_extractors_dict[FeatureExtractor.Unigram],
            unigrams=unigram_feats)

        #bigram_collocs_feats = bigram_collocation_feats([tokenized_train_tweet[0] for tokenized_train_tweet in tokenized_train_documents], top_n=1000, min_freq=5)
        #self.add_feat_extractor(self.feature_extractors_dict[FeatureExtractor.Bigram], bigrams=bigram_collocs_feats)

        training_set = apply_features(self.extract_features,
                                      tokenized_train_documents)
        self.classifier = train(training_set)

        labels_total = defaultdict(int)
        errors = defaultdict(int)
        for instance in instances[InstanceType.Test]:
            if len(instance[0]) > 1:
                classification = self.classify(instance[0])
                if classification != instance[1]:
                    errors[instance[1]] += 1
                labels_total[instance[1]] += 1

        correct = 0
        total_instances = 0

        for label, total in labels_total.items():
            error = errors[label]
            correct += total - error
            total_instances += total
            per_class_accuracy = ((total - error) / total) * 100
            print("{} class accuracy: {}".format(label, per_class_accuracy))

        accuracy = (correct / total_instances) * 100
        print("Accuracy: {}".format(accuracy))
Code example #14
 def _get_trainingset(cls, source='db'):
     data = cls._get_data(source)
     cls._get_word_freq(data)
     return apply_features(cls.feature_extractor, data)
Code example #15
    #Create a dictionary of features (True for each feature present, implicit False for absent features).  In this case, features are words, but they could be bigger or smaller, simpler or more complex.

    for word in words:
        # keep only words of 3 or more characters; see the impact on accuracy
        if len(word) >= 3:
            features["contains_word_(%s)" % word] = True

    return features


raw_input("\n\nHit enter to continue...")

print "Extracting Features from Training Set"


train_set = apply_features(feature_extracting_function, training_set_array)


raw_input("\n\nHit enter to continue...")

print "Gathering unknown data points (new data) to predict on (again, hand-coded, see script source)"




raw_input("\n\nHit enter to continue...")

#Train a Naive Bayes Classifier (simple but surprisingly effective).  This isn't the only classifier one could use (dtree is another, and there are many, many more), but it's a good start.

# print "Training Naive Bayes Classifier"
Code example #16
print "Writing Feature Extractor"
def feature_extracting_function(data_point):
    features = {} #Dictionary, roughly equivalent to a hashtable in other languages.
    data_point = ''.join(ch for ch in data_point if ch not in set(string.punctuation)) #Strip punctuation characters from the string. In Python, this happens to be usually done with a .join on the string object, but don't be thrown if you're used to other languages and this looks weird (hell, it looks weird to me), all we're doing is stripping punctuation.
    words = data_point.split() #Split data_point on whitespace, return as list
    words = [word.lower() for word in words] #Convert all words in list to lowercase.  The [] syntax is a Python "list comprehension"; Google that phrase if you're confused.

    #Create a dictionary of features (True for each feature present, implicit False for absent features).  In this case, features are words, but they could be bigger or smaller, simpler or more complex.
    for word in words:
        features["contains_word_(%s)" % word] = True
    return features


raw_input("\n\nHit enter to continue...")
print "Extracting Features from Training Set"
train_set = apply_features(feature_extracting_function, known_data_points)


raw_input("\n\nHit enter to continue...")
print "Gathering unknown data points (new data) to predict on (again, hand-coded, see script source)"
#Our query chocolate bars: we want to know whether or not they're matches
unknown_1 = "milky light sweet nutty"
unknown_2 = "dark bitter plain"
unknown_3 = "dark dark bitter beyond belief organic"
unknown_4 = "organic minty sweet dark"


raw_input("\n\nHit enter to continue...")
#Train a Naive Bayes Classifier (simple but surprisingly effective).  This isn't the only classifier one could use (dtree is another, and there are many, many more), but it's a good start.
print "Training Naive Bayes Classifier"
nb = nltk.NaiveBayesClassifier.train(train_set)
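The excerpt stops right after training. A natural follow-up (a sketch, not part of the original script) classifies the four unknown bars defined above; the print form below works under both Python 2 and 3:

for unknown in (unknown_1, unknown_2, unknown_3, unknown_4):
    # feature_extracting_function builds the same feature dict used in training
    print("%s -> %s" % (unknown, nb.classify(feature_extracting_function(unknown))))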
Code example #17
# Event
# Hackbright Data Science Workshop.
# Author
# Daniel Wiesenthal.  [email protected].
# What is this?
# This is a simple script illustrating the usage of the Python NLTK classifier.  It is written in Python, but the comments are intended to make it clear how to port to other languages.  The flow isn't particularly well decomposed as a program; rather, it is intended to go along linearly with the associated talk/presentation.
# The goal is to find out which chocolate a particular volunteer during the talk will like.  We have a few examples of chocolate bars that we know are either matches or not (misses), and want to use that to make a guess about an unknown bar (we don't know if it will be a match, and want to guess).
# Further reading:
# http://www.stanford.edu/class/cs124/lec/naivebayes.pdf
# http://nltk.googlecode.com/svn/trunk/doc/book/ch06.html
# Software Setup
# For this script to work, you'll need to have Python, NLTK (a Python package), and Numpy (upon which NLTK depends) installed.  On a Mac (which all have numpy pre-installed these days), run:
# sudo easy_install pip
# sudo pip install nltk
# <cd to directory with this file>
# python classification_101.py
#Required libraries
from myclassifier import MyClassifier
import pprint
try:
    import nltk
    from nltk.classify.util import apply_features
    import string
    print "Great!  Looks like you're all set re: NLTK and Python."
except Exception, e:
    print "Bummer.  Looks like you don't have NLTK and Python set up correctly.  (Exception: " + str(
        e) + ")"
    quit()
raw_input("\n\nHit enter to get started...")
#Some example chocolate bars.  The format is a tuple of {information about the chocolate bar} and a {value}, where "match" is a good match and "miss" is a poor/bad match.
Code example #18
    def analisar_frase(self, widget):
        """Função: analisar a frase que o usuário"""
        # Limpar a tela
        subprocess.call("clear")
        print ("\n\tAnalise")
        global featureList
        frase = self.text_area.get_text()
        if ( frase != ""):
            frase_proc= normalizar(frase)
            self.text_area.set_text(frase)
            if (self.opcao == 'dilma' or self.opcao == 'copa' or self.opcao == 'palmeiras' or self.opcao == 'fatec'):
                #print("Opcao depois: %s "%self.opcao)

                # Build the feature list used by the extract_features method
                featureList = gera_lista_features(self.opcao)
                #print ("\n\tCaracteristicas conhecidas:\n\t%s "%(featureList))
                lista_feature_fell = get_lista_feature_fell()
                #print("\n\tCaracteristica / Sentimento:\n\t %s"%lista_feature_fell)

              #  features = get_feature_list(self.opcao)
              #  print("\n\tFeatureListtt: %s "%(features))
           
                frase = self.text_area.get_text()
                frase_Normal = normalizar(frase)
                features_msg = getFeatureVector(frase_Normal)

                print ("\n\tFrase analisada: %s "%frase)
                print ("\n\tCaracteristicas da Msg - %s\n "%features_msg)
                
                # Get the associated sentiments
                show_relacao(obter_sentimento_associado(lista_feature_fell,features_msg))
                
                training_set = apply_features(extract_features,lista_feature_fell)
                self.fell = avaliar_Sentimento(features_msg,training_set)
                
                language = detect_language(frase)
                #print ("\n\tLingua: %s "%language)        
                if ( language == 'portuguese'):
                    #print ("\n\tSentimento: %s "%self.fell)
                    # text color
                    self.text_area.modify_text(gtk.STATE_NORMAL, gtk.gdk.color_parse("#FF0000"))
                    self.text_area2.modify_text(gtk.STATE_NORMAL, gtk.gdk.color_parse("#FF0000"))
                    # text font
                    # entry.modify_font(pango.FontDescription("monospace 16"))
                    self.text_area2.modify_font(pango.FontDescription("sans bold  16"))
                    if ( self.fell == 'negativo'):
                        frase = self.text_area.get_text()
                        self.text_area.set_text(frase)
                        self.text_area2.set_text("			 Sentimento: Negativo")
                        self.image_black.set_from_file('glade/imgs/cry.png')
                        self.image_black.show()

                        self.image_doubtRight = self.builder.get_object("doubtRight")
                        self.image_doubtRight.set_from_file('glade/imgs/black.png')
                        self.image_doubtRight.show()
                        self.image_doubtLeft = self.builder.get_object("doubtLeft")
                        self.image_doubtLeft.set_from_file('glade/imgs/black.png')
                        self.image_doubtLeft.show()
                     
                    else:
                        self.text_area2.set_text("			 Sentimento: Positivo")
                        self.image_black.set_from_file('glade/imgs/happy.png')
                        self.image_black.show()
                        
                        self.image_doubtRight = self.builder.get_object("doubtRight")
                        self.image_doubtRight.set_from_file('glade/imgs/black.png')
                        self.image_doubtRight.show()
                        self.image_doubtLeft = self.builder.get_object("doubtLeft")
                        self.image_doubtLeft.set_from_file('glade/imgs/black.png')
                        self.image_doubtLeft.show()
                        
                else:
                    self.text_area2.set_text(" Ce ta de Brincaxxon comigo ?????")     
Code example #19
File: train.py  Project: byouloh/nosy
	def _get_trainingset(cls, source = 'db'):
		data = cls._get_data(source)
		cls._get_word_freq(data)
		return apply_features(cls.feature_extractor, data)
Code example #20
File: views.py  Project: m1ck/hottrends
def nltkshow(request):
    
    def get_words_in_tweets(tweets):
        all_words = []
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words
        
    def get_word_features(wordlist):
        wordlist = FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

     
    pos_tweets=[('I love this car','positive'), 
    ('This view is amazing','positive'),
    ('I feel great this morning','positive'),
    ('I am so excited about the concert','positive'),
    ('He is my best friend','positive')]

    neg_tweets=[('I do not like this car','negative'),
    ('This view is horrible','negative'),
    ('I feel tired this morning','negative'),
    ('I am not looking forward to the concert','negative'),
    ('He is my enemy','negative')]
    
    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    test_pos_tweets=[('I feel happy this morning','positive'), 
    ('Larry is my friend','positive')]

    test_neg_tweets=[('I do not like that man','negative'),
    ('This view is horrible','negative'),
    ('The house is not great','negative'),
    ('Your song is annoying','negative')]

    test_tweets = []
    for (test_words, test_sentiment) in test_pos_tweets + test_neg_tweets:
        test_words_filtered = [e.lower() for e in test_words.split() if len(e) >= 3]
        test_tweets.append((test_words_filtered, test_sentiment))
      
       
    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features
        
    training_set = apply_features(extract_features, tweets)

    test_training_set = apply_features(extract_features, test_tweets)
     
    classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
    
    tweet = 'Your song is horrible'
    clas = classifier.classify(extract_features(tweet.split()))
    '''  
    classifier.show_most_informative_features(5)

    + clas +"    "+ class2
    class2 nltk.classify.util.accuracy(classifier,test_training_set)
    '''
    now = datetime.datetime.now()
    html = clas + "<html><body>It is 555 now %s.</body></html>" % now
    
    return HttpResponse(html)
Code example #21
        features["has(%s)" % letter] = (letter in name.lower())
    return features


names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

# feature_sets = [(gender_features(n), g) for (n, g) in names]
# train_set, test_set = feature_sets[500:], feature_sets[:500]

train_names = names[1500:]
dev_names = names[500:1500]
test_names = names[:500]

train_set = apply_features(gender_features, train_names)
dev_set = apply_features(gender_features, dev_names)
test_set = apply_features(gender_features, test_names)

classifier = nltk.NaiveBayesClassifier.train(train_set)

print classifier.classify(gender_features('Neo'))
print classifier.classify(gender_features('Miya'))
print nltk.classify.accuracy(classifier, test_set)

errors = []
for (name, tag) in dev_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
Code example #22
def nltkshow(request):
    def get_words_in_tweets(tweets):
        all_words = []
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words

    def get_word_features(wordlist):
        wordlist = FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

    pos_tweets = [('I love this car', 'positive'),
                  ('This view is amazing', 'positive'),
                  ('I feel great this morning', 'positive'),
                  ('I am so excited about the concert', 'positive'),
                  ('He is my best friend', 'positive')]

    neg_tweets = [('I do not like this car', 'negative'),
                  ('This view is horrible', 'negative'),
                  ('I feel tired this morning', 'negative'),
                  ('I am not looking forward to the concert', 'negative'),
                  ('He is my enemy', 'negative')]

    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    test_pos_tweets = [('I feel happy this morning', 'positive'),
                       ('Larry is my friend', 'positive')]

    test_neg_tweets = [('I do not like that man', 'negative'),
                       ('This view is horrible', 'negative'),
                       ('The house is not great', 'negative'),
                       ('Your song is annoying', 'negative')]

    test_tweets = []
    for (test_words, test_sentiment) in test_pos_tweets + test_neg_tweets:
        test_words_filtered = [
            e.lower() for e in test_words.split() if len(e) >= 3
        ]
        test_tweets.append((test_words_filtered, test_sentiment))

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    training_set = apply_features(extract_features, tweets)

    test_training_set = apply_features(extract_features, test_tweets)

    classifier = nltk.classify.NaiveBayesClassifier.train(training_set)

    tweet = 'Your song is horrible'
    clas = classifier.classify(extract_features(tweet.split()))
    '''  
    classifier.show_most_informative_features(5)

    + clas +"    "+ class2
    class2 nltk.classify.util.accuracy(classifier,test_training_set)
    '''
    now = datetime.datetime.now()
    html = clas + "<html><body>It is 555 now %s.</body></html>" % now

    return HttpResponse(html)
Code example #23
File: tf_idf.py  Project: echang36/turtledown
	#		word_pairs.append(word_pair)

	#gen = (word for word in words if word not in nltk.corpus.stopwords.words('english'))
	#for word in gen:
	#	features["contains_unigrams_(%s)" %(word)] = True

	#for word_pair in word_pairs:
	#	features["contains_bigrams_(%s)" % (word_pair)] = True

	return features



#Create proper test and training sets based on features 
print "Applying features to Training Set..."
train_set = apply_features(feature_extracting_function, train_data_points)
print "Applying features to Testing Set..."
test_set = apply_features(feature_extracting_function, test_data_points)


#Run the NLTK Naive Bayes Classifier on the training set 
print "Doing Naive Bayes...."
nb = nltk.NaiveBayesClassifier.train(train_set)

#NLTK Accuracy: Run trained model on the test set 
print "Accuracy: "+str(nltk.classify.accuracy(nb, test_set))

print "\n"+str(nb.show_most_informative_features(20))


Code example #24
File: classification.py  Project: ski2per/archive
        features["has(%s)" % letter] = (letter in name.lower())
    return features


names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

# feature_sets = [(gender_features(n), g) for (n, g) in names]
# train_set, test_set = feature_sets[500:], feature_sets[:500]

train_names = names[1500:]
dev_names = names[500:1500]
test_names = names[:500]

train_set = apply_features(gender_features, train_names)
dev_set = apply_features(gender_features, dev_names)
test_set = apply_features(gender_features, test_names)

classifier = nltk.NaiveBayesClassifier.train(train_set)

print classifier.classify(gender_features('Neo'))
print classifier.classify(gender_features('Miya'))
print nltk.classify.accuracy(classifier, test_set)

errors = []
for (name, tag) in dev_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
Code example #25
 def classify(self, instance):
     instance_feats = apply_features(self.extract_features, [instance],
                                     labeled=False)
     return self.classifier.classify(instance_feats[0])
Code example #26
        tweet = 'CS' + tweet

    tweets = mono_tweets + cs_tweets

    # this is only necessary if mono and cs are mixed
    random.shuffle(tweets)

    # Splitting into train and test, FOR 1000 TWEETS (CHANGE IT ACCORDINGLY)
    train = tweets[:900]
    print(train[:3])
    test = tweets[900:]
    print(test[:3])
    print(len(test))

    # Applying features to our data.
    train_feat = apply_features(senti_features2, train)
    print(train_feat[:3])
    test_feat = apply_features(senti_features2, test)

    print('Training...')

    # Training the classifier
    me = MaxentClassifier.train(train_feat, max_iter=10)

    print('Evaluating...')

    print(evaluate_classifier(me, test_feat))

    print('Best features:')

    me.show_most_informative_features()
Code example #27
for u in jb_users:
    curr = db.query("SELECT tweet FROM tweets WHERE user=%s", u["id"])
    curr = [hash.sub("", mtn.sub("", t['tweet'])).strip() for t in curr]
    [jb_tweets.append((t, "JB")) for t in curr]

print "Singapore", len(sg_tweets)
print "Johor", len(jb_tweets)
db.close()

#NLTK processing
tweets = []

random.shuffle(sg_tweets)
random.shuffle(jb_tweets)

tweets = filter_words(sg_tweets[3500:] + jb_tweets[1100:])
test = filter_words(sg_tweets[-220:] + jb_tweets[-220:])


word_features = get_word_features(get_words_in_tweets(tweets))[:2000]
training_set = apply_features(extract_features, tweets)
test_set = apply_features(extract_features, test)

#Train using nltk
classifier = nltk.NaiveBayesClassifier.train(training_set)

classifier.show_most_informative_features(32)  # prints its table directly; the old print statement also showed None
print "Accuracy: ", nltk.classify.accuracy(classifier, test_set)

Code example #28
      #+ bcolors.ENDC)

        lista_feature_fell = get_lista_feature_fell()
        #print("\n\tCaracteristica / Sentimento:\n\t %s"%lista_feature_fell)
        tema = listaColecao
        msg=sys.argv[1]
        language = detect_language(msg)
        ##print ("\n\tLingua: %s "%language)        
        if ( language == 'portuguese'):
            print("\n\tAnalisar Msg: %s "%msg.capitalize())
            msg2 = normalizar(msg)
            print("\n\tNormalizado: %s "%msg2)
            features_msg=getFeatureVector(msg2)
            print ("\n\tCaracteristicas da Msg - %s "%features_msg)

            relacao = obter_sentimento_associado(lista_feature_fell,features_msg)
            #print("\n\tSentimentos associados : %s "%sentimentos_associados)
            valor = show_relacao(relacao)
            if (valor == 'true'):
                training_set = apply_features(extract_features, lista_feature_fell)
                ##print("\n\tTraining_set:  %s "%training_set)
                # Evaluate the message
                avaliar_Sentimento(features_msg, training_set)
            else:
                print(bcolors.FAIL+"\n\tAvaliação Impossível\n\n"+bcolors.ENDC)
            
        else:
            print ("\n\tPor favor insira o texto novamente\n\n")
    else:
        print ('\nUsage: python testarMsg.py msg fatec|dilma|copa|palmeiras\n')
Code example #29
def avaliar_Sentimento(message):
    training_set = apply_features(extract_features, lista_feature_fell)
    classifier = nltk.NaiveBayesClassifier.train(training_set)
Code example #30
    return word_feature

def naive_bayes(training_set,test_set):

    # build the trained set and save it in naive_classifier.p
    naive_classifier = 'naive_classifier.p'
    if not os.path.exists(naive_classifier):
        classifier = nltk.NaiveBayesClassifier.train(training_set)
        fileobject = open(naive_classifier, 'wb')
        pickle.dump(classifier, fileobject)
        fileobject.close()
    # load the classifier trained and saved in the previous step
    fileobject = open(naive_classifier, 'rb')
    classifier = pickle.load(fileobject)
    fileobject.close()

    print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, test_set))*100)
    classifier.show_most_informative_features(15)


df = pd.read_csv("rating_sentiment3.csv")
dataset = featureExtraction(df)
all_words = bag_of_words(dataset)
word_freq = word_frequency(all_words)
words_features = get_words_features(word_freq)
processed_record = 0
training_set = apply_features(find_word_feature, dataset[:-750])
test_set = apply_features(find_word_feature, dataset[-750:])
naive_bayes(training_set,test_set)
Code example #31
File: choclate.py  Project: tsaxena/DataMining
# Event
# Hackbright Data Science Workshop.

# Author
# Daniel Wiesenthal.  [email protected].

# What is this?
# This is a simple script illustrating the usage of the Python NLTK classifier.  It is written in Python, but the comments are intended to make it clear how to port to other languages.  The flow isn't particularly well decomposed as a program; rather, it is intended to go along linearly with the associated talk/presentation.
# The goal is to find out which chocolate a particular volunteer during the talk will like.  We have a few examples of chocolate bars that we know are either matches or not (misses), and want to use that to make a guess about an unknown bar (we don't know if it will be a match, and want to guess).

# Further reading:
# http://www.stanford.edu/class/cs124/lec/naivebayes.pdf
# http://nltk.googlecode.com/svn/trunk/doc/book/ch06.html

# Software Setup
# For this script to work, you'll need to have Python, NLTK (a Python package), and Numpy (upon which NLTK depends) installed.  On a Mac (which all have numpy pre-installed these days), run:
# sudo easy_install pip
# sudo pip install nltk
# <cd to directory with this file>
# python classification_101.py

#Required libraries
from myclassifier import MyClassifier
import pprint
try:
    import nltk
    from nltk.classify.util import apply_features
    import string
    print "Great!  Looks like you're all set re: NLTK and Python."
except Exception, e:
    # the excerpt was truncated here; the body below is restored from the
    # identical script shown in code example #17
    print "Bummer.  Looks like you don't have NLTK and Python set up correctly.  (Exception: " + str(
        e) + ")"
    quit()
Code example #32
def twitterClass():
    global wordFeatures
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')
    # print
    if not os.path.exists(os.path.join(os.getcwd(), 'semtiment_classifier.pickle')):
        print twitter_samples.fileids()
        # print movie_reviews.fileids()
        # print

        tknzr = TweetTokenizer(strip_handles=True)
        onlyWords = re.compile('^[a-zA-Z]+$')
        labeledTweets = []

        for it in twitter_samples.docs('negative_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "negative"))
            # print [token for token in tknzr.tokenize(it['text']) if onlyWords.match(token) is not None]

        for it in twitter_samples.docs('positive_tweets.json'):
            tokens = []
            for token in tknzr.tokenize(it['text']):
                if onlyWords.match(token) is not None:
                    tokens.append(token.lower())
            labeledTweets.append((tokens, "positive"))

        # print  labeledTweets
        wordFeatures = get_word_features(get_words_in_tweets(labeledTweets))
        print "training"
        training = classUtil.apply_features(extract_features, labeledTweets)
        # print training

        sentimentClassifier = NaiveBayesClassifier.train(training)
        print "done training"
        f = open('semtiment_classifier.pickle', 'wb')
        pickle.dump(sentimentClassifier, f)
        f.close()
        # also persist the word features; the else branch below reads wordFeatures.json
        fout = open('wordFeatures.json', 'w')
        json.dump(wordFeatures, fout)
        fout.close()
    else:
        fin = open('wordFeatures.json', "r")
        wordFeatures = json.load(fin)
        fin.close()
        print wordFeatures
        f = open('semtiment_classifier.pickle', 'rb')
        classifier = pickle.load(f)  # type: nltk.classify.naivebayes.NaiveBayesClassifier
        f.close()
        # text,created_at
        tweets = []

        onlyWords = re.compile('^[a-zA-Z]+$')
        labeledTweets = []
        for row in csv.DictReader(open('datafiles/trump.csv')):
            text = row['text']
            features = []
            for token in tknzr.tokenize(text):
                if onlyWords.match(token) is not None:
                    features.append(token.lower())
            print row['created_at']
            tweets.append({
                "created_at": row['created_at'],
                "text": text,
                "classification": classifier.classify(extract_features(features))
            })
        classification = open('trumpClassified.json', 'w+')
        classification.write(json.dumps(tweets, indent=2))
        classification.close()
        tweets = []
        labeledTweets = []
        for row in csv.DictReader(open('datafiles/clinton.csv')):
            text = row['text']
            features = []
            for token in tknzr.tokenize(text):
                if onlyWords.match(token) is not None:
                    features.append(token.lower())
            print row['created_at']
            tweets.append({
                "created_at": row['created_at'],
                "text": text,
                "classification": classifier.classify(extract_features(features))
            })
        classification = open('clintonClassified.json', 'w+')
        classification.write(json.dumps(tweets, indent=2))
        classification.close()
Code example #33
test_tweets = []
for (test_words, test_sentiment) in test_pos_tweets + test_neg_tweets:
    test_words_filtered = [
        e.lower() for e in test_words.split() if len(e) >= 3
    ]
    test_tweets.append((test_words_filtered, test_sentiment))

word_features = get_word_features(get_words_in_tweets(tweets))


def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


training_set = apply_features(extract_features, tweets)

test_training_set = apply_features(extract_features, test_tweets)

classifier = nltk.classify.NaiveBayesClassifier.train(training_set)

tweet = 'Your song is annoying'
print classifier.classify(extract_features(tweet.split()))

classifier.show_most_informative_features(5)

print nltk.classify.util.accuracy(classifier, test_training_set)
Code example #34

test_reviews = []
for (test_words, test_sentiment) in test:
    test_words_filtered = [e.lower() for e in test_words.split() if len(e) >= 3]
    test_reviews.append((test_words_filtered, test_sentiment))

word_features = get_word_features(get_words_in_reviews(reviews))

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

training_set = apply_features(extract_features, reviews)

test_training_set=apply_features(extract_features, test_reviews)

classifier = nltk.classify.NaiveBayesClassifier.train(training_set)

tweet = 'she is awesome'
print(classifier.classify(extract_features(tweet.split())))

print(nltk.classify.util.accuracy(classifier,test_training_set))
classifier.show_most_informative_features(40)

Code example #35
        #    print ("Sentimento - %s "%x)
        i+=1
    while ( y < len(listaMsg)):    
        features = getAllFeatures(listaMsg[y], features)  # all relevant words/features of the message
        featureVector = getFeatureVector(listaMsg[y])
        lista_feature_fell.append((featureVector,listaFell[y]))
        y+=1
    return features

def get_lista_feature_fell():
    return lista_feature_fell

def get_feature_list():
    return featureList

if __name__ == '__main__':
    if (len(sys.argv) == 2 and (sys.argv[1] == 'fatec' or sys.argv[1] == 'dilma' or sys.argv[1] == 'copa' or sys.argv[1] == 'palmeiras')):
        # Clear the screen
        #subprocess.call("clear")
        listaColecao = sys.argv[1]
        print ("\n\t\tAnálise de Sentimento\n\t\tAssunto: %s "%listaColecao.upper())
        # Build the feature list used by the extract_features method
        featureList = gera_lista_features(listaColecao)
        print ("\n\tCaracteristicas:  %s\n "%(featureList))
        lista_feature_fell=get_lista_feature_fell()
        #print ("\n\tCaracteristicas e Sentimento:  %s "%lista_feature_fell)
        training_set = apply_features(check_features,lista_feature_fell)
        print ("\n\tTraining_set: %s \n"%training_set)
    else:
        print ('\nUsage: python trainingSet.py fatec|dilma|copa|palmeiras\n')
Code example #36
File: train.py  Project: sushengyang/nlp-proposal
    return word_features


review_features = get_features(get_all_words(reviews))
# print review_features


def extract_features(document):
    document_words = set(document)
    features = {}
    for word in review_features:
        features["contains(%s)" % word] = word in document_words
    # print features
    return features


training_set = apply_features(extract_features, reviews)
classifier = nltk.classify.NaiveBayesClassifier.train(training_set)


# test = ["berbat bir yer", "Muhteşem bir yer.","harika","mükemmel bir yer","vasat","rezalet","Başarılı","Kötü yemekler"]

for r in test:
    print classifier.classify(extract_features(r.split()))


test_set = apply_features(extract_features, test_reviews)

print nltk.classify.accuracy(classifier, test_set)  # nltk.metrics.scores.accuracy expects two label lists, not a classifier
# print classifier.show_most_informative_features(1)