Example #1
def create_and_train():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    
    positive_cleaned_tokens_list = tokenize('positive_tweets.json')
    negative_cleaned_tokens_list = tokenize('negative_tweets.json')

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    return classifier
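Example #1 (and several of the examples further down) calls helper functions that are not shown in this listing. A minimal sketch of get_all_words and get_tweets_for_model, consistent with how they are used here — flattening the cleaned token lists and turning each tweet into the {token: True} feature dictionary that NLTK's NaiveBayesClassifier expects — might look like this (an assumption, not the original authors' code):

def get_all_words(cleaned_tokens_list):
    # Flatten a list of token lists into a single stream of tokens (feeds FreqDist)
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(cleaned_tokens_list):
    # Convert each tweet's tokens into the {token: True} format NLTK classifiers expect
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)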
Example #2
def trainmodel():
    pos_tweets = twitter_samples.strings('positive_tweets.json')
    neg_tweets = twitter_samples.strings('negative_tweets.json')
    random.seed(1)  
    pos_tweets_set = []
    print("Training Phase")
    for tweet in pos_tweets:
        tweet = process_data(cleanText(tweet))
        pos_tweets_set.append((tweet, 'pos'))
    print("POS added in positive tweets")
    neg_tweets_set = []
    for tweet in neg_tweets:
        tweet = process_data(cleanText(tweet))
        neg_tweets_set.append((tweet, 'neg'))
    print("NEG added in negative tweets")
    shuffle(pos_tweets_set)
    shuffle(neg_tweets_set)
    #test_set = pos_tweets_set[:100] + neg_tweets_set[:100]
    train_set = pos_tweets_set[100:2000] + neg_tweets_set[100:2000]
    print("Training started by naive bayes classifier")
    classifier = NaiveBayesClassifier(train_set)
    print("Training finished")
    # accuracy = classifier.accuracy(test_set)
    # print(accuracy*100)
    #print("Server started with model accuracy:"+str(accuracy*100))
    return classifier
def fetch_twitter_samples():

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')

    return positive_tweets, negative_tweets, text
Example #4
 def twitter_data_training(self):
     negative_tweets = twitter_samples.strings('negative_tweets.json')
     positive_tweets = twitter_samples.strings('positive_tweets.json')
     feature_set = []
     i=0
     for tweets in positive_tweets:
         i+=1
         print "twitterpos%s"%i
         if i == 2000:
             break
         words = self.clean_words(wordpunct_tokenize(tweets))
         feature_set.append((self.create_feature_set(words)[0], 'pos'))
         feature_set.append((self.create_feature_set(words)[1], 'neg'))
     i=0
     for tweets in negative_tweets:
         i+=1
         print "twitterneg%s"%i
         if i == 2000:
             break
         words = self.clean_words(wordpunct_tokenize(tweets))
         feature_set.append((self.create_feature_set(words)[0], 'neg'))
         feature_set.append((self.create_feature_set(words)[1], 'pos'))
     random.shuffle(feature_set)
     training_set = feature_set[:8000]
     return training_set
def SentimentML(df):
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')
    test_pos = all_positive_tweets[4000:]
    train_pos = all_positive_tweets[:4000]
    test_neg = all_negative_tweets[4000:]
    train_neg = all_negative_tweets[:4000]
    train_x = train_pos + train_neg
    test_x = test_pos + test_neg
    train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
    test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)
    freqs = build_freqs(train_x, train_y)
    X = np.zeros((len(train_x), 3))
    for i in range(len(train_x)):
        X[i, :] = extract_features(train_x[i], freqs)
    Y = train_y
    J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
    tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
    sentiment_l = []
    for text in df['totaltext'].tolist():
        y_hat = predict_tweet(text, freqs, theta)
        if y_hat > 0.5:
            sentiment_l.append('Positive sentiment')
        else:
            sentiment_l.append('Negative sentiment')
    df.insert(6, 'sentiment_description', sentiment_l)
    return df
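SentimentML leans on helpers from a typical logistic-regression tweet-sentiment pipeline (build_freqs, extract_features, gradientDescent, test_logistic_regression, predict_tweet) that never appear in this listing. A rough sketch of three of them, consistent with how SentimentML calls them and assuming a process_tweet function that cleans and tokenizes a raw tweet string; gradientDescent and test_logistic_regression remain assumed:

import numpy as np

def build_freqs(tweets, ys):
    # Count how often each word occurs with a positive (1.0) or negative (0.0) label
    freqs = {}
    for y, tweet in zip(np.squeeze(ys).tolist(), tweets):
        for word in process_tweet(tweet):
            freqs[(word, y)] = freqs.get((word, y), 0) + 1
    return freqs

def extract_features(tweet, freqs):
    # Three features per tweet: a bias term, summed positive counts, summed negative counts
    x = np.zeros((1, 3))
    x[0, 0] = 1
    for word in process_tweet(tweet):
        x[0, 1] += freqs.get((word, 1.0), 0)
        x[0, 2] += freqs.get((word, 0.0), 0)
    return x

def predict_tweet(tweet, freqs, theta):
    # Sigmoid of x.theta gives the probability that the tweet is positive
    x = extract_features(tweet, freqs)
    return 1 / (1 + np.exp(-np.dot(x, theta)))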
Example #6
def lemm(words):
    lemmatizer = WordNetLemmatizer()

    words_lemm = []
    for word in words:
        words_lemm.append(lemmatizer.lemmatize(word))

    text_tagged = pos_tag(words_lemm)
    #print (text_tagged)

    stopset = set(stopwords.words('english')) - set(
        ('over', 'under', 'below', 'more', 'most', 'no', 'not', 'only', 'such',
         'few', 'so', 'too', 'very', 'just', 'any', 'once'))
    from nltk.corpus import twitter_samples
    print(stopset)

    pos_tweets = twitter_samples.strings('positive_tweets.json')
    print(pos_tweets)

    neg_tweets = twitter_samples.strings('negative_tweets.json')
    print(neg_tweets)

    if (len(pos_tweets) == len(neg_tweets)):
        print("Same length")
    else:
        print("Different lengths")
Example #7
 def __init__(self):
     pos = twitter_samples.strings('positive_tweets.json')
     neg = twitter_samples.strings('negative_tweets.json')
     self.x = pos + neg
     self.y = np.append(np.ones(len(pos)), np.zeros(len(neg)))
     self.freqs = self.count_tweets()
     self.logprior, self.loglikelihood = self.NB_train()
Example #8
def trainRandomForest():

    stop_words = stopwords.words('english')
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    X = positive_tweets + negative_tweets
    positives = np.ones([len(positive_tweets), 1])
    negatives = np.zeros([len(negative_tweets), 1])
    y = np.concatenate([positives, negatives])

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1,
                                                        shuffle=True)

    pipe = Pipeline([
        ('tokenize', FunctionTransformer(tokenizeIt)),
        ('noise', FunctionTransformer(removeIt)),
        ('tfidf', TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)),
        ('classifier', RandomForestClassifier(n_estimators=100,
                                              random_state=1))
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))

    with open('trainedpipe.pkl', 'wb') as f:
        pickle.dump(pipe, f)
Example #9
def deal_trainset(stop_words, english_punctuations):
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for pos_sen in positive_tweets:
        positive_cleaned_tokens_list.append(
            Cleaner(Tokenization(pos_sen), stop_words, english_punctuations))
    for neg_sen in negative_tweets:
        negative_cleaned_tokens_list.append(
            Cleaner(Tokenization(neg_sen), stop_words, english_punctuations))

    print(positive_cleaned_tokens_list)

    pos_model = []
    for pos_sen in positive_cleaned_tokens_list:
        pos_model.append(dict([word, True] for word in pos_sen))
    # print(pos_model)

    pos_dataset = [(pos_dict, "Positive") for pos_dict in pos_model]

    # print(pos_dataset)

    neg_model = []
    for neg_sen in negative_cleaned_tokens_list:
        neg_model.append(dict([word, True] for word in neg_sen))
    neg_dataset = [(neg_dict, "Negative") for neg_dict in neg_model]
    return pos_dataset, neg_dataset
Example #10
    def trainNaiveBayesClassifier(self):

        from nltk.corpus import twitter_samples

        pos_tweets = twitter_samples.strings('positive_tweets.json')
        neg_tweets = twitter_samples.strings('negative_tweets.json')

        # positive tweets feature set
        pos_tweets_set = []
        for tweet in pos_tweets:
            pos_tweets_set.append((self.bag_of_words(tweet), 'pos'))

        # negative tweets feature set
        neg_tweets_set = []
        for tweet in neg_tweets:
            neg_tweets_set.append((self.bag_of_words(tweet), 'neg'))

        # randomize pos_tweets_set and neg_tweets_set
        # doing so will produce a different accuracy result every time we run the program
        shuffle(pos_tweets_set)
        shuffle(neg_tweets_set)

        test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
        train_set = pos_tweets_set[1000:] + neg_tweets_set[1000:]

        self.classifier = NaiveBayesClassifier.train(train_set)
Example #11
    def preprocess_data(self):
        self.pos_tweets = twitter_samples.strings('positive_tweets.json')
        self.neg_tweets = twitter_samples.strings('negative_tweets.json')
        self.text = twitter_samples.strings('tweets.20150430-223406.json')
        self.pos_tweet_tokens = twitter_samples.tokenized(
            'positive_tweets.json')
        self.neg_tweet_tokens = twitter_samples.tokenized(
            'negative_tweets.json')
        self.pos_cleaned_tokens_list = [
            self.remove_noise(tokens) for tokens in self.pos_tweet_tokens
        ]
        self.neg_cleaned_tokens_list = [
            self.remove_noise(tokens) for tokens in self.neg_tweet_tokens
        ]
        self.all_pos_words = self.get_all_words(self.pos_cleaned_tokens_list)
        self.all_neg_words = self.get_all_words(self.neg_cleaned_tokens_list)
        self.freq_dist_pos = FreqDist(self.all_pos_words)
        self.freq_dist_neg = FreqDist(self.all_neg_words)
        self.pos_tokens_for_model = self.get_tweets_for_model(
            self.pos_cleaned_tokens_list)
        self.neg_tokens_for_model = self.get_tweets_for_model(
            self.neg_cleaned_tokens_list)

        self.pos_dataset = [(tweet_dict, "Positive")
                            for tweet_dict in self.pos_tokens_for_model]
        self.neg_dataset = [(tweet_dict, "Negative")
                            for tweet_dict in self.neg_tokens_for_model]
        self.dataset = self.pos_dataset + self.neg_dataset
        random.shuffle(self.dataset)
        mid = len(self.dataset) // 2
        self.train_data = self.dataset[:mid]
        self.test_data = self.dataset[mid:]
Example #12
def bayes_classifier(filename): #Function for the bayes classifier with add 1 smoothing for the specified file of tweets
    output_file = open('shooting.txt', 'w', encoding='utf-8')
    num_pos_predictions = 0
    num_neg_predictions = 0

    k = (pos_fd + neg_fd).B() #total number of bins
    pos_count = len(twitter_samples.strings('positive_tweets.json')) #number of positive training tweets
    neg_count = len(twitter_samples.strings('negative_tweets.json'))  #number of negative training tweets
    log_prior_pos = math.log(pos_count / (pos_count + neg_count))       #logs for very small probabilities
    log_prior_neg = math.log(neg_count / (pos_count + neg_count))

    tweets = []
    with open(filename) as f: #open the specified json file containing tweets and load each line as a separate tweet
        for line in f:
            tweets.append(json.loads(line))

    for tweet in tweets:  # for each tweet perform the sentiment analysis
        total_log_prob_pos = log_prior_pos
        total_log_prob_neg = log_prior_neg
        tokens = tweet_tokenizer.tokenize(tweet["text"]) #get the raw tweet text from each tweet

        for token in tokens:
            total_log_prob_neg += math.log((neg_fd[token] + 1) / (neg_fd.N() + k)) #add-1 smoothed Naive Bayes term for each class
            total_log_prob_pos += math.log((pos_fd[token] + 1) / (pos_fd.N() + k))

        if total_log_prob_pos > total_log_prob_neg: #if it is more likely to be positive
            num_pos_predictions += 1
            print('pos', file=output_file)      #record to output file
        else:
            num_neg_predictions += 1
            print('neg', file=output_file)

    print('\nnumber of positive tweets: ', num_pos_predictions, file=output_file)
    print('number of negative tweets: ', num_neg_predictions, file=output_file)
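bayes_classifier relies on module-level pos_fd and neg_fd frequency distributions and a tweet_tokenizer that are defined outside the snippet. One plausible way to build them from the same corpus (an assumption about the missing setup, not the original author's code):

from nltk import FreqDist
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer(preserve_case=False)
# Unigram counts over the labelled tweet sets, used as the Naive Bayes class models
pos_fd = FreqDist(token for tweet in twitter_samples.strings('positive_tweets.json')
                  for token in tweet_tokenizer.tokenize(tweet))
neg_fd = FreqDist(token for tweet in twitter_samples.strings('negative_tweets.json')
                  for token in tweet_tokenizer.tokenize(tweet))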
Example #13
def driver():

    # String variables of the dataset
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    text = twitter_samples.strings('tweets.20150430-223406.json')
    stop_words = stopwords.words('english')

    # Tokenized variables of dataset
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Cleaning the noise in the tweet tokens
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    # Frequency distribution for cleaned words
    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    # Creating positive and negative dictionaries
    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    # Creating the final dataset for training
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # Shuffling for pseudo-randomness to avoid bias
    random.shuffle(dataset)

    # Dividing shuffled data into train and test data
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Initializing the classifier
    classifier = NaiveBayesClassifier.train(train_data)

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    print(custom_tweet, classifier.classify(
        dict([token, True] for token in custom_tokens)))
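driver() and most of the Naive Bayes examples below call remove_noise without defining it. A sketch along the lines of the widely circulated NLTK tweet-cleaning recipe, consistent with both call patterns used here (with an explicit stop-word list, and with the default when classifying a custom tweet); the exact cleaning rules in the original code bases may differ:

import re
import string
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        # Strip URLs and @-mentions
        token = re.sub(r'http[s]?://\S+', '', token)
        token = re.sub(r'@[A-Za-z0-9_]+', '', token)
        # Map the Penn Treebank tag to a WordNet POS for lemmatization
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = WordNetLemmatizer().lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens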
def main_code(analysis_input):

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    # print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    # print("Accuracy is:", classify.accuracy(classifier, test_data))

    # print(classifier.show_most_informative_features(10))

    custom_tokens = remove_noise(word_tokenize(analysis_input))

    pos_or_neg = str(
        classifier.classify(dict([token, True] for token in custom_tokens)))

    return pos_or_neg
Example #15
def train_model():
    """
    Trains a Naive Bayes sentiment classifier using the twitter_samples
    dataset from NLTK. Each tweet is tokenized and cleaned to produce a training
    dataset for the machine learning model.
    Parameters
    ----------
    Returns
    -------
    NaiveBayesClassifier
    """
    #Load dataset from nltk data
    positive_tweets = twitter_samples.strings("positive_tweets.json")
    negative_tweets = twitter_samples.strings("negative_tweets.json")

    #Retrieve english stop words
    stop_words = stopwords.words("english")

    #Tweet tokenization
    positive_tweet_tokens = twitter_samples.tokenized("positive_tweets.json")
    negative_tweet_tokens = twitter_samples.tokenized("negative_tweets.json")

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    #Token cleaning
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    #Extract words from tokens
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    #Frequency distribution of words
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    #Create datasets
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    #Merge individual datasets into singular training data
    dataset = positive_dataset + negative_dataset

    train_data = dataset

    classifier = NaiveBayesClassifier.train(train_data)
    return classifier
Example #16
def all_data():
    if os.path.exists(r'C:\Users\baiyang01\AppData\Roaming\nltk_data\corpora\twitter_samples'):
        print('Files already exist.')
        pass
    else:
        nltk.download('twitter_samples')  # select the set of positive and negative tweets
        print("I'm downloading the file")
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    return positive_tweets, negative_tweets
def calibrate():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized(
        'positive_tweets.json'
    )  #files downloaded from setup.py used to calibrate the classifier for sentiment analysis
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    #print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [
        (tweet_dict, "Positive")  #calibrating positive
        for tweet_dict in positive_tokens_for_model
    ]

    negative_dataset = [
        (tweet_dict, "Negative")  #calibrating negative
        for tweet_dict in negative_tokens_for_model
    ]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]
    global classifier

    classifier = NaiveBayesClassifier.train(train_data)  #trains the data!
    print("Calibration complete!")

    print("Accuracy is:", classify.accuracy(classifier, test_data))
    def __get_labeled_tweets(self):
        """
        Get labeled tweets from nltk
        :return: cleaned list of lists that contain tokens for each tweet
        """
        pos_samples = twitter_samples.strings(
            'positive_tweets.json')[:self.__sample_size]
        neg_samples = twitter_samples.strings(
            'negative_tweets.json')[:self.__sample_size]
        # print(pos_samples+neg_samples) # show raw tweet samples from nltk

        return self.algorithm.process_tweets(pos_samples + neg_samples)
Example #19
def upload():
    pos_tweets = twitter_samples.strings('positive_tweets.json')
    neg_tweets = twitter_samples.strings('negative_tweets.json')
    all_tweets = twitter_samples.strings('tweets.20150430-223406.json')
    for tweet in pos_tweets:
        pos_tweets_set.append((bag_of_words(tweet), 'pos'))
    for tweet in neg_tweets:
        neg_tweets_set.append((bag_of_words(tweet), 'neg'))
    text.delete('1.0', END)
    text.insert(
        END, "NLTK Total No Of Tweets Found : " +
        str(len(pos_tweets_set) + len(neg_tweets_set)) + "\n")
def load_data():
    nltk.download('twitter_samples')
    nltk.download('stopwords')

    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')

    X = all_positive_tweets + all_negative_tweets
    y = np.append(np.ones((len(all_positive_tweets), 1)),
                  np.zeros((len(all_negative_tweets), 1)),
                  axis=0)
    return X, y
Example #21
 def test_corpus_twitter_method_returns_correct_result(self):
     self.assertEqual(twitter_samples.fileids(), [
         'negative_tweets.json', 'positive_tweets.json',
         'tweets.20150430-223406.json'
     ])
     self.assertEqual(
         twitter_samples.strings('negative_tweets.json')[0],
         'hopeless for tmr :(')
     self.assertEqual(
         twitter_samples.strings('positive_tweets.json')[0],
         '#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'
     )
Example #22
def train_social():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Classifier - TODO Add persistence
    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    print(classifier.show_most_informative_features(100))

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))
    return classifier
Example #23
def train():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    print(classifier.show_most_informative_features(10))

    f = open('classifier.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()
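Example #23 pickles the trained classifier but never shows the loading side. A possible (hypothetical) counterpart that reads classifier.pickle back and scores a new tweet, reusing the same remove_noise helper from training:

import pickle
from nltk.tokenize import word_tokenize

def classify_tweet(text):
    # Load the classifier saved by train() and classify a single piece of text
    with open('classifier.pickle', 'rb') as f:
        classifier = pickle.load(f)
    tokens = remove_noise(word_tokenize(text))
    return classifier.classify(dict([token, True] for token in tokens))

# classify_tweet("Loving the new release!") would return "Positive" or "Negative"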
Example #24
    def __init__(self):
        nltk.download('twitter_samples')
        nltk.download('stopwords')
        nltk.download('wordnet')
        pos_tweets = twitter_samples.strings('positive_tweets.json')
        neg_tweets = twitter_samples.strings('negative_tweets.json')

        self.stopwords_english = stopwords.words('english')

        self.lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
        self.classifier = self.generateModel(pos_tweets, neg_tweets)
        with open('melbourne_suburbs.geojson') as f:
            self.geoJs = json.load(f)
Example #25
def ml_model():
    pos_tweets = twitter_samples.strings('positive_tweets.json')
    pos_tweets = vocab_gen(pos_tweets, 'pos')
    neg_tweets = twitter_samples.strings('negative_tweets.json')
    neg_tweets = vocab_gen(neg_tweets, 'neg')
    # Note: 'neutral_tweets.json' is not one of the stock NLTK twitter_samples fileids
    # (see Example #21), so this assumes a custom neutral set has been added to the corpus directory
    neu_tweets = twitter_samples.strings('neutral_tweets.json')
    neu_tweets = vocab_gen(neu_tweets, 'neu')
    test_set = pos_tweets[:1000] + neg_tweets[:1000] + neu_tweets[:1000]
    train_set = pos_tweets[1000:] + neg_tweets[1000:] + neu_tweets[1000:]
    classifier = NaiveBayesClassifier.train(train_set)
    accuracy = classify.accuracy(classifier, test_set)
    print(accuracy)
    joblib.dump(classifier, 'twitter_sent.pkl')
Example #26
def train_twtr_classifier():
    if os.path.isfile(SAVED_CLSR_LOC):
        return load_classifier()

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    pos_twt_toks = twitter_samples.tokenized('positive_tweets.json')
    neg_twt_toks = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = [
        remove_noise(toks, stop_words) for toks in pos_twt_toks
    ]
    negative_cleaned_tokens_list = [
        remove_noise(toks, stop_words) for toks in neg_twt_toks
    ]

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    save_classifier(classifier)

    return classifier
Example #27
    def __init__(self):
        """
        Gather data
        """
        positive = twitter_samples.strings('positive_tweets.json')
        negative = twitter_samples.strings('negative_tweets.json')
        self.stop_words = list(set(stopwords.words('english')))

        positive_tokens = twitter_samples.tokenized('positive_tweets.json')
        negative_tokens = twitter_samples.tokenized('negative_tweets.json')
        """
        Clean the data
        """
        positive_clean = []
        negative_clean = []

        for token in positive_tokens:
            positive_clean.append(self.clean(token))

        for token in negative_tokens:
            negative_clean.append(self.clean(token))

        positive_model_tokens = self.final_token_generator(positive_clean)
        negative_model_tokens = self.final_token_generator(negative_clean)
        """
        Use generator to make datasets
        """
        positive_dataset = [(token, "Positive")
                            for token in positive_model_tokens]

        negative_dataset = [(token, "Negative")
                            for token in negative_model_tokens]

        dataset = positive_dataset + negative_dataset
        """
        Shake it all about
        """
        random.shuffle(dataset)
        random.shuffle(dataset)
        random.shuffle(dataset)
        """
        Split them up
        """
        training = dataset[:7000]
        testing = dataset[7000:]
        """
        Train the classifier
        """
        self.classifier = NaiveBayesClassifier.train(training)
        """
Example #28
    def __init__(self):
        pos_tweets = [(x, 'Positive') for x in twitter_samples.strings('positive_tweets.json')]
        neg_tweets = [(x, 'Negative') for x in twitter_samples.strings('negative_tweets.json')]

        full_dataset = pos_tweets + neg_tweets
        random.shuffle(full_dataset)
        dataset_size = len(full_dataset)
        
        # Have to divide the dataset. Larger datasets can result in a SIGKILL 
        # Probably due to limited memory in the docker container (needs further investigation)
        train_size = dataset_size//5
        train_dataset = full_dataset[:train_size]
        
        self.nb_classifier = NaiveBayesClassifier(train_dataset)
class preprocessing:

    # select the lists of positive and negative tweets
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')

    # concatenate the lists, 1st part is the positive tweets followed by the negative
    tweets = all_positive_tweets + all_negative_tweets

    # make a numpy array representing labels of the tweets
    labels = np.append(np.ones((len(all_positive_tweets))), np.zeros((len(all_negative_tweets))))

    processed_tweets = []
    for tweet in tweets:
        processed_tweets.append(process_tweet(tweet))  # Preprocess a given tweet

    # create frequency dictionary
    freqs = build_freqs(processed_tweets, labels)

    # list representing our table of word counts.
    # each element consists of a sublist with this pattern: [<word>, <positive_count>, <negative_count>]
    data = []

    # loop through our selected words ('keys' is assumed to be a list of words of interest defined elsewhere)
    for word in keys:

        # initialize positive and negative counts
        pos = 0
        neg = 0

        # retrieve number of positive counts
        if (word, 1) in freqs:
            pos = freqs[(word, 1)]

        # retrieve number of negative counts
        if (word, 0) in freqs:
            neg = freqs[(word, 0)]

        # append the word counts to the table
        data.append([word, pos, neg])

    @staticmethod
    def get_preprocessed_data():
        return preprocessing.processed_tweets

    @staticmethod
    def get_freqs_dict():
        return preprocessing.freqs

    @staticmethod
    def get_freqs_table():
        return preprocessing.data
Example #30
def data_preprocess():
    # get the sets of positive and negative tweets
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')
    # split the data into two pieces, one for training and one for testing (validation set)
    test_pos = all_positive_tweets[4000:]
    train_pos = all_positive_tweets[:4000]
    test_neg = all_negative_tweets[4000:]
    train_neg = all_negative_tweets[:4000]
    train_x = train_pos + train_neg
    test_x = test_pos + test_neg
    # avoid assumptions about the length of all_positive_tweets
    train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
    test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))
    return train_x, train_y, test_x, test_y
Example #31
def corpusreader_demo():
    """
    Use :module:`TwitterCorpusReader` to read a file of tweets, and print out

    * some full tweets in JSON format;
    * some raw strings from the tweets (i.e., the value of the `text` field); and
    * the result of tokenising the raw strings.

    """
    from nltk.corpus import twitter_samples as tweets

    print()
    print("Complete tweet documents")
    print(SPACER)
    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
        print(toks)
def getFeatures(numWordsToUse):

    # these emoticons cover 98.6% of the training data
    emoticons = [':)','(:',': )','( :','=)','(=','= )','( =',':D',': D',':p',': p',':(','):',': (',') :','=(',')=','= (',') =',':D',': D',':p',': p',':-)','(-:',':- )','( -:',':-(',')-:',':- (',') -:']
    emoticons = set(emoticons)
    # NLTK has their own twitter corpus with positive and negative messages
    positiveTweets = twitter_samples.strings('positive_tweets.json')
    negativeTweets = twitter_samples.strings('negative_tweets.json')

    positiveSentiment = [1 for x in positiveTweets]
    negativeSentiment = [0 for x in negativeTweets]

    tweets = positiveTweets + negativeTweets
    sentiment = positiveSentiment + negativeSentiment


    tokenizedTweets, cleanedSentiment = utils.tokenize(tweets, sentiment)


    cleanedTweets = []
    linesCleaned = 0
    for tweet in tokenizedTweets:
        replacedEmoticon = 0
        cleanedTweet = []
        for word in tweet:
            if word not in emoticons:
                cleanedTweet.append(word)
            else:
                replacedEmoticon = 1
        cleanedTweets.append(cleanedTweet)
        linesCleaned += replacedEmoticon

    global popularWords
    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
            cleanedTweets, cleanedSentiment, 0, numWordsToUse, 'counts'
        )

    # transform list of dictionaries into a sparse matrix
    sparseFeatures = dv.fit_transform(formattedTweets)

    return sparseFeatures, sentiment
	def __init__(self):

		twitter = twitter_samples.strings('tweets.20150430-223406.json')
		news = brown.words(categories='news')

		twitter = Tweet.Tweet(text = ' '.join(twitter)).stem()
		news = Tweet.Tweet(text = ' '.join(news)).stem()

		self.twitter_freq = nltk.FreqDist(twitter)
		self.tsum = len(twitter)
		self.news_freq = nltk.FreqDist(news)
		self.nsum = len(news)
Example #34
def main():
    """ 
    Replace the brown corpus with other corpora
    or use your own textfile like so:
        f = open(filename)
        t = f.read()
    """
    from nltk.corpus import twitter_samples 
    words = []
    for sentence in twitter_samples.strings():
        words += nltk.word_tokenize(sentence) + ["."]
    chain = WordChain(words)
    print(chain.build_sentence())
def getFeatures(numWordsToUse):

    # NLTK has their own twitter corpus with positive and negative messages
    positiveTweets = twitter_samples.strings('positive_tweets.json')
    negativeTweets = twitter_samples.strings('negative_tweets.json')

    positiveSentiment = [1 for x in positiveTweets]
    negativeSentiment = [0 for x in negativeTweets]

    tweets = positiveTweets + negativeTweets
    sentiment = positiveSentiment + negativeSentiment


    tokenizedTweets, cleanedSentiment = utils.tokenize(tweets, sentiment)

    global popularWords
    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
            tokenizedTweets, cleanedSentiment, 0, numWordsToUse, 'counts'
        )

    # transform list of dictionaries into a sparse matrix
    sparseFeatures = dv.fit_transform(formattedTweets)

    return sparseFeatures, sentiment
Example #36
testing = allFeatures[len(allFeatures) // 2:]

subject_classifier = nltk.NaiveBayesClassifier.train(training)

fh = open('filter.txt', 'r')
labelSubjects = []
for line in fh:
		labelSubject = line.split(',')
		label = labelSubject[0].strip()
		subject = labelSubject[1].strip()
		labelSubjects.append( (label, subject) )

fh.close()

#sentiment classifier
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
pos_tuples = []
neg_tuples = []
for string in pos_tweets:
		pos_tuples.append((string, 'positive'))
for string in neg_tweets:
		neg_tuples.append((string, 'negative'))

tweets = []
#filtering out words that are user names, less than 3 characters, and http addresses
for (words, sentiment) in pos_tuples + neg_tuples:
		words_filtered = [e.lower() for e in words.split() if len(e) >= 3 and e[0] != '@' and e[0:4] != 'http']
		tweets.append((words_filtered, sentiment))

word_features = get_word_features(get_words_in_tweets(tweets))
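get_words_in_tweets and get_word_features are called above but not defined in the snippet. A minimal sketch in the spirit of the classic NLTK tweet-classification recipe (an assumption, since the original helpers are not shown):

import nltk

def get_words_in_tweets(tweets):
    # Flatten the (words, sentiment) tuples into one long word list
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    # Order words by frequency; the keys become the feature vocabulary
    wordlist = nltk.FreqDist(wordlist)
    return list(wordlist.keys())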
Example #37
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.corpus import twitter_samples
import unicodedata
from pymongo import MongoClient

strings_negative = twitter_samples.strings('negative_tweets.json')
strings_positive = twitter_samples.strings('positive_tweets.json')

neg_docs = [(unicodedata.normalize('NFKD', sent).encode('ascii','ignore').split(), -1) for sent in strings_negative]
pos_docs = [(unicodedata.normalize('NFKD', sent).encode('ascii','ignore').split(), 1) for sent in strings_positive]
# print len(neg_docs), len(pos_docs)
# print neg_docs[0]

train_neg_docs = neg_docs[:5000]
train_pos_docs = pos_docs[:5000]
training_docs = train_neg_docs+train_pos_docs

sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
training_set = sentim_analyzer.apply_features(training_docs)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

Example #38
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.twitter import Twitter
import os


var = os.environ
os.environ["TWITTER"] = "C:/Users/admin/Documents/twitter-files"

tw = Twitter()
tw.tweets(keywords='algeria, algerie', limit=10)

# sample from the public stream

from nltk.corpus import twitter_samples
strings = twitter_samples.strings('tweets.20150430-223406.json')
for string in strings[:15]:
    print(string)
##Example: 
#1) Scrape tweets from Twitter that have #bucs, #buccaneers, or #siegetheday in their text and are in English
#2) Save these tweets as a row to a .csv file
import csv
import twitterscraper
with open('nflbucs.csv','a',newline = '',encoding='utf-8') as fil:
    writer = csv.writer(fil)
    for tweet in twitterscraper.query_tweets("%23bucs%20OR%20%23buccaneers%20OR%20%23siegetheday%20lang%3Aen%20include%3Aretweets", 1000):
        writer.writerow(tweet)


####Train classifier based on tweet data
#0) Load data and setup
from nltk.corpus import twitter_samples

##take a sample of data
twitter_samples.strings('positive_tweets.json')[1]
twitter_samples.strings('negative_tweets.json')[1]

##create function word_feats() to turn string into a dictionary
def word_feats(words):
    return dict([(word, True) for word in words])
 
 
#1) a) Tokenize tweets from sample data
#b) Use word_feats() to create a dictionary out of the tokenized words
#c) Create list variable of positive and negative features using the dictionary from (b) and append 'pos' or 'neg'
import nltk
posfeats = [(word_feats(nltk.TweetTokenizer(preserve_case = False).tokenize(row)),'pos') for row in twitter_samples.strings('positive_tweets.json')]
len(posfeats) #check length - equivalent to number of tweets
negfeats = [(word_feats(nltk.TweetTokenizer(preserve_case = False).tokenize(row)),'neg') for row in twitter_samples.strings('negative_tweets.json')]
len(negfeats) #check length - equivalent to number of tweets
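The listing stops after building posfeats and negfeats. A possible continuation for step 2 — splitting the features and training an NLTK Naive Bayes classifier — could look like this (the 80/20 split is an illustrative choice, not part of the original):

#2) Train a Naive Bayes classifier on the features and check its accuracy
import random
random.shuffle(posfeats)
random.shuffle(negfeats)
pos_cutoff = int(len(posfeats) * 0.8)
neg_cutoff = int(len(negfeats) * 0.8)
trainfeats = posfeats[:pos_cutoff] + negfeats[:neg_cutoff]
testfeats = posfeats[pos_cutoff:] + negfeats[neg_cutoff:]
classifier = nltk.NaiveBayesClassifier.train(trainfeats)
print('accuracy:', nltk.classify.accuracy(classifier, testfeats))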