Example No. 1
import argparse

# NB, load_dev_set, and save_results are assumed to come from this
# project's own modules; only the missing stdlib import is added here.

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='path to the dev/test data to classify')
    args = parser.parse_args()

    test_data_path = args.path
    ids, sentences = load_dev_set(test_data_path)

    nb = NB()
    nb.load_model('./nbmodel.txt')

    results = []

    # classify each sentence with the pre-trained model
    for s in sentences:
        label = nb.classify(s)
        results.append(label)

    save_results('./nboutput.txt', results, ids)
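The helpers NB, load_dev_set, and save_results belong to this project and are not shown above. Purely as an illustration of the expected shape, a hypothetical save_results that writes one tab-separated "id, label" line per prediction might look like the sketch below; the project's real implementation may differ.

# Hypothetical stand-in for the project's save_results helper;
# the real implementation and output format are not shown above.
def save_results(path, results, ids):
    with open(path, 'w') as f:
        for id_, label in zip(ids, results):
            f.write('%s\t%s\n' % (id_, label))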
Example No. 2
import json
import os
import sys
import time
import urllib

# NaiveBayesClassifier and the ANSI color constants (OKBLUE, OKGREEN,
# WARNING, FAIL, ENDC) are assumed to be defined elsewhere in this
# project; only the standard-library imports are added here.
# Note: this example is Python 2 code (print statements, raw_input,
# urllib.urlopen).

def main():
    os.system("clear")
    print "Sentiment Analysis by Luca Giacomel. Disclaimer: this very simple algorithm probably won't work, but it might be worth a try."
    
    def update_progress(progress,current_operation_message,p):
        df=2 #dimension factor, len of the graph = 100/df
        sys.stdout.write('\r[{0}{1}] {2}% (Page: {4}) Current operation: {3}\r\r'.format('#'*(progress/df)," "*(100/df-(progress/df)), progress,current_operation_message,p))
        sys.stdout.flush()
    
    load_from_hd="n"
    
    if os.path.exists("/tmp/db.bin") and os.path.exists("/tmp/neg.tweets") and os.path.exists("/tmp/pos.tweets"):
        proceed=raw_input("I found some tweets already stored, do you want me to use them [y=Yes | n=No | a=Append]? [y/N/a] ").lower()
        while proceed not in ["","y","n","a"]:
            proceed=raw_input("I found some tweets already stored, do you want me to use them [y=Yes | n=No | a=Append]? [y/N/a] ").lower()
        load_from_hd=proceed.lower()
            
    if load_from_hd=="y" or load_from_hd=="":
        test_tweets=[]
        nb=NaiveBayesClassifier(db_path="/tmp/db.bin",categories=['positive','negative'])
        print "Done. Read a db of %s words" % len(nb.db)
        search_value=raw_input("What keyword do you want to use to perform the analysis? (you can use @ # :) :( as special operators) ")
        print "Downloading 30 tweets for keyword %s..." % search_value
        z=json.loads(urllib.urlopen("http://search.twitter.com/search.json?q=%s&rpp=30&lang=en" % (urllib.quote(search_value))).read())
        print "Done."
        for m in z['results']:
            test_tweets.append(m['text'])
        
                    
    elif load_from_hd=="n" or load_from_hd=="a":
        pages_to_load=raw_input("How many pages should I load? [default=20] ")
        while 1:
            try:
                if pages_to_load=="":
                    pages_to_load=20
                    break
                pages_to_load=int(pages_to_load)
                break
            except:
                pages_to_load=raw_input("How many pages should I load? [default=20] ")
        
        if load_from_hd=="a":
            pos_tweets=json.load(open("/tmp/pos.tweets"))
            neg_tweets=json.load(open("/tmp/neg.tweets"))
        else:
            pos_tweets,neg_tweets=[],[]
        
        for p in range(1,pages_to_load+1):
            perc=int(float(p*100)/pages_to_load)
            isleep=0
            cycle=True
            while 1:
                try:
                    if cycle:
                        raw_pos_tweets=json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p,urllib.quote(":)"))).read())
                        raw_neg_tweets=json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p,urllib.quote(":("))).read()) 
                        if len(neg_tweets)<len(pos_tweets):
                            cycle=False
                    else:
                        raw_neg_tweets=json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p,urllib.quote(":("))).read()) 
                        raw_pos_tweets=json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p,urllib.quote(":)"))).read())
                        if len(neg_tweets)>len(pos_tweets):
                            cycle=True
                    # bare lookups: raise KeyError if either response lacks 'results',
                    # which the except below catches and retries with backoff
                    raw_pos_tweets['results'],raw_neg_tweets['results']
                    time.sleep(1)
                    for i in raw_pos_tweets['results']:
                        if pos_tweets.count((i['text'],'positive'))==0:
                            pos_tweets.append((i['text'],'positive'))
                    for i in raw_neg_tweets['results']:
                        if neg_tweets.count((i['text'],'negative'))==0:
                            neg_tweets.append((i['text'],'negative'))
                    update_progress(perc, "Elements: %s positive, %s negative." % (len(pos_tweets),len(neg_tweets)),p)
                    break
                except:
                    update_progress(perc, "Failed to fetch the json, trying again in %s seconds" % 2**isleep ,p)
                    time.sleep(2**isleep)
                    isleep+=1
                    if 2**isleep>64:
                        update_progress(perc, "Load time >64sec. Skipping page.. "+str(p),p)
                        break                    
        update_progress(perc, "\n",p)
        open("/tmp/pos.tweets","w").write(json.dumps(pos_tweets))
        open("/tmp/neg.tweets","w").write(json.dumps(neg_tweets))

        index=min(len(pos_tweets),len(neg_tweets))
        test_tweets=[]
        search_value=raw_input("What keyword do you want to use to perform the analysis? (you can use @ # :) :( as special operators) ")
        print "Downloading 30 tweets for keyword %s..." % search_value
        z=json.loads(urllib.urlopen("http://search.twitter.com/search.json?q=%s&rpp=30&lang=en" % (urllib.quote(search_value))).read())
        print "Done."
        for m in z['results']:
            test_tweets.append(m['text'])
        print "Training the classifier. This might take a while, grab a coffee while I work."

        # start the timer here so the reported duration covers only training,
        # not the user prompt and test-tweet download above
        training_start=time.time()
        nb=NaiveBayesClassifier(db={},categories=['negative','positive'])
        nb.train(pos_tweets[:index]+neg_tweets[:index])
        
        print "Done. Training based on a set of %s elements took %s seconds." % (index*2,time.time()-training_start)
    
    for tx in test_tweets:
        print "Tweet: "+OKBLUE+tx+ENDC
        r=nb.classify(tx.lower())
        if r=="positive":
            print "Result: "+OKGREEN+r+ENDC
        elif r=="negative":
            print "Result: "+FAIL+r+ENDC
        #else:
        #    print "Result: "+WARNING+"neutral (was %s with accuracy %s)" % (r[0],r[1]) +ENDC
            
    nb.save_to_hard_disk()
    
    nb.show_most_informative()
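Example No. 2 also relies on ANSI color constants (OKBLUE, OKGREEN, WARNING, FAIL, ENDC) defined elsewhere in the project. A minimal sketch of how such constants are commonly defined, assuming the usual terminal escape codes rather than the project's actual values:

# Assumed ANSI escape-code constants; the project's real definitions are not shown.
OKBLUE = '\033[94m'   # blue text
OKGREEN = '\033[92m'  # green text
WARNING = '\033[93m'  # yellow text
FAIL = '\033[91m'     # red text
ENDC = '\033[0m'      # reset terminal colors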
Example No. 3
from naive_bayes import NaiveBayesClassifier
from data_harvester import data
import random
import pdb
from collections import defaultdict

# split_data is assumed to come from another module in this project;
# it splits the harvested data into training and test subsets.

random.seed(0)
train_data, test_data = split_data(data, 0.75)
print("train_data_length = %s" % len(train_data))
print("test_data_length = %s" % len(test_data))

classifier = NaiveBayesClassifier()
# pdb.set_trace()
classifier.train(train_data)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

true_positives = []
true_negatives = []
false_positives = []
false_negatives = []
subject, classification, predicted_prob = 0, 1, 2
for my_tuple in classified:
    is_spam = my_tuple[classification]
    predict_is_spam = (my_tuple[predicted_prob] > 0.5)
    if is_spam and predict_is_spam:
        true_positives.append(my_tuple[subject])
    elif is_spam and not predict_is_spam:
        false_negatives.append(my_tuple[subject])
    elif not is_spam and predict_is_spam:
        false_positives.append(my_tuple[subject])
    else:
        true_negatives.append(my_tuple[subject])
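The original snippet breaks off after filling the buckets; a short sketch, not part of the original example, of how precision, recall, and accuracy could then be computed from them using only the names defined above:

# Summarize the confusion counts collected above (illustrative sketch).
tp, tn = len(true_positives), len(true_negatives)
fp, fn = len(false_positives), len(false_negatives)

precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
accuracy = float(tp + tn) / len(classified) if classified else 0.0

print("precision=%.3f recall=%.3f accuracy=%.3f" % (precision, recall, accuracy))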
Example No. 4
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from naive_bayes import NaiveBayesClassifier

# Load the dataset
spams = pd.read_csv("spam.csv", engine="python")

# Clean the DataFrame
spams = spams.dropna(axis=1)
spams.columns = ["spam", "body"]
spams = spams[["body", "spam"]]

# Encode the label
spams["spam"] = LabelEncoder().fit_transform(spams["spam"])

emails = spams["body"]
labels = spams["spam"]

X_train, X_test, y_train, y_test = train_test_split(emails,
                                                    labels,
                                                    test_size=0.3)

train_data = pd.concat([X_train, y_train], axis=1)

# Train and classify
nc = NaiveBayesClassifier()
nc.train(train_data)
print(nc)
print(nc.classify("sign up today and win a prize"))
print(nc.classify("At what time would you like to meet"))

# Note: this type of model works better on small datasets
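The split also produces X_test and y_test, which the snippet never uses. A rough sketch of a held-out evaluation, under the assumption that classify() returns the same encoded 0/1 labels produced by the LabelEncoder (that return type is not shown above):

# Hypothetical held-out evaluation; assumes nc.classify() returns the
# encoded 0/1 labels, which is not confirmed by the snippet above.
correct = sum(1 for body, label in zip(X_test, y_test)
              if nc.classify(body) == label)
print("test accuracy: %.3f" % (float(correct) / len(y_test)))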