Example #1
def generateClassifier():
    train = getIntentDataset()

    cl = NaiveBayesClassifier(train)
    cl.show_informative_features(5)    
    path = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/QueryAnalyzer/Models/"
    saveTrainedClassifier(path, cl, "intent_classifier_2.pickle")
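Examples #1 and #3 call a saveTrainedClassifier helper that is not shown. A minimal sketch, assuming the helper simply pickles the trained model under the given directory and file name (the real implementation may differ):

import os
import pickle

def saveTrainedClassifier(path, classifier, name):
    # Assumed behavior: serialize the trained classifier to path/name.
    with open(os.path.join(path, name), "wb") as f:
        pickle.dump(classifier, f)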
Example #2
    def generate_model(self):
        print("Gathering and processing tweets...")
        # Username-label tuples for each tracked account (the combined results are shuffled below)
        tuple_list = usermapping.data_tuples.items()

        # Split and grab tweets for users
        results = utils.flatten([ self.fetch_data(t)
                                  for t in tuple_list ])
         
        # TODO: Cross-validation generation
        trn_ratio = int(len(results) * 0.85)
        shuffle(results)
        print(len(results))
        print(trn_ratio)
        train = results[:trn_ratio]
        test = results[trn_ratio:]

        # Instantiate and train classifier
        print("Training...")
        cl = NaiveBayesClassifier(train)
        cl.train()
        
        # Save model
        print("Saving model...")
        utils.save_model(cl)

        # Classify test
        print("Testing...")
        print("Accuracy: {0}".format(cl.accuracy(test)))
        return cl
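This example relies on project helpers (usermapping.data_tuples, utils.flatten, utils.save_model, self.fetch_data) that are not shown. For reference, a minimal stand-in for utils.flatten, assuming it merges the per-user lists of (text, label) pairs into one flat list:

def flatten(list_of_lists):
    # Collapse [[a, b], [c]] into [a, b, c].
    return [item for sublist in list_of_lists for item in sublist]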
Example #3
def generateIntentionalityClassifier():
    db = dbClient()    
    training = db.training
    cursor = training.find()    
    
    # Reduce the number of records
    crs = list(cursor)
    random.shuffle(crs)
    # Keep a 1% sample of the shuffled records
    p = int(len(crs) * .01)
    cr_test = crs[0:p]        
        
    print "Test", len(cr_test)    
    
    data = []
    t = ""
    for td in cr_test:
        tgram = td["triGram"]
        label = td["label"] 
        #print tgram
        for tg in tgram:
            d = '-'.join(tg)
            t = t + " " + d
        #print t
        data.append((t, label))
        t = ""
    #print data
    cl = NaiveBayesClassifier(data)
    cl.show_informative_features(30)    
    path = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/QueryAnalyzer/Models/"
    saveTrainedClassifier(path, cl, "my_classifier_v6.pickle")
    return cl
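For reference, the inner loop above flattens each stored trigram list into a single space-separated feature string (note the leading space it produces), e.g.:

tgram = [("how", "do", "i"), ("do", "i", "reset")]
t = ""
for tg in tgram:
    t = t + " " + '-'.join(tg)
# t is now ' how-do-i do-i-reset'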
Example #4
def create_sentiment():
    """
        Train sentiment model and save.

        Input type: None 
        Output: Model as pickle 
    """

    random.seed(1)

    test = [
        ("The dude presenting Unravel seems like one of the most genuine game developers Ive ever seen I really hope this game works out for him",'pos'),
        ("His hands are shaking Dude looks so stoked and scared at the same time",'pos'),
        ("Right I just felt like I was watching his dream come true It was nice The game looks very well done as well Good for him",'pos'),
        ("Seriously Unravel looks really good actually and honestly seeing him so happy about what hes made is contagious I want to see more of Unravel ",'pos'),
        ("He was so nervous shaking all over his voice quivering",'neg'),
        ("The game looked nice too very cute art style ",'pos'),
        ("You could tell he genuinely wanted to be there it looked like he was even shaking from the excitement  I hope it works out for them aswell",'pos'),
        ("However following that up with the weird PvZ thing was odd To say the least",'neg'),
        ("Haha The game did look nice though Im definitely going to keep an eye on it I enjoy supporting such hopeful developers",'pos'),
        ("Very personable This looks like a buy for me As a dev in a other sector I appreciate this passion",'pos'),
        ("I want to give him a cookie",'pos'),
        ("Im getting a copy Im gonna support my indie devs",'pos'),
        ("The twitch leak was accurate It was like a play by play you start speaking French then switch to English",'neg'),
        ("yep exactly what i was thinking lol its important to note that the twitch leak never had them saying it was Dishonored 2 but that they were honored to be here very different",'neg'),
        ("Honored  Im 100 sure that was intentional",'neg'),
        ("oh yea for sure but wasnt solid enough evidence imo to be like dishonored 2 confirmed just based off that",'neg'),
        ("The confirmation was who was talking not what they were talking about ",'neg'),
        ("How awkward is it for a pop singer to perform at a video game conference",'neg'),
        ("Oh god did they warn him that he will get zero reaction",'neg'),
        ("I really hope so",'pos'),
        ("Almost as bad as Aisha f*****g up her dialogue constantly Shes doing alright though E3 is really becoming a mainstream media event Hollywood has nothing like this ComicCon is the only comparison and they dont dazzle it up like E3",'neg')
        ]


    # Grab review data
    reviews = [
        (list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)
        ]
    random.shuffle(reviews)

    # Hold out the last 100 of the 2,000 reviews (5%) as a test set
    new_train, new_test = reviews[:1900], reviews[1900:]

    # Train the NB classifier on the train split
    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(test + new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    cl.show_informative_features(5)

    # Save model for use in creating social model sentiment
    with open('sentiment_clf_full.pkl', 'wb') as pk:
        pickle.dump(cl, pk)
    print('done saving model')
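A hypothetical later session would restore and reuse the pickled model like so (assuming the file written above):

import pickle

with open('sentiment_clf_full.pkl', 'rb') as pk:
    cl = pickle.load(pk)
print(cl.classify("The game looks very well done"))  # e.g. 'pos'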
Example #5
def get_analysis(s):

    train = [
        ('I love this sandwich.', 'pos'),
        ('This is an amazing place!', 'pos'),
        ('I feel very good about these beers.', 'pos'),
        ('This is my best work.', 'pos'),
        ("What an awesome view", 'pos'),
        ('I do not like this restaurant', 'neg'),
        ('I am tired of this stuff.', 'neg'),
        ("I can't deal with this", 'neg'),
        ('He is my sworn enemy!', 'neg'),
        ('My boss is horrible.', 'neg')
    ]


    cl = NaiveBayesClassifier(train)

    tweets = Tweet.objects.filter(search_term = s)

    result = []

    for t in tweets:
        d = {}
        c = cl.classify(t.tw_text)
        d['text'] = t.tw_text
        d['res'] = c
        result.append(d)

    return result
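Hypothetical usage, assuming a Django Tweet model with search_term and tw_text fields as above:

for item in get_analysis("unravel"):
    print(item['res'], item['text'])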
Example #6
 def __init__(self):
     training_data = self._load_data("data")
     self.category_classifier = NaiveBayesClassifier(
         [(x[0], x[1]) for x in training_data])
     self.avoidability_classifier = NaiveBayesClassifier(
         [(x[0], x[2]) for x in training_data])
     self.ordinary_classifier = NaiveBayesClassifier(
         [(x[0], x[3]) for x in training_data])
Example #7
def train_n_test(file_path):
	documents= load_data(file_path)
	random.shuffle(documents)
	generate_bigrams(data.wordlist)	
	train = documents[0:110]
	test = documents[110:]
	#classifier = NaiveBayesClassifier(train)
	#classifier = NaiveBayesClassifier(train,feature_extractor=get_features)
	classifier = NaiveBayesClassifier(train,feature_extractor=get_feats)
	print classifier.accuracy(test)
Example #8
class NaiveBayesAnalyzer:
    cl = None

    def __init__(self):
        with open("training_data.json", "r") as f:
            self.cl = NaiveBayesClassifier(f, format="json")
        self.cl.show_informative_features(20)

    def analyze(self, text):
        return self.cl.classify(text)
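TextBlob's format="json" loader expects the file to contain a list of objects with "text" and "label" keys, so training_data.json would look something like:

[
    {"text": "I love this sandwich.", "label": "pos"},
    {"text": "My boss is horrible.", "label": "neg"}
]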
Example #9
class LanguageDetector(object):
    def __init__(self, train=SAMPLE_TRAIN, feature_extractor=FeatureExtractors.last_word_extractor()):
        self.train = train
        self.classifier = NaiveBayesClassifier(self.train, feature_extractor)
    
    def accuracy(self, test_set=SAMPLE_TEST):
        return self.classifier.accuracy(test_set)

    def show_features(self):
        return self.classifier.show_informative_features(5)
Example #10
def main():
    json = raw_input("Where is the json training set?")
    print "Program start", time.ctime() #debug
    with open(json, 'r') as file:
        classifier = NaiveBayesClassifier(file, format='json')
        print "Classifier done!", time.ctime() #debug
    test = raw_input("Where is the test eml_folder?")
    print "Testing...", time.ctime()
    for emails in dir_list(test):
        print classifier.classify(emails)
    print "Testing done", time.ctime()
Example #11
def run_test(train, test, name):
   print "Training..."
   cll = NaiveBayesClassifier(train)
   print "Done training\n"
   accuracy = cll.accuracy(test)
   print "Accuracy: " + str(accuracy)

   # get matching lists of predicted and true labels
   pred_labels = list()
   true_labels = list()
   for obj in test:
      prob_label = cll.prob_classify(obj[0]).max()
      true_label = obj[1]
      true_labels.append(true_label)
      pred_labels.append(prob_label)

   # transform our labels to numbers
   labels = cll.labels()
   i = 0
   label_num = dict()
   for label in labels:
      label_num[label] = i
      i = i + 1

   # match our predicted and true labels with the number representations
   true_label_nums = list()
   pred_label_nums = list()
   for true_l, pred_l in zip(true_labels, pred_labels):
      true_label_nums.append(label_num[true_l])
      pred_label_nums.append(label_num[pred_l])

   cm = confusion_matrix(true_label_nums, pred_label_nums)
   print cm
   print "\n"

   with open("test_results.txt", "a") as tr:
      tr.write(str(name) + "\n")
      tr.write(str(accuracy) + "\n")
      tr.write(str(cm))
      tr.write("\n\n")

   import matplotlib.pyplot as plt
   fig = plt.figure()
   ax = fig.add_subplot(111)
   cax = ax.matshow(cm)
   plt.title("Confusion Matrix For "+name)
   fig.colorbar(cax)
   ax.set_xticklabels(['']+labels)
   ax.set_yticklabels(['']+labels)
   plt.xlabel("Predicted")
   plt.ylabel("True")
   plt.savefig('plots/'+name+'.pdf', bbox_inches='tight') 
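confusion_matrix here is assumed to come from scikit-learn, i.e. the example's imports (not shown) would include:

from sklearn.metrics import confusion_matrix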
Example #12
def main():
	print "This is Naive Bayes' Classifier..."

	#read training data
	#training_data = open("training_data").readlines()
	training_data = open("training_data_final").readlines()
	#load training data
	training_tuples = loadData(training_data)

	training_tuples_api = make_api_tuples(training_tuples)
	print training_tuples_api

	#display tuples
	#for t in training_tuples:
	#	t.show()

	#gather classes
	classes = filterClasses(training_tuples)
	#print "classes = ", classes

	#gather vocab
	vocab = getVocab(training_tuples)
	#print vocab

	#generate prior
	prior = generatePrior(training_tuples, classes)
	#print prior

	#generate likelihood
	likelihood = generateLikelihood(training_tuples, vocab, classes)
	#print likelihood

	#read test data
	#read test data
	#test_data = open("test_data").readlines()
	test_data = open("test_data_final").readlines()
	#load test data
	test_tuples = loadData(test_data)

	test_tuples_api = make_api_tuples(test_tuples)
	#calculate C-MAP
	posterior = predict(test_tuples, classes, prior, likelihood)
	showResults(training_data, test_data, posterior)

	#calculate accuracy
	evaluateAccuracy(test_tuples, posterior)

	#Naive Bayes API
	cl = NaiveBayesClassifier(training_tuples_api)
	# Compute accuracy
	print("Accuracy: {0}".format(cl.accuracy(test_tuples_api)))
Example #13
def train(pos_examples, neg_examples, train_fraction=0.6):
    """Train a classifier, holding out train_fraction of pos_examples and neg_examples as a test set.
    Return the tuple:
        
        (the classifier, accuracy, positive test example list, negative test example list, )

    """

    pos_split = int(train_fraction * len(pos_examples))
    pos_train, pos_test = pos_examples[0:pos_split], pos_examples[pos_split:]
    neg_split = int(train_fraction * len(neg_examples))
    neg_train, neg_test = neg_examples[0:neg_split], neg_examples[neg_split:]

    cl = NaiveBayesClassifier(pos_train + neg_train)
    return cl, cl.accuracy(pos_test + neg_test), pos_test, neg_test
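Hypothetical usage with two small labeled lists:

pos = [("great product", "pos"), ("love it", "pos"), ("works well", "pos")]
neg = [("total waste", "neg"), ("broke quickly", "neg"), ("hate it", "neg")]
cl, acc, pos_test, neg_test = train(pos, neg, train_fraction=0.6)
print("held-out accuracy:", acc)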
Example #14
 def train(self, train_set):
     train_data = []
     for t in train_set:
         train_data.append((self._cvobj_to_string(t[0]),t[1]))
     print "Training model..."
     #print train_data
     self.cl = NaiveBayesClassifier(train_data)
Example #15
class TimeLogicAdapter(LogicAdapter):
    """
    The TimeLogicAdapter returns the current time.
    """

    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)

        training_data = [
            ('what time is it', 1),
            ('do you know the time', 1),
            ('do you know what time it is', 1),
            ('what is the time', 1),
            ('it is time to go to sleep', 0),
            ('what is your favorite color', 0),
            ('i had a great time', 0),
            ('what is', 0)
        ]

        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        now = datetime.now()

        confidence = self.classifier.classify(statement.text.lower())
        response = Statement('The current time is ' + now.strftime('%I:%M %p'))

        return confidence, response
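Note that classify() returns the predicted label (0 or 1), which this adapter reuses directly as its confidence. A variant of process() using prob_classify() for a real probability (a sketch, not the adapter's actual code):

    def process(self, statement):
        now = datetime.now()
        # prob_classify returns a distribution, so confidence can be the
        # probability of the "time question" label rather than the label itself.
        prob_dist = self.classifier.prob_classify(statement.text.lower())
        confidence = prob_dist.prob(1)
        response = Statement('The current time is ' + now.strftime('%I:%M %p'))
        return confidence, response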
Example #16
class TimeLogicAdapter(LogicAdapter):

    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)

        training_data = [
            ("what time is it", 1),
            ("do you know the time", 1),
            ("do you know what time it is", 1),
            ("what is the time", 1),
            ("do you know the time", 0),
            ("it is time to go to sleep", 0),
            ("what is your favorite color", 0),
            ("i had a great time", 0),
            ("what is", 0)
        ]

        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        now = datetime.now()

        confidence = self.classifier.classify(statement.text.lower())
        response = Statement("The current time is " + now.strftime("%I:%M %p"))

        return confidence, response
Example #17
class HelpLabeler(object):
    HELP_DATA = 'help_data.json'
    def __init__(self):
        with open(self.HELP_DATA, 'r') as fp:
            self.c = NaiveBayesClassifier(fp, format="json")
        with open(self.HELP_DATA, 'r') as fp:
            self.help_json = {}
            for i in json.load(fp):
                self.help_json[i['text']] = i['label']

    def get_label(self, text, lower_placeholders=[]):
        text = text.lower()
        self.save_help(text)
        prob_dist = self.c.prob_classify(text)
        label = prob_dist.max()
        prob = round(prob_dist.prob(label), 2)
        if prob > 0.7:
            return(label)
        else:
            return(None)

    def save_help(self, lower_text):
        try:
            self.help_json[lower_text]
        except KeyError:
            self.help_json[lower_text] = 'unknown'

        with open(self.HELP_DATA, 'w') as fp:
            json.dump([{'text': k, 'label': v} for k, v in self.help_json.items()], fp, indent=4)
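Hypothetical usage; get_label() returns a label only when the classifier is at least 70% confident, and save_help() records unseen questions with an 'unknown' label for later annotation:

labeler = HelpLabeler()
print(labeler.get_label("how do i reset my password"))  # a label, or None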
Example #18
 def __init__(self):
     with open(self.LABELS_DATA, 'r') as fp:
         self.c = NaiveBayesClassifier(fp, format="json")
     with open(self.LABELS_DATA, 'r') as fp:
         self.labels_json = {}
         for i in json.load(fp):
             self.labels_json[i['text']] = i['label']
Example #19
 def __init__(self, **kwargs):
     super(WikipediaAdapter, self).__init__(**kwargs)
     training_data = [
         ("what do you know about", 1),
         ('what is', 1),
         ('who is', 1),
         ('who was', 1),
         ('where is',1),
         ('Could you tell me', 1),
         ('what can you tell me about', 1),
         ("what's trending in ",0),
         ('what is trending in', 0),
         ('what is going on with', 0),
         ('how are you', 0),
         ('how is', 0),
         ('how are', 0),
         ('how will', 0),
         ('how would you', 0),
         ('what people are talking about', 0),
         ('what are reviews', 0),
         ('what the', 0),
         ('do you know the time', 0),
         ('it is time to go to sleep', 0),
         ('what is your favorite color', 0),
         ('i had a great time', 0),
         ('what time is it', 0),
         ('do you know the time', 0),
         ('do you know what time it is', 0),
         ('what is the time', 0),
         ('how are you?', 0),
         ('any clue about', 1),
         ('located',1)
     ]
     self.classifier = NaiveBayesClassifier(training_data)
Example #20
    def __init__(self, data="AllData.csv"):
        """Load in the previous data (by default from AllData.csv) and initialise the classifier"""
        if os.path.exists(data):
            self.prev_data = pd.read_csv(data)
        else:
            self.prev_data = pd.DataFrame(columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(self._get_training(self.prev_data), self._extractor)
Example #21
class TimeLogicAdapter(LogicAdapter):
    """
    The TimeLogicAdapter returns the current time.
    """

    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)

        training_data = [
            ("what time is it", 1),
            ("do you know the time", 1),
            ("do you know what time it is", 1),
            ("what is the time", 1),
            ("do you know the time", 0),
            ("it is time to go to sleep", 0),
            ("what is your favorite color", 0),
            ("i had a great time", 0),
            ("what is", 0)
        ]

        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement, tag_processing = None):

        user_input = statement.text.lower()
        if "time" not in user_input:
            return 0, Statement("")

        try:
            # Find the time zone of the user based on latitude and longitude to get the correct time
            g          = geocoders.GoogleV3()
            user       = tag_processing.user
            lat,lon    = user.get_latitude_longitude()
            timezone   = g.timezone((lat,lon))

            now = datetime.now(timezone)

            confidence = self.classifier.classify(user_input)
            response = Statement("The current time is " + now.strftime("%I:%M %p"))
        except Exception:
            confidence = self.classifier.classify(user_input)
            response = Statement("Sorry. I cannot find the current time. Possible bad user location based on latitude and longitude. Please try again later")

        return confidence, response
Example #22
class TestValidators(TestCase):

    def setUp(self):
        self.data = StringIO('{}')
        self.classifier = NaiveBayesClassifier(self.data, format='json')
        self.classifier.update([
            ('spam spam spam', 'spam'),
            ('this is not spam', 'valid'),
        ])

        self.mock_classifier_get = mock.patch.object(
            ClassifierValidator,
            'get_classifier',
            mock.Mock(return_value=self.classifier)
        )
        self.patch_classifier_get = self.mock_classifier_get.start()

    def test_validator_pass(self):
        validate = ClassifierValidator()
        validate('this is totally legit')

    def test_validator_invalid(self):
        validate = ClassifierValidator()
        with self.assertRaises(ValidationError):
            validate('spam spammy spam')

    def test_validator_invalid_different_exception(self):
        validate = ClassifierValidator(raises=ValueError)
        with self.assertRaises(ValueError):
            validate('spam spammy spam')

    @mock.patch('textclassifier.classifier.TEXTCLASSIFIER_DATA_FILE', '')
    def test_open_file_failure(self):
        """Open file, but still validate after errors"""
        self.mock_classifier_get.stop()
        mod_name = ('builtins', '__builtin__')[(sys.version_info < (3,0))]
        with mock.patch('{0}.open'.format(mod_name)) as mocked_open:
            mocked_open.side_effect = IOError
            with self.assertRaises(IOError):
                DefaultClassifier()
            validate = ClassifierValidator()
            validate('spam spam spam')
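The setUp() trick of seeding a classifier from an in-memory JSON document and then calling update() also works outside tests; a minimal sketch (using an empty JSON list instead of '{}'):

from io import StringIO
from textblob.classifiers import NaiveBayesClassifier

clf = NaiveBayesClassifier(StringIO('[]'), format='json')
clf.update([('spam spam spam', 'spam'), ('this is not spam', 'valid')])
print(clf.classify('spam spammy spam'))  # 'spam'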
Example #23
 def __setitem__(self,key,value):
     self.intents[key] = value
     
     # train classifier on the intent's phrase file
     with open(value.phrases, 'r') as phrase_file:
         phrase_data = yaml.safe_load(phrase_file)
     phrases = [(phrase, value.name) for phrase in phrase_data['Phrases']]
     
     if self.classifier:
         self.classifier.update(phrases)
     else:
         self.classifier = Classifier(phrases)
Example #24
class ExpenseClassifier:

    def __init__(self):
        training_data = self._load_data("data")
        self.category_classifier  = NaiveBayesClassifier([(x[0], x[1]) for x in  training_data])
        self.avoidability_classifier = NaiveBayesClassifier([(x[0], x[2]) for x in  training_data])
        self.ordinary_classifier =  NaiveBayesClassifier([(x[0], x[3]) for x in  training_data])

    def classify(self, description):
        res = {}
        res['category'] = self.category_classifier.classify(description)
        res['avoidable'] = self.avoidability_classifier.classify(description)
        res['ordinary'] = self.ordinary_classifier.classify(description)
        return res

    def accuracy(self):
        test_data = self._load_data("test")
        res = {}
        res['category'] = self.category_classifier.accuracy([(x[0], x[1]) for x in test_data])
        res['avoidable'] = self.avoidability_classifier.accuracy([(x[0], x[2]) for x in test_data])
        res['ordinary'] = self.ordinary_classifier.accuracy([(x[0], x[3]) for x in test_data])
        return res

    def _load_data(self, folder):
        data = []
        for f in glob.glob(folder + "/*.csv"):
            with open(f) as csvfile:
                spamreader = csv.reader(csvfile, delimiter=',')
                for row in spamreader:
                    if row[DESCRIPTION] and row[CATEGORY] and row[AVOIDABLE] and row[ORDINARY]:
                        data.append((norm(row[DESCRIPTION]), row[CATEGORY], row[AVOIDABLE], row[ORDINARY]))
        return data
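_load_data() indexes each CSV row with DESCRIPTION, CATEGORY, AVOIDABLE and ORDINARY constants and a norm() helper defined elsewhere in the module; plausible definitions, assuming a four-column layout (these are guesses, not the source's code):

DESCRIPTION, CATEGORY, AVOIDABLE, ORDINARY = 0, 1, 2, 3

def norm(text):
    # Assumed normalization: lowercase and collapse whitespace.
    return " ".join(text.lower().split())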
Example #25
def main(argv=0):
    nBObj = naiveBayes()
    businessId = nBObj.deriveBusinessId('yelp_academic_dataset_business.json')
    print len(businessId)
    businessId = businessId[:10]
    train = nBObj.getTrainData('yelp_academic_dataset_review.json',businessId)
 
    print train
    cl = NaiveBayesClassifier(train)

    cl.show_informative_features(20)  # prints the features itself; the call returns None
    print "Opening the file..."
    target = open("naiveBayesResult.txt", 'w')

    for (sentence,rating) in nBObj.testSentences:
        clOutput = nBObj.testSentence(sentence,cl)
        strToWrite = str(rating) + "\t" + clOutput
        target.write(strToWrite)
        target.write("\n")
      
    target.close()
    nBObj.calcAccuracy()
Example #26
def nayebayesreport(fileFullPath):
    print  "nayebayesreport came"
    print (fileFullPath)
    sentimentDtls = []
    patternCountMap = {
                       "Negative" : 0,
                       "Positive" : 0,
                       "Neutral" : 0,
                       "Total" : 0,
                       }
    
    
    cl = NaiveBayesClassifier(getTrainData())

    print "train data loaded"
    with open(fileFullPath, 'r') as f:
        for line in f:
            try:
                print line
                if line and len(line.strip()) > 0:
                    trainedResult = cl.classify(line)
                        
                    patternResult = "Negative"
                    if "pos" == trainedResult:
                        patternResult = "Positive"
                    
                    patternCountMap[patternResult] = patternCountMap[patternResult] + 1
                    patternCountMap["Total"] = patternCountMap["Total"] + 1
                    
                    sentimentDtls.append({
                                          "sentiment" : patternResult,
                                          "feedback" : line
                                         })
            except Exception:
                print(traceback.format_exc())
                print(line)
    
    addBayesClassifierResult(sentimentDtls)
    return
Example #27
class TwitterTrendAdapter(LogicAdapter):
    def __init__(self, **kwargs):
        super(TwitterTrendAdapter, self).__init__(**kwargs)

        training_data = [
            ("what's trending in ", 1),
            ('what is trending in', 1),
            ('what is', 0),
            ('who is', 0),
            ('who was', 0),
            ('what can you tell me about', 0),
            ('what do you know about', 0),
            ('any clue about', 0),
            ('where is',0),
            ('located', 0),
            ('what is happening', 1)
        ]

        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        confidence = self.classifier.classify(statement.text.lower())
        tokens = nltk.word_tokenize(str(statement))
        tagged = nltk.pos_tag(tokens)
        nouns = [word for word, pos in tagged if (pos == 'NN' or pos == 'NNP' or pos =='JJ' or pos == 'NNS' or pos == 'NNPS')]
        auth = OAuthHandler(twitter_consumer_key, twitter_consumer_secret)
        auth.set_access_token(twitter_access_key, twitter_access_secret)
        api = tweepy.API(auth)
        trendsName = ""
        for noun in nouns:
            try:
                html = urllib.urlopen(
                    'http://where.yahooapis.com/v1/places.q(' + noun + ')?appid=' + yahoo_client_Id).read()
                soup = BeautifulSoup(html, 'html.parser')
                woeids = soup.find('woeid').contents
                for woeid in woeids:
                    id = ' '.join(woeid.string.split())
                    trends1 = api.trends_place(str(id))
                    data = trends1[0]
                    # grab the trends
                    trends = data['trends']
                    names1 = [trend['name'] for trend in trends]
                    trendsName += ' '.join(names1)
            except:
                pass
        if len(nouns) != 0 and len(trendsName)!=0:
            response = Statement("Jarvis: "+trendsName)
        else:
            response = Statement("")
            confidence=0
        return confidence, response
Example #28
def create_sentiment_model():

    random.seed(1)

    # Grab some movie review data
    reviews = [(list(movie_reviews.words(fileid)), category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]
    random.shuffle(reviews)
    new_train, new_test = reviews[:1900], reviews[1900:]

    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    cl.show_informative_features(5)

    with open('sentiment_clf_full.pkl', 'wb') as pk:
        dill.dump(cl, pk)
    print('done saving model')
Example #29
    def __init__(self, **kwargs):
        super(WeatherLogicAdapter, self).__init__(**kwargs)

        self.tagger = POSTagger()
        self.pyowm_api_key = kwargs.get("pyowm_api_key")
        self.DAYS = {0:'Monday',1:'Tuesday',2:'Wednesday',3:'Thursday',4:'Friday',5:'Saturday',6:'Sunday'}

        ## Train a classifier to recognize when a user is asking for the weather forecast
        ## around them or for a particular area.
        ## Data labeled with `1` is for around user, label with `0` is for particular area.
        training_data = [
                ("get the weather", 1),
                ("get weather", 1),
                ("what is the weather", 1),
                ("what's the weather", 1),
                ("whats the weather", 1),
                ("whats the forecast", 1),
                ("what is the weather", 1),
                ("what is the forecast", 1),
                ("what the weather", 1),
                ("what the forecast", 1),
                ("tell me the weather", 1),
                ("do you know the weather", 1),
                ("get the forecast", 1),
                ("get forecast", 1),
                ("what is the forecast", 1),
                ("what's the forecast", 1),
                ("tell me the forecast", 1),
                ("do you know the forecast", 1),
                ("around me", 1),
                ("my weather", 1),
                ("my forecast", 1),
                ("my extended weather", 1),
                ("my extended forecast", 1),
                ("check the weather", 1),
                ("check the forecast", 1),

                ("weather in", 0),
                ("weather for", 0),
                ("what is the weather in", 0),
                ("what is the weather for", 0),
                ("forecast in", 0),
                ("forecast for", 0),
                ("what is the forecast in", 0),
                ("what is the forecast for", 0),
                ("check the weather in", 0),
                ("check the forecast in", 0),
            ]

        self.classifier = NaiveBayesClassifier(training_data)
Example #30
    def setUp(self):
        self.data = StringIO('{}')
        self.classifier = NaiveBayesClassifier(self.data, format='json')
        self.classifier.update([
            ('spam spam spam', 'spam'),
            ('this is not spam', 'valid'),
        ])

        self.mock_classifier_get = mock.patch.object(
            ClassifierValidator,
            'get_classifier',
            mock.Mock(return_value=self.classifier)
        )
        self.patch_classifier_get = self.mock_classifier_get.start()
Example #31
 def __init__(self, art_dict, search_term=''):
     self.article_blobs_dict = self.create_clean_sentences(art_dict, search_term)
     self.classifier = NaiveBayesClassifier(self.generate_training_data(self.article_blobs_dict))
     print('finished training classifier')
Example #32
class QueryAdapter(LogicAdapter):
    def __init__(self, **kwargs):
        super(QueryAdapter, self).__init__(**kwargs)
        training_file = '%s/../database/%s.json' % (os.path.dirname(
            os.path.realpath(inspect.getfile(
                self.__class__))), kwargs.get('training_file_for_query'))
        training_database = json.load(open(training_file))['data']
        training_data = [(data, int(classe)) for classe in ["0", "1"]
                         for data in training_database[classe]]
        self.classifier = NaiveBayesClassifier(training_data)

        self.partners = Database('partners_fake',
                                 parse_db=True)  #default fixed to partners
        self.fields = Database(
            'fields')  #lexical fields of different features in db
        self.clf = self.train_feature_finder(
            self.fields.db, RandomForestClassifier(n_estimators=20))

    def process(self, statement):

        confidence = self.classifier.classify(statement.text.lower())
        entry, query = self.build_query(statement.text.lower())
        if entry is None:
            if query is None:
                response = Statement(
                    'Je sais que tu cherches a savoir quelquechose sur les partners LV, mais je vais avoir besoin que tu me clarifies tout ça !'
                )
            else:
                response = Statement(
                    "Hmm tu cherches un/une %s. Donne moi plus d'indice !" %
                    query)
        else:
            if query is None:
                response = Statement(
                    "Hmm tu cherches une info a propos de %s... Dis moi ce que tu veux savoir exactement stp !"
                    % entry[1])
            else:
                element = self.get_element(entry, query)
                response = Statement(
                    "Voila ce que j'ai pour toi : \n\t %s, %s" %
                    (entry[1], element))

        return confidence, response

    def train_feature_finder(self, training_db, clf):
        training_sentences = []
        c = 0
        training_classes = []
        self.class_names = []
        self.vectorizer = CountVectorizer(analyzer = "word",   \
                              tokenizer = None,    \
                              preprocessor = None, \
                              stop_words = None,   \
                              max_features = 500)
        for key, value in training_db.iteritems():
            training_sentences += value
            training_classes += [c for i in range(len(value))]
            c += 1
            self.class_names.append(key)
        train_data_features = self.vectorizer.fit_transform(training_sentences)
        train_data_features = train_data_features.toarray()
        clf = clf.fit(train_data_features, training_classes)
        return clf

    def predict_feature(self, sentence):
        sentence_vect = self.vectorizer.transform([sentence])
        sentence_vect = sentence_vect.toarray()
        class_id = self.clf.predict(sentence_vect)
        class_id = class_id[0]
        feature = self.class_names[class_id]
        return feature

    def predict_filter_key(self, sentence):
        for chunk in sentence.split():
            for feature in self.partners.index:
                if self.is_in_field_of_value(chunk,
                                             self.partners.index[feature]):
                    entry = (feature, chunk)
                    return entry

    def get_element(self, entry, query):
        for partner in self.partners.db:
            if entry[1] in partner[entry[0]]:
                return partner[query]
        return None

    def is_in_field_of_value(self, chunk, list_of_values):
        return (chunk in list_of_values)

    def build_query(self, statement):
        entry = None
        query = None
        query = self.predict_feature(statement)
        entry = self.predict_filter_key(statement)

        return entry, query
Example #33
from textblob.classifiers import NaiveBayesClassifier
trainData=[]
f=open('TrainSet.txt','r')
data=f.readline().strip()
while data:
	splitData=data.split(',')
	category=splitData[0]
	content=splitData[1]
	pair = (content, category)
	trainData.append(pair)
	data=f.readline().strip()
classifier=NaiveBayesClassifier(trainData)
print "Training Done"
f.close()
f1=open('TestSet.txt','r')
data=f1.read()
if classifier.classify(data)=='C01':
	print "Bacterial Infections and Mycoses"
else:
	print "Virus Diseases" 
Example #34
def ans_type(question):
    cl = NaiveBayesClassifier(train)
    return cl.classify(question)  # "pos"
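Here train must already be defined at module level, and the classifier is retrained on every call. A sketch that builds it once and reuses it (the training pairs are hypothetical):

train = [("Who wrote Hamlet", "pos"), ("Is it raining", "neg")]

cl = NaiveBayesClassifier(train)

def ans_type(question):
    return cl.classify(question)  # e.g. "pos"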
Example #35
        #    feats["aboveAverage({0})".format(token)] = False
        else:
            feats["diffFromAverage({0})".format(token)] = False
        #   feats["aboveAverage({0})".format(token)] = False
        #  feats["belowAverage({0})".format(token)] = False

    return feats


def naivebayes_extractor(document):
    tokens = document.split()
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features


cl = NaiveBayesClassifier(trainlist, feature_extractor=naivebayes_extractor)

index = 0
predicted = []
actual = []
#print(cl.accuracy(devlist))
for tweet in testlist:
    predicted.append(cl.classify(tweet[0]))
    actual.append(tweet[1])
print("gamma = " + str(gamma))

c = 0
for i in range(500):
    if (predicted[i] == "twitter"):
Example #36
            curr = fp.readline()
            cnt += 1
    return tweets


got_pos_tweets = loadtweets(positive_got_tweet_file, 1, [])
got_neg_tweets = loadtweets(negative_got_tweet_file, -1, [])
got_neu_tweets = loadtweets(neutral_got_tweet_file, 0, [])

train_set = got_pos_tweets + got_neg_tweets + got_neu_tweets

# train classifier
from textblob.classifiers import NaiveBayesClassifier

print(" Training Classifier -------------------")
classifier = NaiveBayesClassifier(train_set)
print(" Training Classifier Complete -------------------")

# creating some variables to store info
polarity = 0
positive = 0
negative = 0
neutral = 0

# Got episode tweets
got_ep_tweet_file = "/Users/krishna/PycharmProjects/thatgotapi/got-ep4.txt"


def plotDefPoints(positive, negative, neutral, total):
    labels = [
        'Positive [' + str(positive) + ']', 'Neutral [' + str(neutral) + ']',
Example #37
random.seed(1)

train = [('I love this sandwich.', 'pos'),
         ('This is an amazing place!', 'pos'),
         ('I feel very good about these beers.', 'pos'),
         ('This is my best work.', 'pos'), ("What an awesome view", 'pos'),
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'), ('He is my sworn enemy!', 'neg'),
         ('My boss is horrible.', 'neg')]
test = [('The beer was good.', 'pos'), ('I do not enjoy my job', 'neg'),
        ("I ain't feeling dandy today.", 'neg'), ("I feel amazing!", 'pos'),
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg')]

cl = NaiveBayesClassifier(train)

# Grab some movie review data
reviews = [(list(movie_reviews.words(fileid)), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]
random.shuffle(reviews)
new_train, new_test = reviews[0:100], reviews[100:200]

# Update the classifier with the new training data
cl.update(new_train)

# Compute accuracy
accuracy = cl.accuracy(test + new_test)
print("Accuracy: {0}".format(accuracy))
Example #38
class Emote(object):

    runningImport = False  # Is Emote being used as an imported library?
    runningScript = False  # Or is Emote being run as a script directly?
    firstTime = True  # Is Emote running for the first time?

    pickledOn = False  # Is a pickled database detected?

    fullCount = ""  # The string result detailing the full amount of classifications (sorted by type and frequency) that the current training database contains

    writtenAnalysis = False  # Turn written analysis on?
    levelsAnalysis = True  # Turn full levels analysis on?
    defaultCorpus = ""  # What's the default corpus?

    def __init__(self,
                 message="",
                 pre_result="",
                 prob_dist=0,
                 prob_dist_max=0,
                 positive=0,
                 negative=0,
                 joy=0,
                 anger=0,
                 love=0,
                 hate=0,
                 certainty=0,
                 boredom=0,
                 intensity=0,
                 regret=0,
                 challenging=0,
                 agreeable=0,
                 desire=0,
                 calm=0,
                 sarcastic=0,
                 emphatic=0,
                 pride=0,
                 accusative=0,
                 admiration=0,
                 inquisitive=0,
                 modest=0,
                 instructive=0,
                 ambivalence=0,
                 vulgarity=0,
                 train=[],
                 cl=NaiveBayesClassifier([]),
                 punctCountDict={},
                 wordCount=0,
                 sentenceCount=0,
                 normalizedProbValues={},
                 sentences=[],
                 sentencesProbValues=[],
                 massResults=[]):

        # COPY AND PASTE ALL OF BASE_CORPUS.TXT INTO SELF.TRAIN BELOW FOR TRAINING NEW MODELS
        # (Reading base_corpus.txt into a list in initialTrain runs into errors with escaped chars.)
        self.train = []

        self.message = message
        self.punctCountDict = punctCountDict
        self.wordCount = wordCount
        self.sentenceCount = sentenceCount

        self.pre_result = pre_result
        self.prob_dist = prob_dist
        self.prob_dist_max = prob_dist_max

        self.positive = positive
        self.negative = negative
        self.joy = joy
        self.anger = anger
        self.love = love
        self.hate = hate
        self.certainty = certainty
        self.boredom = boredom
        self.intensity = intensity
        self.regret = regret
        self.challenging = challenging
        self.agreeable = agreeable
        self.desire = desire
        self.calm = calm
        self.sarcastic = sarcastic
        self.emphatic = emphatic
        self.pride = pride
        self.accusative = accusative
        self.admiration = admiration
        self.inquisitive = inquisitive
        self.modest = modest
        self.instructive = instructive
        self.ambivalence = ambivalence
        self.vulgarity = vulgarity

        self.prob_dist = prob_dist
        self.prob_dist_max = prob_dist_max
        self.cl = cl
        self.normalizedProbValues = normalizedProbValues
        self.sentences = sentences
        self.sentencesProbValues = sentencesProbValues
        self.massResults = massResults

    def getInput(self, _message):
        global firstTime
        global runningScript
        global runningImport
        if runningScript == True:
            if firstTime == False:
                self.message = input('\n\tWrite message to be analyzed: ')
                _message = self.message
                self.countPunct(_message)
                self.countWordSent(_message)
                self.runAnalysis(_message)
            else:
                print(
                    """\n\tNow starting Emote as a script. Use Emote Mass Analyzer to break down a text into individual sentence 
                 classifications, or import Emote as a library.""")
                print(
                    "\n\tThe first time you run the analysis will be a little bit slower."
                )
                firstTime = False
                self.initialTrain()
        else:
            if firstTime == True:
                # print("\nFIRST TIME IS TRUE")
                print("\n\tRunning Emote as a library..")
                self.message = _message
                runningImport = True
                self.countPunct(_message)
                self.countWordSent(_message)
                self.runAnalysis(_message)
            else:
                # print("\nFIRST TIME IS FALSE")
                runningImport = True
                self.message = _message
                self.countPunct(_message)
                self.countWordSent(_message)
                self.runAnalysis(_message)

    def initialTrain(self):

        # For interchangable corpuses.. uncomment line below
        # selectedCorpus = input('\n\tEnter the name of the corpus file to load (Press enter to load default, from base_corpus.py): ')
        global defaultCorpus
        global pickledOn
        global fullCount

        # if selectedCorpus != defaultCorpus and selectedCorpus != "":
        # defaultCorpus = selectedCorpus
        # elif selectedCorpus == "":
        # defaultCorpus = defaultCorpus
        # else:
        # defaultCorpus = "base_corpus.txt"
        selectedCorpus = defaultCorpus

        try:
            dir = os.path.abspath(os.path.dirname(__file__))
            path = os.path.join(dir, 'data', 'base_corpus.pickle')
            with open(path, 'rb') as fp:
                size = os.path.getsize(path)
                if size > 0:
                    pickledOn = True
                    print("\n\tPickled data found!")
                else:
                    pass
                fp.close()
        except IOError as err:
            pickledOn = False
            dir = os.path.abspath(os.path.dirname(__file__))
            path = os.path.join(dir, 'data', 'base_corpus.pickle')
            print(
                "\n\tNo pickled data found.. now creating and loading pickle.."
            )

        # Training data
        if pickledOn == False:

            # Code below takes training data from text file input
            path = os.getcwd()
            path = os.path.join(path, 'data', selectedCorpus)
            data = codecs.open(path, 'r', encoding='utf-8').read().splitlines()

            # self.train = data # Getting errors with reading base_corpus.txt into list

            fullDatabase = str(self.train)
            countPositive = fullDatabase.count("'positive')", 0,
                                               len(fullDatabase))
            countNegative = fullDatabase.count("'negative')", 0,
                                               len(fullDatabase))
            countLove = fullDatabase.count("'love')", 0, len(fullDatabase))
            countHate = fullDatabase.count("'hate')", 0, len(fullDatabase))
            countJoy = fullDatabase.count("'joy')", 0, len(fullDatabase))
            countAnger = fullDatabase.count("'anger')", 0, len(fullDatabase))
            countCertainty = fullDatabase.count("'certainty'", 0,
                                                len(fullDatabase))
            countConfusion = fullDatabase.count("'confusion'", 0,
                                                len(fullDatabase))
            countAmusement = fullDatabase.count("'amusement'", 0,
                                                len(fullDatabase))
            countBoredom = fullDatabase.count("'boredom'", 0,
                                              len(fullDatabase))
            countIntensity = fullDatabase.count("'intensity'", 0,
                                                len(fullDatabase))
            countRegret = fullDatabase.count("'regret'", 0, len(fullDatabase))
            countAgreeable = fullDatabase.count("'agreeable'", 0,
                                                len(fullDatabase))
            countChallenging = fullDatabase.count("'challenging'", 0,
                                                  len(fullDatabase))
            countDesire = fullDatabase.count("'desire'", 0, len(fullDatabase))
            countCalm = fullDatabase.count("'calm'", 0, len(fullDatabase))
            countEmphatic = fullDatabase.count("'emphatic'", 0,
                                               len(fullDatabase))
            countSarcastic = fullDatabase.count("'sarcastic'", 0,
                                                len(fullDatabase))
            countInstructive = fullDatabase.count("'instructive'", 0,
                                                  len(fullDatabase))
            countAccusative = fullDatabase.count("'accusative'", 0,
                                                 len(fullDatabase))
            countAdmiration = fullDatabase.count("'admiration'", 0,
                                                 len(fullDatabase))
            countInquisitive = fullDatabase.count("'inquisitive'", 0,
                                                  len(fullDatabase))
            countModest = fullDatabase.count("'modest'", 0, len(fullDatabase))
            countPride = fullDatabase.count("'pride'", 0, len(fullDatabase))
            countAmbivalence = fullDatabase.count("'ambivalence'", 0,
                                                  len(fullDatabase))
            countVulgarity = fullDatabase.count("'vulgarity'", 0,
                                                len(fullDatabase))

            fullCount = "\n\tNumbers and types of classifications in loaded database: \n"+ "\t\tPositive: " + str(countPositive) + "\t" + "Negative: " + str(countNegative) + \
            "\t\tJoy: " + str(countJoy) + "\t\t" + "Anger: " + str(countAnger) + "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + \
            "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + "\t\tAmusement: " + str(countAmusement) + "\t" + "Boredom: " + str(countBoredom) + \
            "\t\tIntensity: " + str(countIntensity) + "\t" + "Regret: " + str(countRegret) + "\t\tAgreeable: " + str(countAgreeable) + "\t" + "Challenging: " + str(countChallenging) + \
            "\t\tDesire: " + str(countDesire) + "\t" + "Calm: " + str(countCalm) + "\t\tEmphatic: " + str(countEmphatic) + "\t" + "Sarcastic: " + str(countSarcastic) + \
            "\t\tInstructive: " + str(countInstructive) + "\t" + "Accusative: " + str(countAccusative) + "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + \
            "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + "\t\tAmbivalence: " + str(countAmbivalence) + "\t" + "Vulgarity: " + str(countVulgarity)
            "\t\tJoy: " + str(countJoy) + "\t\t" + "Anger: " + str(countAnger) + "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + \
            "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + "\t\tAmusement: " + str(countAmusement) + "\t" + "Boredom: " + str(countBoredom) + \
            "\t\tIntensity: " + str(countIntensity) + "\t" + "Regret: " + str(countRegret) + "\t\tAgreeable: " + str(countAgreeable) + "\t" + "Challenging: " + str(countChallenging) + \
            "\t\tDesire: " + str(countDesire) + "\t" + "Calm: " + str(countCalm) + "\t\tEmphatic: " + str(countEmphatic) + "\t" + "Sarcastic: " + str(countSarcastic) + \
            "\t\tInstructive: " + str(countInstructive) + "\t" + "Accusative: " + str(countAccusative) + "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + \
            "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + "\t\tAmbivalence: " + str(countAmbivalence) + "\t" + "Vulgarity: " + str(countVulgarity)

            print(
                """\n\tNumbers and types of classifications in database to be loaded: \n"""
            )
            print("\t\tPositive: " + str(countPositive) + "\t" + "Negative: " +
                  str(countNegative))
            print("\t\tLove: " + str(countLove) + "\t\t" + "Hate: " +
                  str(countHate))
            print("\t\tJoy: " + str(countJoy) + "\t\t" + "Anger: " +
                  str(countAnger))
            print("\t\tCertainty: " + str(countCertainty) + "\t" +
                  "Confusion: " + str(countConfusion))
            print("\t\tAmusement: " + str(countAmusement) + "\t" +
                  "Boredom: " + str(countBoredom))
            print("\t\tIntensity: " + str(countIntensity) + "\t" + "Regret: " +
                  str(countRegret))
            print("\t\tAgreeable: " + str(countAgreeable) + "\t" +
                  "Challenging: " + str(countChallenging))
            print("\t\tDesire: " + str(countDesire) + "\t" + "Calm: " +
                  str(countCalm))
            print("\t\tEmphatic: " + str(countEmphatic) + "\t" +
                  "Sarcastic: " + str(countSarcastic))
            print("\t\tInstructive: " + str(countInstructive) + "\t" +
                  "Accusative: " + str(countAccusative))
            print("\t\tAdmiration: " + str(countAdmiration) + "\t" +
                  "Inquisitive: " + str(countInquisitive))
            print("\t\tModest: " + str(countModest) + "\t" + "Pride: " +
                  str(countPride))
            print("\t\tAmbivalence: " + str(countAmbivalence) + "\t" +
                  "Vulgarity: " + str(countVulgarity))

            print("\n\tOpening training data.")
            # print(str(type(self.train)))
            # print(str(self.train))

            random.seed(1)
            random.shuffle(self.train)
            self.cl = NaiveBayesClassifier(self.train)
            print("\n\tTraining now..")

            # shelvedData["base"] = cl # SHELF vs PICKLE

            dir = os.path.abspath(os.path.dirname(__file__))
            path = os.path.join(dir, 'data', 'base_corpus.pickle')

            fp = open(path, 'wb')
            print("\n\tLoaded training data into pickle file.")
            pickle.dump(self.cl, fp, protocol=pickle.HIGHEST_PROTOCOL)
            fp.close()
            print(
                "\n\tPickling complete, and will be loaded as the default database corpus next time, skipping the training period."
            )
            currentTime = datetime.datetime.now().time()
            print("\n\n\tTIME NEW DATABASE FINISHED TRAINING AND SAVING: ",
                  currentTime)
            # shelvedData.close() # SHELF vs PICKLE
        if pickledOn == True:
            try:
                # shelvedData = shelve.open("base_corpus.dat") # SHELF VS PICKLE
                path = os.getcwd()
                path = os.path.join(path, 'data', 'base_corpus.pickle')
                fp = open(path, 'rb')
                self.cl = pickle.load(fp)
                fp.close()
                print("\n\tTraining has been loaded from the selected corpus.")
                print("\t\t" + fullCount)
            except IOError as err:
                print(
                    "\n\tError training pickle file.. system will exit. Go into the directory, delete the corrupt pickle file, and retry this script to train a new copy."
                )
                print("\n\tPath was at:", path)
                sys.exit()
            pass
        if runningImport == True:
            self.runAnalysis(_message)
        else:
            self.getInput(_message)

    def countPunct(self, _message):
        numberCount = 0
        periodCount = 0
        commaCount = 0
        exclamationPtCount = 0
        questionMkCount = 0
        for char in _message:
            if char.isdigit() == True:
                numberCount += 1
            elif char == '.':
                periodCount += 1
            elif char == ',':
                commaCount += 1
            elif char == '!':
                exclamationPtCount += 1
            elif char == '?':
                questionMkCount += 1
            else:
                pass
        self.punctCountDict = {
            "numbers": numberCount,
            "periods_end": periodCount,
            "question_marks": questionMkCount,
            "exclamation_points": exclamationPtCount,
            "commas": commaCount
        }
        return self.punctCountDict

    def countWordSent(self, _message):
        _messageSplitWords = _message.split()
        _messageSplitSent = sent_tokenize(_message)
        self.wordCount = len(_messageSplitWords)
        # print("\n\tWord count in message: " + str(self.wordCount))
        self.sentenceCount = len(_messageSplitSent)
        # print("\n\tSentence count in message: " + str(self.sentenceCount))
        return self.wordCount, self.sentenceCount

    def split_into_sentences(self, _message):
        sentenceTempValStore = []
        self.normalizedProbValues = []
        self.sentences = sent_tokenize(_message)
        if len(self.sentences) > 1:
            for i in self.sentences:
                self.runAnalysis(str(i))
                self.sentencesProbValues.append(self.normalizedProbValues)
            return self.sentencesProbValues
        else:
            pass

    def analyzeCSV(self, path):
        csvData = []
        csvTextData = []
        file = open(path, 'r')
        csv_file = csv.reader(file, delimiter=",")
        for row in csv_file:
            csvData.append(row[0])
            csvTextData.append(row[1])
        file.close()
        print("\n\t", csvData)
        print("\n\t", csvTextData)
        print("\n\t", csvTextData)
        print("\n\t", csvData)
        self.massResults = []
        for i in range(len(csvTextData)):
            self.runAnalysis(csvTextData[i])
            print(emote.normalizedProbValues)
            self.massResults.append(self.normalizedProbValues)
        # path = os.getcwd()
        # path = os.path.join(path, 'static', 'results.csv')
        dir = os.path.abspath(os.path.dirname(__file__))
        path = os.path.join(dir, 'static', 'results.csv')
        csvFile = open(path, 'w', newline='')
        for i in range(len(self.massResults)):
            csvIndRowList = []
            csvResults = csv.writer(csvFile, delimiter=',')
            csvIndRowList.append(csvData[i])
            csvIndRowList.append(csvTextData[i])
            csvIndRowList.append(self.massResults[i][0])
            csvIndRowList.append(self.massResults[i][1])
            csvIndRowList.append(self.massResults[i][2])
            csvIndRowList.append(self.massResults[i][3])
            csvIndRowList.append(self.massResults[i][4])
            csvIndRowList.append(self.massResults[i][5])
            print("\n\tROW LIST", csvIndRowList)
            csvResults.writerow(csvIndRowList)
        csvFile.close()
        return csvResults, csvFile, self.massResults

    def runAnalysis(self, _message):
        global runningImport
        global firstTime
        global runningScript
        if firstTime == True and runningImport == True:
            print(
                "\n\n\t\t(First time running analysis.. load pickle data. The initial analysis will be slower because of the models loading, and so is automatically run in the beginning.)"
            )
            try:
                # path = os.getcwd()
                # path = os.path.join(path, 'data', 'base_corpus.pickle')
                # Use the path below when uploading to Pythonanywhere
                # dir = os.path.dirname(__file__)
                dir = os.path.abspath(os.path.dirname(__file__))
                path = os.path.join(dir, 'data', 'base_corpus.pickle')
                with open(path, 'rb') as fp:
                    self.cl = pickle.load(fp)
                    runningImport = False
                    firstTime = False
                print("\n\tFinished loading pickle.")
            except Exception:
                print(
                    "\n\tError loading pickle file from the /data directory. Application exit."
                )
                print("\n\tThis was the attempted path that was searched: ",
                      path)
                sys.exit(1)
        print("\n\tAnalyzing " + "'" + str(_message) + "'" + "..")
        self.prob_dist = self.cl.prob_classify(_message)
        self.prob_dist_max = self.prob_dist.max()
        self.positive = round(self.prob_dist.prob("positive"), 4)
        self.negative = round(self.prob_dist.prob("negative"), 4)
        self.joy = round(self.prob_dist.prob("joy"), 4)
        self.anger = round(self.prob_dist.prob("anger"), 4)
        self.love = round(self.prob_dist.prob("love"), 4)
        self.hate = round(self.prob_dist.prob("hate"), 4)
        self.certainty = round(self.prob_dist.prob("certainty"), 4)
        self.confusion = round(self.prob_dist.prob("confusion"), 4)
        self.amusement = round(self.prob_dist.prob("amusement"), 4)
        self.boredom = round(self.prob_dist.prob("boredom"), 4)
        self.intensity = round(self.prob_dist.prob("intensity"), 4)
        self.regret = round(self.prob_dist.prob("regret"), 4)
        self.agreeable = round(self.prob_dist.prob("agreeable"), 4)
        self.challenging = round(self.prob_dist.prob("challenging"), 4)
        self.desire = round(self.prob_dist.prob("desire"), 4)
        self.calm = round(self.prob_dist.prob("calm"), 4)
        self.emphatic = round(self.prob_dist.prob("emphatic"), 4)
        self.sarcastic = round(self.prob_dist.prob("sarcastic"), 4)
        self.instructive = round(self.prob_dist.prob("instructive"), 4)
        self.accusative = round(self.prob_dist.prob("accusative"), 4)
        self.admiration = round(self.prob_dist.prob("admiration"), 4)
        self.inquisitive = round(self.prob_dist.prob("inquisitive"), 4)
        self.modest = round(self.prob_dist.prob("modest"), 4)
        self.pride = round(self.prob_dist.prob("pride"), 4)
        self.ambivalence = round(self.prob_dist.prob("ambivalence"), 4)
        self.vulgarity = round(self.prob_dist.prob('vulgarity'), 4)

        valueList = [
            self.positive, self.negative, self.joy, self.anger, self.love,
            self.hate, self.certainty, self.confusion, self.amusement,
            self.boredom, self.intensity, self.regret, self.agreeable,
            self.challenging, self.desire, self.calm, self.emphatic,
            self.sarcastic, self.instructive, self.accusative, self.admiration,
            self.inquisitive, self.modest, self.pride, self.ambivalence,
            self.vulgarity
        ]

        posNegAbsVal = math.fabs(self.positive - self.negative)
        # If the positive and negative tone values are too close together, sharpen the
        # contrast, since positive and negative should lie on opposite sides of the spectrum.
        # Cubing values below 1 shrinks both, which amplifies their ratio.
        if posNegAbsVal <= .25:
            self.positive = self.positive ** 3
            self.negative = self.negative ** 3
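        # Worked example: with positive=0.55 and negative=0.45 the gap (0.10) is under
        # .25, so both are cubed: 0.55**3 = 0.166 and 0.45**3 = 0.091, stretching their
        # ratio from about 1.2x to about 1.8x before the later max-normalization rescales.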

        if runningScript:
            print("\n\tProbability Values From NaiveBayesClassifier: ")
            print("\tStrongest Emotion: " + self.prob_dist_max)
            print("\tPositive: " + str(self.positive) + "\tNegative: " +
                  str(self.negative))
            print("\tJoy: " + str(self.joy) + "\tAnger: " + str(self.anger))
            print("\tLove: " + str(self.love) + "\tHate: " + str(self.hate))
            print("\tCertainty: " + str(self.certainty) + "\tConfusion: " +
                  str(self.confusion))
            print("\tAmusement: " + str(self.amusement) + "\tBoredom: " +
                  str(self.boredom))
            print("\tIntensity: " + str(self.intensity) + "\tRegret: " +
                  str(self.regret))
            print("\tAgreeable: " + str(self.agreeable) + "\tChallenging: " +
                  str(self.challenging))
            print("\tDesire: " + str(self.desire) + "\tCalm: " +
                  str(self.calm))
            print("\tEmphatic: " + str(self.emphatic) + "\tSarcastic: " +
                  str(self.sarcastic))
            print("\tInstructive: " + str(self.instructive) +
                  "\tAccusative: " + str(self.accusative))
            print("\tAdmiration: " + str(self.admiration) + "\tInquisitive: " +
                  str(self.inquisitive))
            print("\tModest: " + str(self.modest) + "\tPride: " +
                  str(self.pride))
            print("\tAmbivalence: " + str(self.ambivalence) + "\tVulgarity: " +
                  str(self.vulgarity))

            self.normalizedProbValues = pd.Series({
                'positive': self.positive,
                'negative': self.negative,
                'joy': self.joy,
                'anger': self.anger,
                'love': self.love,
                'hate': self.hate,
                'certainty': self.certainty,
                'confusion': self.confusion,
                'amusement': self.amusement,
                'boredom': self.boredom,
                'intensity': self.intensity,
                'regret': self.regret,
                'agreeable': self.agreeable,
                'challenging': self.challenging,
                'desire': self.desire,
                'calm': self.calm,
                'emphatic': self.emphatic,
                'sarcastic': self.sarcastic,
                'instructive': self.instructive,
                'accusative': self.accusative,
                'admiration': self.admiration,
                'inquisitive': self.inquisitive,
                'modest': self.modest,
                'pride': self.pride,
                'ambivalence': self.ambivalence,
                'vulgarity': self.vulgarity
            })
            # print("Noramlized prob values: " + str(self.normalizedProbValues))
            # print("\n\t",self.normalizedProbValues)
            # print("\n\t", self.normalizedProbValues.describe())
            self.normalizeProbabilityPunctuation(_message)
        else:
            self.normalizeProbabilityPunctuation(_message)
            return self.normalizedProbValues

    def normalizeProbabilityPunctuation(self, _message):
        print("\tNow normalizing probability based on punctuation count..")
        # print("\n\t", self.punctCountDict)
        # print("\tNow normalizing probability based on punctuation count..")
        ############################################################################################################################################################
        # Base values below; the other variables scale linearly from them to set the different probability ranges.
        minWordCountRange = 0
        minSentenceCountRange = 0
        maxWordCountRange = 50
        maxSentenceCountRange = 3
        maxCommaCountRange = 6
        msgWordCountLeveler = 0
        msgSentenceCountLeveler = 0
        punctSlidingThreshold = 1
        # Code below contains the actual sliding algorithm for probability normalization through punctuation
        # START (The values in this if-then don't need to be sliding (mapped to a range), because anything longer than 50 words or 3 sentences is considered "long").
        # This part of the algorithm is also not adjusted by the leveler, because the progression does not scale well enough from the original values without manipulation.
        # Manipulation comes from the msgWordCountLeveler and msgSentenceCountLeveler variables
        if minWordCountRange < self.wordCount < maxWordCountRange and minSentenceCountRange < self.sentenceCount <= maxSentenceCountRange:
            # print("\tProbability normalization based off of the first level of scaling.")
            punctSlidingThreshold = 1
            # Emphatic sentences are more likely (deep analytical thinking)
            # Values below are mapped to linearly scaling variables (to save having to adjust numbers manually and repeatedly).
            # punctSlidingThreshold is not used for commas in this case because multiplying by 1 does not give a high enough threshold
            # if minWordCountRange < self.wordCount < maxWordCountRange and self.sentenceCount >= maxSentenceCountRange and self.punctCountDict['commas'] <= 3:
            #     print("\tLong, slow writing, with many commas.")
            # elif minWordCountRange < self.wordCount < maxWordCountRange and self.sentenceCount < maxSentenceCountRange and self.punctCountDict['commas'] <= 3:
            #     print("\tQuick, rapid writing. Many short sentences, few commas.")
            # else:
            #     pass
            if self.punctCountDict['numbers'] >= punctSlidingThreshold:
                # More informative or descriptive message more likely
                # print("\tNumbers detected.")
                pass
            elif self.punctCountDict['periods_end'] >= punctSlidingThreshold:
                # print("\tPeriods detected.")
                pass
            elif self.punctCountDict['question_marks'] >= punctSlidingThreshold:
                if self.inquisitive <= .1:
                    self.inquisitive = .1
                else:
                    self.inquisitive = self.inquisitive / math.sqrt(
                        self.inquisitive
                    ) * self.punctCountDict['question_marks']
                # print("\tQuestions detected.")
            elif self.punctCountDict[
                    'exclamation_points'] >= punctSlidingThreshold:
                if self.intensity <= .1:
                    self.intensity = .1
                else:
                    self.intensity = self.intensity / math.sqrt(
                        self.intensity
                    ) * self.punctCountDict['exclamation_points']
                    # print("\tExclamations detected.")
            elif self.punctCountDict['commas'] >= punctSlidingThreshold * 1.5:
                # print("\tCommas detected.")
                pass
            else:
                pass
        # END
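        # Worked example of the question-mark boost above: with inquisitive=0.36 and two
        # question marks, inquisitive becomes 0.36 / sqrt(0.36) * 2 = 1.2; values may
        # exceed 1 here because normalizeProbability later rescales everything to a max.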
        # START
        if self.wordCount > maxWordCountRange or self.sentenceCount > maxSentenceCountRange:
            # print("\tProbability normalization based off of a proportionally increased level of scaling from word / sentence count.")
            msgWordCountLeveler = int(self.wordCount / maxWordCountRange)
            msgSentenceCountLeveler = int(self.sentenceCount /
                                          maxSentenceCountRange)
            # Clamp both levelers to at least 1 before they are used, to avoid dividing
            # by zero below and to keep the ranges from collapsing to zero
            if msgWordCountLeveler < 1:
                msgWordCountLeveler = 1
            if msgSentenceCountLeveler < 1:
                msgSentenceCountLeveler = 1
            minWordCountRange = 1 * msgWordCountLeveler
            minSentenceCountRange = 1 * msgSentenceCountLeveler
            maxWordCountRange = maxWordCountRange * msgWordCountLeveler
            maxSentenceCountRange = maxSentenceCountRange * msgSentenceCountLeveler
            punctSlidingThreshold = int(
                (punctSlidingThreshold *
                 (msgSentenceCountLeveler * msgWordCountLeveler /
                  msgSentenceCountLeveler)))
            if minWordCountRange < self.wordCount < maxWordCountRange and minSentenceCountRange < self.sentenceCount < maxSentenceCountRange:
                # Emphatic sentences are more likely (deep analytical thinking)
                # print("\tLong sentence detected.")
                # The punctuation threshold for commas is slightly higher than for end marks, so it is multiplied by 1.5
                # if minWordCountRange < self.wordCount < maxWordCountRange and self.sentenceCount >= maxSentenceCountRange and self.commas < int(punctSlidingThreshold) * 1.5:
                #     print("\tQuick, rapid writing. Many short sentences, few commas.")
                # if minWordCountRange < self.wordCount < maxWordCountRange and self.sentenceCount < maxSentenceCountRange and self.commas >= int(punctSlidingThreshold) * 1.5:
                #     print("\tLong, slow writing, with many commas.")
                if self.punctCountDict['numbers'] >= punctSlidingThreshold:
                    # More informative or descriptive message more likely
                    # print("\tNumbers detected.")
                    pass
                elif self.punctCountDict[
                        'periods_end'] >= punctSlidingThreshold:
                    # print("\tPeriods detected.")
                    pass
                elif self.punctCountDict[
                        'question_marks'] >= punctSlidingThreshold:
                    if self.inquisitive <= .1:
                        self.inquisitive = .1
                    else:
                        self.inquisitive = self.inquisitive / math.sqrt(
                            self.inquisitive
                        ) * self.punctCountDict['question_marks']
                    # print("\tQuestions detected.")
                elif self.punctCountDict[
                        'exclamation_points'] >= punctSlidingThreshold:
                    if self.intensity <= .1:
                        self.intensity = .1
                    else:
                        self.intensity = self.intensity / math.sqrt(
                            self.intensity
                        ) * self.punctCountDict['exclamation_points']
                        # print("\tExclamations detected.")
                elif self.punctCountDict[
                        'commas'] >= punctSlidingThreshold * 1.5:
                    # print("\tCommas detected.")
                    pass
                else:
                    pass
            # END
        ############################################################################################################################################################
        # print("\n\tProbability Values Post-Normalization Counting Punctuation: ")
        # print(self.normalizedProbValues)
        # self.normalizeProbabilityOpposites(_message)
        print("\n\t", self.punctCountDict)
        self.normalizedProbValues = pd.Series({
            'positive': self.positive,
            'negative': self.negative,
            'joy': self.joy,
            'anger': self.anger,
            'love': self.love,
            'hate': self.hate,
            'certainty': self.certainty,
            'confusion': self.confusion,
            'amusement': self.amusement,
            'boredom': self.boredom,
            'intensity': self.intensity,
            'regret': self.regret,
            'agreeable': self.agreeable,
            'challenging': self.challenging,
            'desire': self.desire,
            'calm': self.calm,
            'emphatic': self.emphatic,
            'sarcastic': self.sarcastic,
            'instructive': self.instructive,
            'accusative': self.accusative,
            'admiration': self.admiration,
            'inquisitive': self.inquisitive,
            'modest': self.modest,
            'pride': self.pride,
            'ambivalence': self.ambivalence,
            'vulgarity': self.vulgarity
        })
        self.normalizeProbability(_message)
        # return self.normalizedProbValues

    def normalizeProbability(self, _message):

        print("\n\tNow standardizing probabilities into percentages..")

        self.normalizedProbValues = self.normalizedProbValues.values.reshape(
            1, -1)

        # self.normalizedProbValues = preprocessing.RobustScaler(with_centering=True, with_scaling=True, quantile_range=(50.0, 100.0), copy = True).fit_transform(self.normalizedProbValues)
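        # Note: with_mean=False and with_std=False make this StandardScaler effectively an
        # identity transform; it reads as a kept-in-place switch point for trying other scalers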
        self.normalizedProbValues = preprocessing.StandardScaler(
            with_mean=False,
            with_std=False).fit_transform(self.normalizedProbValues)

        self.normalizedProbValues = preprocessing.normalize(
            self.normalizedProbValues, norm='max')
        self.normalizedProbValues = np.array(
            self.normalizedProbValues).tolist()

        normalizedProbValTemp = self.normalizedProbValues

        # print(self.normalizedProbValues)

        # The values below are in alphabetical label order: with the pandas version this
        # code targets, a Series built from a dict sorts its keys alphabetically, and the
        # reshape above preserves that order

        normalizedAccusative = normalizedProbValTemp[0][0]
        normalizedAdmiration = normalizedProbValTemp[0][1]
        normalizedAgreeable = normalizedProbValTemp[0][2]
        normalizedAmbivalence = normalizedProbValTemp[0][3]
        normalizedAmusement = normalizedProbValTemp[0][4]
        normalizedAnger = normalizedProbValTemp[0][5]
        normalizedBoredom = normalizedProbValTemp[0][6]
        normalizedCalm = normalizedProbValTemp[0][7]
        normalizedCertainty = normalizedProbValTemp[0][8]
        normalizedChallenging = normalizedProbValTemp[0][9]
        normalizedConfusion = normalizedProbValTemp[0][10]
        normalizedDesire = normalizedProbValTemp[0][11]
        normalizedEmphatic = normalizedProbValTemp[0][12]
        normalizedHate = normalizedProbValTemp[0][13]
        normalizedInquisitive = normalizedProbValTemp[0][14]
        normalizedInstructive = normalizedProbValTemp[0][15]
        normalizedIntensity = normalizedProbValTemp[0][16]
        normalizedJoy = normalizedProbValTemp[0][17]
        normalizedLove = normalizedProbValTemp[0][18]
        normalizedModest = normalizedProbValTemp[0][19]
        normalizedNegative = normalizedProbValTemp[0][20]
        normalizedPositive = normalizedProbValTemp[0][21]
        normalizedPride = normalizedProbValTemp[0][22]
        normalizedRegret = normalizedProbValTemp[0][23]
        normalizedSarcastic = normalizedProbValTemp[0][24]
        normalizedVulgarity = normalizedProbValTemp[0][25]

        self.positive = float(round(normalizedPositive, 3) * 100)
        self.negative = float(round(normalizedNegative, 3) * 100)
        self.joy = float(round(normalizedJoy, 3) * 100)
        self.anger = float(round(normalizedAnger, 3) * 100)
        self.love = float(round(normalizedLove, 3) * 100)
        self.hate = float(round(normalizedHate, 3) * 100)
        self.certainty = float(round(normalizedCertainty, 3) * 100)
        self.confusion = float(round(normalizedConfusion, 3) * 100)
        self.amusement = float(round(normalizedAmusement, 3) * 100)
        self.boredom = float(round(normalizedBoredom, 3) * 100)
        self.intensity = float(round(normalizedIntensity, 3) * 100)
        self.regret = float(round(normalizedRegret, 3) * 100)
        self.agreeable = float(round(normalizedAgreeable, 3) * 100)
        self.challenging = float(round(normalizedChallenging, 3) * 100)
        self.desire = float(round(normalizedDesire, 3) * 100)
        self.calm = float(round(normalizedCalm, 3) * 100)
        self.emphatic = float(round(normalizedEmphatic, 3) * 100)
        self.sarcastic = float(round(normalizedSarcastic, 3) * 100)
        self.instructive = float(round(normalizedInstructive, 3) * 100)
        self.accusative = float(round(normalizedAccusative, 3) * 100)
        self.admiration = float(round(normalizedAdmiration, 3) * 100)
        self.inquisitive = float(round(normalizedInquisitive, 3) * 100)
        self.modest = float(round(normalizedModest, 3) * 100)
        self.pride = float(round(normalizedPride, 3) * 100)
        self.ambivalence = float(round(normalizedAmbivalence, 3) * 100)
        self.vulgarity = float(round(normalizedVulgarity, 3) * 100)

        normalizedProbValTemp = {}

        normalizedProbValTemp['positive'] = self.positive
        normalizedProbValTemp['negative'] = self.negative
        normalizedProbValTemp['joy'] = self.joy
        normalizedProbValTemp['anger'] = self.anger
        normalizedProbValTemp['love'] = self.love
        normalizedProbValTemp['hate'] = self.hate
        normalizedProbValTemp['certainty'] = self.certainty
        normalizedProbValTemp['confusion'] = self.confusion
        normalizedProbValTemp['amusement'] = self.amusement
        normalizedProbValTemp['boredom'] = self.boredom
        normalizedProbValTemp['intensity'] = self.intensity
        normalizedProbValTemp['regret'] = self.regret
        normalizedProbValTemp['agreeable'] = self.agreeable
        normalizedProbValTemp['challenging'] = self.challenging
        normalizedProbValTemp['desire'] = self.desire
        normalizedProbValTemp['calm'] = self.calm
        normalizedProbValTemp['emphatic'] = self.emphatic
        normalizedProbValTemp['sarcastic'] = self.sarcastic
        normalizedProbValTemp['instructive'] = self.instructive
        normalizedProbValTemp['accusative'] = self.accusative
        normalizedProbValTemp['admiration'] = self.admiration
        normalizedProbValTemp['inquisitive'] = self.inquisitive
        normalizedProbValTemp['modest'] = self.modest
        normalizedProbValTemp['pride'] = self.pride
        normalizedProbValTemp['ambivalence'] = self.ambivalence
        normalizedProbValTemp['vulgarity'] = self.vulgarity

        # print("\n\n\t", normalizedProbValTemp)
        self.normalizedProbValues = sorted(normalizedProbValTemp.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True)

        if runningScript:
            print("\n\tFinal analysis results: " +
                  str(self.normalizedProbValues))
            self.getInput(_message)
        else:
            print("Final analysis results: " + str(self.normalizedProbValues))
        return self.normalizedProbValues, self.positive, self.negative, self.joy, self.anger, self.love, self.hate, self.certainty, self.confusion, self.amusement, self.boredom, self.intensity, self.regret, self.agreeable, self.challenging, self.desire, self.calm, self.emphatic, self.sarcastic, self.instructive, self.accusative, self.admiration, self.inquisitive, self.modest, self.pride, self.ambivalence, self.vulgarity
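# A minimal usage sketch for the analyzer above, assuming the enclosing class is named
# Emote (the class statement and the name of the punctuation-counting method are cut off
# at the top of this fragment; both counters must run before runAnalysis so that
# punctCountDict, wordCount, and sentenceCount are populated):
#
#     emote = Emote()
#     emote.countWordSent("Really?! That took 3 tries.")    # sets wordCount / sentenceCount
#     # <punctuation-counting method>("Really?! That took 3 tries.")
#     emote.runAnalysis("Really?! That took 3 tries.")
#     print(emote.normalizedProbValues)                     # (emotion, %) pairs, strongest first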
Ejemplo n.º 39
0
    def initialTrain(self):

        # For interchangeable corpora.. uncomment the line below
        # selectedCorpus = input('\n\tEnter the name of the corpus file to load (Press enter to load default, from base_corpus.py): ')
        global defaultCorpus
        global pickledOn
        global fullCount

        # if selectedCorpus != defaultCorpus and selectedCorpus != "":
        # defaultCorpus = selectedCorpus
        # elif selectedCorpus == "":
        # defaultCorpus = defaultCorpus
        # else:
        # defaultCorpus = "base_corpus.txt"
        selectedCorpus = defaultCorpus

        try:
            dir = os.path.abspath(os.path.dirname(__file__))
            path = os.path.join(dir, 'data', 'base_corpus.pickle')
            with open(path, 'rb') as fp:
                size = os.path.getsize(path)
                if size > 0:
                    pickledOn = True
                    print("\n\tPickled data found!")
        except IOError as err:
            pickledOn = False
            dir = os.path.abspath(os.path.dirname(__file__))
            path = os.path.join(dir, 'data', 'base_corpus.pickle')
            print(
                "\n\tNo pickled data found.. now creating and loading pickle.."
            )

        # Training data
        if not pickledOn:

            # Code below takes training data from text file input
            path = os.getcwd()
            path = os.path.join(path, 'data', selectedCorpus)
            with codecs.open(path, 'r', encoding='utf-8') as corpus_file:
                data = corpus_file.read().splitlines()

            # self.train = data # Getting errors with reading base_corpus.txt into list

            fullDatabase = str(self.train)
            countPositive = fullDatabase.count("'positive')", 0,
                                               len(fullDatabase))
            countNegative = fullDatabase.count("'negative')", 0,
                                               len(fullDatabase))
            countLove = fullDatabase.count("'love')", 0, len(fullDatabase))
            countHate = fullDatabase.count("'hate')", 0, len(fullDatabase))
            countJoy = fullDatabase.count("'joy')", 0, len(fullDatabase))
            countAnger = fullDatabase.count("'anger')", 0, len(fullDatabase))
            countCertainty = fullDatabase.count("'certainty'", 0,
                                                len(fullDatabase))
            countConfusion = fullDatabase.count("'confusion'", 0,
                                                len(fullDatabase))
            countAmusement = fullDatabase.count("'amusement'", 0,
                                                len(fullDatabase))
            countBoredom = fullDatabase.count("'boredom'", 0,
                                              len(fullDatabase))
            countIntensity = fullDatabase.count("'intensity'", 0,
                                                len(fullDatabase))
            countRegret = fullDatabase.count("'regret'", 0, len(fullDatabase))
            countAgreeable = fullDatabase.count("'agreeable'", 0,
                                                len(fullDatabase))
            countChallenging = fullDatabase.count("'challenging'", 0,
                                                  len(fullDatabase))
            countDesire = fullDatabase.count("'desire'", 0, len(fullDatabase))
            countCalm = fullDatabase.count("'calm'", 0, len(fullDatabase))
            countEmphatic = fullDatabase.count("'emphatic'", 0,
                                               len(fullDatabase))
            countSarcastic = fullDatabase.count("'sarcastic'", 0,
                                                len(fullDatabase))
            countInstructive = fullDatabase.count("'instructive'", 0,
                                                  len(fullDatabase))
            countAccusative = fullDatabase.count("'accusative'", 0,
                                                 len(fullDatabase))
            countAdmiration = fullDatabase.count("'admiration'", 0,
                                                 len(fullDatabase))
            countInquisitive = fullDatabase.count("'inquisitive'", 0,
                                                  len(fullDatabase))
            countModest = fullDatabase.count("'modest'", 0, len(fullDatabase))
            countPride = fullDatabase.count("'pride'", 0, len(fullDatabase))
            countAmbivalence = fullDatabase.count("'ambivalence'", 0,
                                                  len(fullDatabase))
            countVulgarity = fullDatabase.count("'vulgarity'", 0,
                                                len(fullDatabase))

            fullCount = "\n\tNumbers and types of classifications in loaded database: \n"+ "\t\tPositive: " + str(countPositive) + "\t" + "Negative: " + str(countNegative) + \
            "\t\tJoy: " + str(countJoy) + "\t\t" + "Anger: " + str(countAnger) + "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + \
            "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + "\t\tAmusement: " + str(countAmusement) + "\t" + "Boredom: " + str(countBoredom) + \
            "\t\tIntensity: " + str(countIntensity) + "\t" + "Regret: " + str(countRegret) + "\t\tAgreeable: " + str(countAgreeable) + "\t" + "Challenging: " + str(countChallenging) + \
            "\t\tDesire: " + str(countDesire) + "\t" + "Calm: " + str(countCalm) + "\t\tEmphatic: " + str(countEmphatic) + "\t" + "Sarcastic: " + str(countSarcastic) + \
            "\t\tInstructive: " + str(countInstructive) + "\t" + "Accusative: " + str(countAccusative) + "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + \
            "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + "\t\tAmbivalence: " + str(countAmbivalence) + "\t" + "Vulgarity: " + str(countVulgarity)
            "\t\tJoy: " + str(countJoy) + "\t\t" + "Anger: " + str(countAnger) + "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + \
            "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + "\t\tAmusement: " + str(countAmusement) + "\t" + "Boredom: " + str(countBoredom) + \
            "\t\tIntensity: " + str(countIntensity) + "\t" + "Regret: " + str(countRegret) + "\t\tAgreeable: " + str(countAgreeable) + "\t" + "Challenging: " + str(countChallenging) + \
            "\t\tDesire: " + str(countDesire) + "\t" + "Calm: " + str(countCalm) + "\t\tEmphatic: " + str(countEmphatic) + "\t" + "Sarcastic: " + str(countSarcastic) + \
            "\t\tInstructive: " + str(countInstructive) + "\t" + "Accusative: " + str(countAccusative) + "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + \
            "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + "\t\tAmbivalence: " + str(countAmbivalence) + "\t" + "Vulgarity: " + str(countVulgarity)

            print(
                """\n\tNumbers and types of classifications in database to be loaded: \n"""
            )
            print("\t\tPositive: " + str(countPositive) + "\t" + "Negative: " +
                  str(countNegative))
            print("\t\tLove: " + str(countLove) + "\t\t" + "Hate: " +
                  str(countHate))
            print("\t\tJoy: " + str(countJoy) + "\t\t" + "Anger: " +
                  str(countAnger))
            print("\t\tCertainty: " + str(countCertainty) + "\t" +
                  "Confusion: " + str(countConfusion))
            print("\t\tAmusement: " + str(countAmusement) + "\t" +
                  "Boredom: " + str(countBoredom))
            print("\t\tIntensity: " + str(countIntensity) + "\t" + "Regret: " +
                  str(countRegret))
            print("\t\tAgreeable: " + str(countAgreeable) + "\t" +
                  "Challenging: " + str(countChallenging))
            print("\t\tDesire: " + str(countDesire) + "\t" + "Calm: " +
                  str(countCalm))
            print("\t\tEmphatic: " + str(countEmphatic) + "\t" +
                  "Sarcastic: " + str(countSarcastic))
            print("\t\tInstructive: " + str(countInstructive) + "\t" +
                  "Accusative: " + str(countAccusative))
            print("\t\tAdmiration: " + str(countAdmiration) + "\t" +
                  "Inquisitive: " + str(countInquisitive))
            print("\t\tModest: " + str(countModest) + "\t" + "Pride: " +
                  str(countPride))
            print("\t\tAmbivalence: " + str(countAmbivalence) + "\t" +
                  "Vulgarity: " + str(countVulgarity))

            print("\n\tOpening training data.")
            # print(str(type(self.train)))
            # print(str(self.train))

            random.seed(1)
            random.shuffle(self.train)
            self.cl = NaiveBayesClassifier(self.train)
            print("\n\tTraining now..")

            # shelvedData["base"] = cl # SHELF vs PICKLE

            dir = os.path.abspath(os.path.dirname(__file__))
            path = os.path.join(dir, 'data', 'base_corpus.pickle')

            fp = open(path, 'wb')
            print("\n\tLoaded training data into pickle file.")
            pickle.dump(self.cl, fp, protocol=pickle.HIGHEST_PROTOCOL)
            fp.close()
            print(
                "\n\tPickling complete, and will be loaded as the default database corpus next time, skipping the training period."
            )
            currentTime = datetime.datetime.now().time()
            print("\n\n\tTIME NEW DATABASE FINISHED TRAINING AND SAVING: ",
                  currentTime)
            # shelvedData.close() # SHELF vs PICKLE
        if pickledOn:
            try:
                # shelvedData = shelve.open("base_corpus.dat") # SHELF VS PICKLE
                path = os.getcwd()
                path = os.path.join(path, 'data', 'base_corpus.pickle')
                with open(path, 'rb') as fp:
                    self.cl = pickle.load(fp)
                print("\n\tTraining has been loaded from the selected corpus.")
                print("\t\t" + fullCount)
            except IOError:
                print(
                    "\n\tError reading pickle file.. system will exit. Go into the directory, delete the corrupt pickle file, and retry this script to train a new copy."
                )
                print("\n\tPath was at:", path)
                sys.exit()
        if runningImport:
            # no _message parameter exists in this method's scope; self.message (set in
            # __init__) holds the analyzer's current message
            self.runAnalysis(self.message)
        else:
            self.getInput(self.message)
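# The train-once / load-thereafter flow in initialTrain reduces to a small standalone
# sketch (the file path and the helper name are illustrative assumptions, not the
# original API):

import os
import pickle
from textblob.classifiers import NaiveBayesClassifier

def load_or_train(train_data, path='data/base_corpus.pickle'):
    """Return a cached classifier if a non-empty pickle exists; otherwise train and cache one."""
    if os.path.exists(path) and os.path.getsize(path) > 0:
        with open(path, 'rb') as fp:
            return pickle.load(fp)
    cl = NaiveBayesClassifier(train_data)
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as fp:
        pickle.dump(cl, fp, protocol=pickle.HIGHEST_PROTOCOL)
    return cl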
Ejemplo n.º 40
0

positive = tupling(pos, 'pos', 'text')
negative = tupling(neg, 'neg', 'text')
training = []

for i in positive:
    training.append(i)

for i in negative:
    training.append(i)

random.seed(123)
random.shuffle(training)

cl = NaiveBayesClassifier(training)

p1 = pos[10001:15001]
p2 = neg[10001:15001]

large_set = pd.concat([p1, p2])

testing = []

for i in large_set['text']:
    testing.append(i)

fin_sent = []
for i in testing:
    blob = TextBlob(i, classifier=cl)
    for s in blob.sentiment:
Ejemplo n.º 41
0
C02 = 'Virus Diseases'

trainData = [
    (('Augmentation mentoplasty using Mersilene mesh.  Many different materials are available for augmentation mentoplasty.  However, the optimal implant material for chin implantation has yet to be found.  During the past several years, a number of experienced surgeons have turned to the use of Mersilene mesh.  Mersilene mesh is a non-absorbable Dacron polyester fiber that can be conformed easily into layers to achieve tailored dimensions and shape.  At the McCollough Plastic Surgery Clinic PA, Birmingham, Ala, 277 patients over a 10-year period underwent chin augmentation with Mersilene mesh implants.  The material provides excellent tensile strength, durability, and surgical adaptability.  The overall complication rate was 3.2% (nine patients); infection rate, 2.5% (seven patients); and removal secondary to infection, 1.7% (five patients).  Based on this 10-year experience, Mersilene mesh remains our material of choice for chin augmentation.'
      ), C01),
    ((' Multiple intracranial mucoceles associated with phaeohyphomycosis of the paranasal sinuses.  The purpose of this article is to alert clinicians to a new pathogenic fungus of the paranasal sinuses called Exserohilum rostratum.  Exserohilum species are one of the etiologic agents of phaeohyphomycosis, a constellation of entities caused by dematiaceous fungi.  This class of fungal sinus infection has emerged only in the past decade; it occurs primarily in immunocompetent individuals and produces a tenacious, progressive pansinusitis.  To our knowledge, this study describes the first case of multiple intracranial mucoceles secondary to E rostratum.  The diagnostic workup includes computed tomography and magnetic resonance imaging followed by direct microscopic examination of tissue biopsy specimens.  A craniotomy followed by a bilateral external ethmoidectomy was necessary for complete extirpation of the infected mucoceles.  Aggressive surgical management of this mycotic infection is described. '
      ), C01),
    (('Laser photodynamic therapy for papilloma viral lesions.  Photodynamic therapy was tested for its therapeutic efficacy in eradicating rabbit papilloma warts. The wild-type viral warts suspension was used to induce treatable papilloma warts in the cutaneous tissue of Dutch Belted rabbits. The photosensitizing agents used intravenously were Photofrin II at 10 mg/kg of body weight and Chlorin e6 monoethylene diamine monohydrochloric acid (Chlorin e6 med HCl) at 1 mg/kg of body weight.  The lasers used were an argon-dye laser at 628 and 655 nm and a gold vapor laser at 628 nm.   The irradiances of 25 to 180 mW/cm2 were applied topically with an end-on lens optical fiber with total radiant doses of 7.5 to 54 J/cm2.  Photofrin II and the argon-dye laser at the highest light dosage (54 J/cm2) and Chlorin e6 monoethylene diamine monohydrochloride administered 2 hours before argon-dye laser irradiation at 655 nm at the highest light dosage (54 J/cm2) produced wart regression.  Total wart regression without recurrence was achieved with Photofrin II and the gold vapor laser at all light dosages.  The difference observed between the argon-dye laser and the gold vapor laser might be explained by the pulsed nature of the gold vapor laser, with its high-peak powers, some 5000 x the average measured light dose.  In this model, the smaller, less cornified lesions were more effectively treated with photodynamic therapy.'
      ), C02),
    (('Role of the monocyte-macrophage in influenza virus infection of lymphocytes: implications for HIV infection.  Knowledge of the pathogenesis of viruses which are less virulent than human immunodeficiency virus (HIV) may provide valuable insights into the pathogenesis of HIV infection.  Influenza virus, an enveloped RNA virus, infects monocyte-macrophages, although the infection is brief and abortive.  Isolated purified lymphocytes are completely resistant to infection.  In contrast, mixtures of lymphocytes and macrophages can synthesize all virus proteins.  Infection requires physical association of monocyte-macrophages and lymphocytes in "clusters." These studies with influenza virus suggest that the pathogenesis of virus infections in mixed cell cultures may be very different from that observed in purified cell populations, and they suggest that similar studies should be performed with HIV. '
      ), C02),
    (('Use of polymerase chain reaction for successful identification of asymptomatic genital infection with herpes simplex virus in pregnant women at delivery.  The polymerase chain reaction was adapted to the amplification of a herpes simplex virus (HSV) DNA sequence, common to HSV types 1 and 2 (HSV-1, HSV-2).  The amplified product was detectable by ethidium-bromide staining or Southern hybridization of gels and by dot hybridization.  The HSV polymerase chain reaction detected HSV DNA in samples obtained from eight patients with genital lesions from which HSV-2 was isolated in tissue culture and from four patients with labial lesions from which HSV-1 was isolated.  The HSV polymerase chain reaction identified HSV in clinical specimens obtained from 11 women who had asymptomatic genital HSV infections at delivery.  None of 11 samples obtained at delivery from women who had antibodies to HSV-2, but whose delivery cultures were negative, were positive by polymerase chain reaction and no false-positive reactions were obtained when the reaction mixture contained human cell DNA or varicella-zoster virus, cytomegalovirus, Epstein-Barr virus, or human papillomavirus DNA.'
      ), C02)
]

classifierModel = NaiveBayesClassifier(trainData)

a = 'A school blood drive before a measles outbreak permitted correlation of preexposure measles antibody titers with clinical protection using the plaque reduction neutralization (PRN) test and an EIA.'

print(classifierModel.classify(a))
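# A possible follow-up (sketch): prob_classify exposes per-category probabilities, which
# is more informative than the bare label here:
#
#     dist = classifierModel.prob_classify(a)
#     print(dist.max(), round(dist.prob(C01), 3), round(dist.prob(C02), 3))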

#print(category)
#print(content)

testData = [
    'Measles antibody: reevaluation of protective titers.',
    'A school blood drive before a measles outbreak permitted correlation of preexposure measles antibody titers with clinical protection using the plaque reduction neutralization (PRN) test and an EIA.',
    'Of 9 donors with detectable preexposure PRN titer less than or equal to 120, 8 met the clinical criteria for measles (7 seroconfirmed) compared with none of 71 with preexposure PRN titers greater than 120 (P less than .0001)',
    'Seven of 11 donors with preexposure PRN titers of 216-874 had a greater than or equal to 4-fold rise in antibody titer (mean, 43-fold) compared with none of 7 with a preexposure PRN titer greater than or equal to 1052 (P less than .02).',
    'Of 37 noncases with preexposure PRN titer less than 1052, 26 (70%) reported one or more symptoms compared with 11 (31%) of 35 donors with preexposure PRN titers greater than or equal to 1052 (P less than .002).',
    'By EIA, no case had detectable preexposure antibody; the preexposure geometric mean titer of asymptomatic donors (220) was not significantly higher than that of symptomatic donors who did not meet the clinical criteria for measles (153) (P = .10).',
Ejemplo n.º 42
0
        dt = pd.to_datetime(line[2], format='%Y-%m-%d %H:%M:%S')
        if dt > (pd.to_datetime(endtime, format='%Y-%m-%d %H:%M') -
                 pd.to_timedelta('0 days 06:00:00')) and dt < pd.to_datetime(
                     endtime, format='%Y-%m-%d %H:%M'):
            text.append(line[1])
        elif dt > pd.to_datetime(endtime, format='%Y-%m-%d %H:%M'):
            break

    print "Classifying tweets..."

    sentiment = 0.
    n = 0.
    pos = 0.
    neg = 0.
    with open('train-' + str(sys.argv[1]) + '.csv', 'r') as fp:
        cl = NaiveBayesClassifier(fp, format="csv")
    for line in text:
        # lines are already str in Python 3, so the old .decode('utf-8') is not needed
        prob_dist = cl.prob_classify(line)
        line_sent = prob_dist.max()
        n += 1
        # the training CSV apparently stores labels with a leading space, hence ' pos' / ' neg'
        if line_sent == ' pos':
            pos += 1
            sentiment += 1
        elif line_sent == ' neg':
            neg += 1
            sentiment -= 1
    print "Sentiment: ", sentiment
    print "N: ", n
    print "pos: ", pos
    print "neg: ", neg
Ejemplo n.º 43
0
for prefix in ["ind", "chi", "fre", "gre", "mex", "ita", "tha"]:
    train = []
    test = []
    with open(prefix + 'TrainData.csv', newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            train.append((row[0] + ": " + row[2], row[3]))

    with open(prefix + 'TestData.csv', newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            test.append((row[0] + ": " + row[2], row[3]))

    print("Read data for " + prefix)

    cl = NaiveBayesClassifier(train)
    pickle.dump(cl, open(prefix + "Classifier.pkl", "wb"))

    #Compute accuracy
    print "Model trained for " + prefix + ". Accuracy:" + str(
        cl.accuracy(test))

    print "Most informative features for " + prefix + ":"
    # Show 100 most informative features
    cl.show_informative_features(50)
    classifiers.append(cl)

print "Trained all classifiers, loading untagged data."

full = []
with open('notTaggedData.csv', newline='') as csvfile:
Ejemplo n.º 44
0
class ArticleTextAnalyzer:
    """Generic class to classify news articles and return most common words and phrases."""
    SENTENCES_CLASSIFIED_PER_ARTICLE = 50
    MAX_PROCESS_WORKERS = cpu_count(logical=False)
    STOP_WORDS = stopwords.words('english')
    STOP_WORDS.extend([
        'democrat', 'democrats', 'republican', 'republicans'
    ])

    def __init__(self, art_dict, search_term=''):
        self.article_blobs_dict = self.create_clean_sentences(art_dict, search_term)
        self.classifier = NaiveBayesClassifier(self.generate_training_data(self.article_blobs_dict))
        print('finished training classifier')

    @staticmethod
    def read_news_scraper_output_file_into_dict(filepath) -> dict:
        """
        Reads the CSV output from the NewsScraper class into a dictionary.

        :param filepath: File path of the CSV.
        :type filepath: str
        :return: Returns a dictionary of the form {site: list of articles}
        """
        with open(filepath) as f:
            reader = csv.DictReader(f, delimiter='|')
            prev_site = ''
            output_dict = {}
            for row in reader:
                if row['site'] == prev_site:
                    output_dict[row['site']].append(row['article'])
                else:
                    prev_site = row['site']
                    output_dict[row['site']] = [row['article']]
            return output_dict

    @staticmethod
    def generate_training_data(art_dict) -> List[tuple]:
        """
        Creates training data for classifier from article dict based on site classifications in config.SITE_CATEGORIES

        :param art_dict: dict of the form {site: {sentences: list, subjectivity: list}}
        :type art_dict: dict
        :return: list of tuples to use as training data for classifier
        """

        sentence_cl = {cl: [] for cl in SITE_CATEGORIES}
        for site, sentence_analysis in art_dict.items():
            for cl, site_set in SITE_CATEGORIES.items():
                if site in site_set:
                    sentence_cl[cl].extend([(sentence, cl) for sentence in sentence_analysis['sentences']])
        min_sentences = min(map(len, sentence_cl.values()))
        for cl in sentence_cl:
            sentence_cl[cl] = random.sample(sentence_cl[cl], min_sentences)
        print('Number of sentences used to train classifier by type: ' + str(min_sentences))

        return [tup for tup_list in sentence_cl.values() for tup in tup_list]
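    # Illustrative assumption for the shape of config.SITE_CATEGORIES consumed above:
    #     SITE_CATEGORIES = {'liberal': {'site_a', 'site_b'},
    #                        'conservative': {'site_c', 'site_d'}}
    # ('conservative' is a real label, since calculate_means queries it; the other key is
    # a guess.) Each class is downsampled to the smallest class so training is balanced.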

    @staticmethod
    def create_clean_sentences(article_dict, search_term) -> dict:
        """
        Wrapper for multiprocessing cleaning each article and creating sentences for classification

        :param search_term: word or phrase being searched
        :type search_term: str
        :param article_dict: dict of the form {site: list}
        :type article_dict: dict
        :return: dict of sites with dict values that contain cleaned and lemmatized sentences and subjectivity
        """
        clean_dict = {site: {'sentences': [], 'subjectivity': []} for site in article_dict}
        with ProcessPoolExecutor(max_workers=ArticleTextAnalyzer.MAX_PROCESS_WORKERS) as ex:
            futures = {ex.submit(ArticleTextAnalyzer.clean_text_blob, search_term,
                                 TextBlob(article), site, article_list.index(article), len(article_list)): site
                       for site, article_list in article_dict.items() for article in article_list}
            for f in as_completed(futures):
                for key, value in f.result().items():
                    clean_dict[futures[f]][key].extend(value)

        return clean_dict

    @staticmethod
    def clean_text_blob(search_term, art_tb, site, index, num_articles) -> dict:
        """
        Accepts articles as TextBlobs and breaks them up into sentences, removes stop words and lemmatizes,
        and measures subjectivity

        :param search_term: word or phrase being searched - will be excluded from final text
        :type search_term: str
        :param art_tb: TextBlob of a full article
        :type art_tb: TextBlob
        :param site: website key
        :type site: str
        :param index: index of article in list of articles for site
        :type index: int
        :param num_articles: total articles scraped for site
        :type num_articles: int
        :return: dict of the form {'sentences': list, 'subjectivity': list}
        """
        print(site + ': cleaning article ' + str(index + 1) + ' of ' + str(num_articles) + '.')
        clean_article = []
        subjectivity = []
        for sentence in art_tb.sentences[:ArticleTextAnalyzer.SENTENCES_CLASSIFIED_PER_ARTICLE]:
            if sentence.raw.find(SITE_SPIDER_CONFIG[site]['full_name']) > -1:
                continue
            clean_sentence = ''
            for word, pos in sentence.pos_tags:
                if word.lower() not in ArticleTextAnalyzer.STOP_WORDS and word not in ArticleTextAnalyzer.STOP_WORDS:
                    if word.lower() in TextBlob(search_term).words.lower():
                        continue
                    elif pos == 'NNP' or word == 'Crow':
                        clean_sentence += ' ' + word
                    elif pos == 'NNPS':
                        clean_sentence += ' ' + word.singularize()
                    else:
                        clean_sentence += ' ' + word.lemmatize().lower()
            clean_article.append(clean_sentence.strip())
            subjectivity.append(sentence.sentiment.subjectivity)
        print(site + ': finished cleaning article ' + str(index + 1) + ' of ' + str(num_articles) + '.')

        return {'sentences': clean_article, 'subjectivity': subjectivity}

    @staticmethod
    def generate_wordclouds(data, axes) -> None:
        """
        Create a word cloud for each bias and assign them to plot axes

        :param data: dict of the form {site: {sentences: list, subjectivity: list}}
        :type data: dict
        :param axes: axes of matplotlib grid spec
        :type axes: list
        :return: None
        """
        cloud_strings = {bias: '' for bias in SITE_CATEGORIES}
        for key, sa in data.items():
            for bias, site_set in SITE_CATEGORIES.items():
                if key in site_set:
                    cloud_strings[bias] += ' ' + ' '.join([sentence for sentence in sa['sentences']])

        colors1 = [(1, 1, 1), (0, 0.48, 1)]
        colors2 = [(1, 1, 1), (1, 0.13, 0.13)]
        colormaps = [
            LinearSegmentedColormap.from_list(name='cm1', colors=colors1, N=100),
            LinearSegmentedColormap.from_list(name='cm1', colors=colors2, N=100)
        ]

        for axis, (bias, cloud_string), cm in zip(axes, cloud_strings.items(), colormaps):
            wc = WordCloud(
                background_color='black',
                max_words=100,
                collocations=True,
                colormap=cm
            ).generate_from_frequencies(TextBlob(cloud_string).np_counts)
            axis.imshow(wc)
            axis.axis('off')
            axis.set_title(bias.capitalize(), color='white', fontsize=16)

    def classify_nonpartisan_articles(self) -> dict:
        """
        Wrapper for multiprocessing article classification

        :return: Dictionary of the form {site: [classification(article)]
        """
        classifications = {}
        with ProcessPoolExecutor(max_workers=ArticleTextAnalyzer.MAX_PROCESS_WORKERS) as ex:
            for site, cl_dict in zip(self.article_blobs_dict.keys(), ex.map(self.calculate_means,
                                                                            self.article_blobs_dict.items())):
                classifications[site] = cl_dict

        return classifications

    def calculate_means(self, blobs_dict_item) -> dict:
        """
        Calculate the mean bias and subjectivity of the site

        :param blobs_dict_item: tuple of the form (site, {sentences: list, subjectivity: list})
        :type blobs_dict_item: tuple
        :return: dict of the form {'bias': double, 'subjectivity': double}
        """
        site, sentence_analysis = blobs_dict_item
        print('calculating mean bias and subjectivity for ' + site)
        return {
            'bias': mean([self.classifier.prob_classify(sentence).prob('conservative')
                          for sentence in sentence_analysis['sentences']]),
            'subjectivity': mean(sentence_analysis['subjectivity'])
        }

    def plot_output(self, cl) -> None:
        """
        Create visualizations for text data analysis

        :param cl: dict of the form {'bias': double, 'subjectivity': double}
        :type cl: dict
        :return: None
        """
        # create dataframe for scatter plot
        plot_df = pd.DataFrame.from_dict(cl)
        plot_df = plot_df.transpose().reset_index()
        plot_df.rename(columns={'index': 'site'}, inplace=True)
        plot_df['full_name'] = plot_df['site'].apply(lambda x: SITE_SPIDER_CONFIG[x]['full_name'])
        print(plot_df)

        # scatter plot styling
        sns.set_style('ticks', SCATTER_PLOT_STYLE_PARAMS)

        # layout for output
        fig = plt.figure(constrained_layout=True)
        gs = fig.add_gridspec(2, 2)
        ax1 = fig.add_subplot(gs[0, :])
        ax1.spines['right'].set_visible(False)
        ax1.spines['top'].set_visible(False)
        ax2 = fig.add_subplot(gs[1, 0])
        ax3 = fig.add_subplot(gs[1, 1])

        # create scatter plot with point labels and assign to top row of layout
        sns.scatterplot(
            data=plot_df,
            x='bias',
            y='subjectivity',
            hue='bias',
            ax=ax1,
            legend=None,
            palette=sns.color_palette('coolwarm', as_cmap=True)
        )
        annotations = [
            ax1.annotate(
                text=plot_df.full_name[i],
                xy=(plot_df.bias[i], plot_df.subjectivity[i]),
                textcoords='offset points',
                xytext=(0, 5),
                ha='center',
                fontsize=8,
                color='white'
            )
            for i in range(plot_df.shape[0])
        ]
        adjust_text(annotations)
        # for i in range(plot_df.shape[0]):
        #     ax1.annotate(
        #         text=plot_df.full_name[i],
        #         xy=(plot_df.bias[i], plot_df.subjectivity[i]),
        #         textcoords='offset points',
        #         xytext=(0, 5),
        #         ha='center',
        #         fontsize=8,
        #         color='white'
        #     )

        # create word clouds and assign to bottom two slots in layout
        self.generate_wordclouds(
            data=self.article_blobs_dict,
            axes=[ax2, ax3]
        )

        plt.show()
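# A minimal end-to-end sketch for ArticleTextAnalyzer (the CSV path and search term are
# illustrative assumptions; the __main__ guard matters because the class spawns worker
# processes via ProcessPoolExecutor):
#
#     if __name__ == '__main__':
#         articles = ArticleTextAnalyzer.read_news_scraper_output_file_into_dict('scraped.csv')
#         analyzer = ArticleTextAnalyzer(articles, search_term='election')
#         analyzer.plot_output(analyzer.classify_nonpartisan_articles())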
Ejemplo n.º 45
0
    def __init__(self,
                 message="",
                 pre_result="",
                 prob_dist=0,
                 prob_dist_max=0,
                 positive=0,
                 negative=0,
                 joy=0,
                 anger=0,
                 love=0,
                 hate=0,
                 certainty=0,
                 boredom=0,
                 intensity=0,
                 regret=0,
                 challenging=0,
                 agreeable=0,
                 desire=0,
                 calm=0,
                 sarcastic=0,
                 emphatic=0,
                 pride=0,
                 accusative=0,
                 admiration=0,
                 inquisitive=0,
                 modest=0,
                 instructive=0,
                 ambivalence=0,
                 vulgarity=0,
                 train=None,
                 cl=None,
                 punctCountDict=None,
                 wordCount=0,
                 sentenceCount=0,
                 normalizedProbValues=None,
                 sentences=None,
                 sentencesProbValues=None,
                 massResults=None):

        # COPY AND PASTE ALL OF BASE_CORPUS.TXT INTO SELF.TRAIN BELOW FOR TRAINING NEW MODELS
        # When reading base_corpus into list for training (in initialTrain function, we run into errors with escaped chars)
        self.train = []

        self.message = message
        # None defaults above avoid Python's shared-mutable-default-argument pitfall
        self.punctCountDict = punctCountDict if punctCountDict is not None else {}
        self.wordCount = wordCount
        self.sentenceCount = sentenceCount

        self.pre_result = pre_result
        self.prob_dist = prob_dist
        self.prob_dist_max = prob_dist_max

        self.positive = positive
        self.negative = negative
        self.joy = joy
        self.anger = anger
        self.love = love
        self.hate = hate
        self.certainty = certainty
        self.boredom = boredom
        self.intensity = intensity
        self.regret = regret
        self.challenging = challenging
        self.agreeable = agreeable
        self.desire = desire
        self.calm = calm
        self.sarcastic = sarcastic
        self.emphatic = emphatic
        self.pride = pride
        self.accusative = accusative
        self.admiration = admiration
        self.inquisitive = inquisitive
        self.modest = modest
        self.instructive = instructive
        self.ambivalence = ambivalence
        self.vulgarity = vulgarity

        self.cl = cl if cl is not None else NaiveBayesClassifier([])
        self.normalizedProbValues = normalizedProbValues if normalizedProbValues is not None else {}
        self.sentences = sentences if sentences is not None else []
        self.sentencesProbValues = sentencesProbValues if sentencesProbValues is not None else []
        self.massResults = massResults if massResults is not None else []
Ejemplo n.º 46
0
import pickle
import sys

from textblob.classifiers import NaiveBayesClassifier


def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)


arguments = sys.argv

if len(arguments) < 2:
    print(
        "This is a command-line utility that uses the following arguments: output_name"
    )
    raise Exception("No arguments found")

output_name = str(arguments[1])

train = [('I love this sandwich.', 'pos'),
         ('this is an amazing place!', 'pos'),
         ('I feel very good about these beers.', 'pos'),
         ('this is my best work.', 'pos'), ("what an awesome view", 'pos'),
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'), ('he is my sworn enemy!', 'neg'),
         ('my boss is horrible.', 'neg')]

new_classifier = NaiveBayesClassifier(train)

save_object(new_classifier, '{0}.pkl'.format(output_name))

print("Blank classifier created")
stopwords = []
for word in input_stopwords:
    stopwords.append(word.rstrip('\n'))

# prepare training and test data
create_json_file("training_set", "training.json")
categories, sentences = prepare_test_data("test_set")

# Bayes Classifier
print("Training Naive Bayes Classifier...")
start_nbc = time.time()
with open('training.json', 'r') as training:
    nbc = NaiveBayesClassifier(training, format="json")
stop_nbc = time.time()
print("Training Naive Bayes Classifier completed...")
elapsed = stop_nbc - start_nbc
print("Training time (in seconds): " + str(elapsed))
print("Testing Naive Bayes Classifier...")
correct = 0
start_nbc = time.time()
for i in range(0, len(sentences)):
    category = str(nbc.classify(sentences[i])).lower()
    expected = str(categories[i]).lower()
    if category == expected:
        correct += 1
stop_nbc = time.time()
elapsed = stop_nbc - start_nbc
print("Number of tests: " + str(len(sentences)))
Ejemplo n.º 48
0
with open ("articles/boston/fox.txt", "r") as myfile:
    data=myfile.read().replace('\n', '')



train = [
('I love this sandwich.', 'pos'),
('This is an amazing place!', 'pos'),
('I feel very good about these beers.', 'pos'),
('I do not like this restaurant', 'neg'),
('kill', 'neg'),
("murder", 'neg'),
("terrorist", "neg")
]
cl = NaiveBayesClassifier(train)
print cl.classify("kill murder death suicide terrorist")

#g = TextBlob(data1, analyzer = NaiveBayesAnalyzer())
#print g.sentiment


cleanData= data.decode('utf-8')
test = TextBlob(cleanData, analyzer = NaiveBayesAnalyzer())

for s in test.sentences:
	s1=str(s)
	blob = TextBlob(s1, analyzer = NaiveBayesAnalyzer())
	print 's.classsssify', blob.sentiment
	test = TextBlob(s1)
	print 'naive bayes yields', test.sentiment
Ejemplo n.º 49
0
def final_utterance_appreciation_analysis(final_utterance):
	"""
	Input: A list of final utterances by the user.
	Output: The percentage of the people expressing appreciation at the end of the conversation.

	Algorithm:
	1. Create a training set and a validation set of conversations which are manually classified into "appreciation" and "nonappreciation".
	   The differentiation criterion is the presence of words of gratitude.
	2. Train the Naive Bayes classifier using the training set.
	3. If the accuracy of the classifier on the validation dataset exceeds 90%,
	   apply the classifier to every utterance in the list final_utterance using a for loop.
	4. Use a dictionary during the loop to count the people who express gratitude and those who do not.
	5. Calculate the percentage of people who express gratitude.

	How the Naive Bayes Classifier from the TextBlob Package Works:

	For the training dataset:
	In order to find the probability of classifying a sentence as either "appreciation" or "nonappreciation",
	the algorithm first removes meaningless stop words such as "the" and "a" from the sentence.
	Then it calculates the frequency of the remaining tokens and creates a likelihood table that maps the tokens (the features)
	to the probability of each token being labelled "appreciation" or "nonappreciation".

	For a new sentence, it removes the stop words and calculates the probability of the sentence being "appreciation"
	or "nonappreciation" based on the 'naive' assumption that all features are independent, given the label:
	|                       P(label) * P(f1|label) * ... * P(fn|label)
	|  P(label|features) = --------------------------------------------
	|                                         P(features)

	"""

	classified_dict = {"appreciation": 0, "non-appreciation": 0}

	train = [('Very well. How about the price for the trip to Essen?', 'nonappreciation'),
	         ("I'd like to book the Cairo package. Thank you!", 'appreciation'),
	         ('oh heck yeah!! economy - I need the money', 'nonappreciation'),
	         ('Then I will take it!', 'nonappreciation'),
	         ('Awesome!!! Thanks!!!', 'appreciation'),
	         ('What??? :disappointed:', 'nonappreciation'),
	         ('Yes do that', 'nonappreciation'),
	         ('Thank you kindly!', 'appreciation'),
	         ('Ok, thank you for your time anyways', 'appreciation'),
	         ('thank you very much for your patience you are an absolute gem','appreciation'),
	         ('Thank you so much!', 'appreciation'),
	         ('Lots of swanky hotels to choose from! Well, based on length of trip, that one to SL sounds like a great deal. I think I wanna go ahead with booking that', 'nonappreciation'),
	         ('Uh huh', 'nonappreciation'),
	         ('Jerusalem to Kingston. I swear if I have to repeat myself again then I will sue', 'nonappreciation'),
	         ('Ok, thanks anyway','appreciation'),
	         ('Looking to go from San Francisco to MArseille. ', 'nonappreciation'),
	         ('Book me for September 18 to 22. Let me know if its more than 2800 because thats all I can afford', 'nonappreciation'),
	         ('duuuude. ah\nwhat about Ciudad Juarez', 'nonappreciation'),
	         ('Well what if I leave the 8th', 'nonappreciation'),
	         ('Ok :+1: we out', 'nonappreciation'),
	         ('Yes!!!!!', 'nonappreciation'),
	         ('ok fine lets do it, business class please', 'nonappreciation'),
	         ('WOE IS ME, FOR I HAVE NOT', 'nonappreciation'),
	         ('ah damn', 'nonappreciation'),
	         ('okay bye', 'nonappreciation'),
	         ('Yikes. Ok Buenos Aires it is\nBook it please\nBusiness class', 'nonappreciation'),
	         ('shit yassss we goin in. Book it for us, please.', 'nonappreciation'),
	         ('well, this is rather disappointing we cannot spend our family vacation near the airport. i wont be booking anything today in this case, goodbye', 'nonappreciation'),
	         ('Thanks! Very excited!', 'appreciation'),
	         ('NOT GOOD', 'nonappreciation'),
	         ("you're a lifesaver", "appreciation"),
	         ('ah. if i could book, i would book this one. well thanks for your time, ill come back next year and save my vacation days for a trip to San Diego.', "appreciation"),
	         ('Great, thanks a lot!', "appreciation"),
	         ("WHAT!?!?! Ugh, kill me now. Okkay fine. I'll look somewhere else.", "nonappreciation"),
	         ("I guess that sound okay, I'll take it", "nonappreciation"),
	         ("Ok, that's fine\nBook it", "nonappreciation"),
	         ('I like the sound of that one. Heart of the city would be better than near a mall.\nLets book business class in Buenos Aires.', "nonappreciation"),
	         ('cool bye', "nonappreciation"),
	         ("let's book :wink:", "nonappreciation"),
	         ('Done, booked! Thanks!', 'appreciation'),
	         ('Okay will consider it and get back to you, thanks!', 'appreciation'),
	         ('DOPE. book it', 'nonappreciation'),
	         ('Hmm. Okay well im just gonna take the information you gave me and discuss it with my wife before booking something she might not enjoy. Thanks for the help!', 'appreciation'),
	         ('Thanks! You were a great help!', 'appreciation'),
	         ('i said 2.5 wasnt good enough', 'nonappreciation'),
	         ('No thats the last straw, we are taking our business elsewhere', 'nonappreciation'),
	         ('Thanks :slightly_smiling_face:', 'appreciation'),
	         ('Hi Do you fly from Ulsan to London??', 'nonappreciation'),
	         ('Ok then leave from Beijing', 'appreciation'),
	         ('i need to get away from a little longer than that one. so lets book vancouver please and thanks', "appreciation"),
	         ("Let's book Valencia. Pleasure doing business with you.", "appreciation"),
	         ('Thank you bot.', "appreciation"),
	         ('No worries, thanks!', "appreciation"),
	         ("That sucks. I'll look somewhere else", "nonappreciation"),
	         ('I am giving you one last time to you your job. you better tread carefully here, my friend,\nCairo to Porto Alegre or I will raise hell', "nonappreciation"),
	         ('Bye. And thanks for nothing.', "nonappreciation"),
	         ("Yes, I'll take it. Thank you", "nonappreciation"),
	         ('no there are 7 of us', "nonappreciation"),
	         ('for 712.00 it sounds like a very nice deal I will book flight on August 26 for 6 days. Thank you for your help.', 'appreciation'),
	         ('3.5 it is then. lets book it', 'nonappreciation'),
	         ('but fine, book it', 'nonappreciation'),
	         ('no can do', "nonappreciation"),
	         ('Thank you very much.', "nonappreciation"),
	         ('gracias!', "appreciation"),
	         ("Perfect! I'll book it", "nonappreciation"),
	         ('Do you do flights leaving from Tel Aviv?', "nonappreciation"),
	         ('that seem good, i will book! Gracias!', "appreciation"),
	         ("No it's alright! thanks though!", "appreciation"),
	         ('okay well its crucial i get there from Fortaleza so I will call someone else', "nonappreciation"),
	         ('how is that possible', "nonappreciation"),
	         ('Well what about in Goiania.?','nonappreciation'),
	         ('ok no thats not good enough im going elsewhere', "nonappreciation"),
	         ('amazing! thanks!', "appreciation"),
	         ('Lets do Business class', "nonappreciation"),
	         ("Oh Okay well i'll look somewhere else. Thanks anyway.", "appreciation"),
	         ('you dont have any flights to birmingham yeah i find that pretty freakin hard to believe', "nonappreciation"),
	         ('This is HORRIBLE', "nonappreciation"),
	         ("yes, you're right.. thank you", "appreciation"),
	         ('ok thanks so much', "appreciation"),
	         ('what if i changed the dates. sept 2 and 23', "nonappreciation"),
	         ('Thank you, but I will go use another service that can better satisfy my escapist fantasies', "appreciation"),
	         ("I really want a spa. If you have nothing to offer with a spa, I'll shop around then.", 'nonappreciation'),
	         ('Oh dear, thats quite above our 3 thousand dollar budget.', 'nonappreciation'),
			 ('dope! thanks', 'appreciation'),
			 ('No worries! Bye!', 'nonappreciation'),
			 ('Ok Lets lock in San Diego', "nonappreciation"),
			 ("You're great", 'appreciation'),
			 ('ok. book it out of Milan please', 'nonappreciation'),
			 ('ill go for Ciudad Juarez', "nonappreciation"),
			 ('Thank you wozbot!', "appreciation"),
			 ('yes please', "nonappreciation"),
			 ("Usually I wouldn't want to be caught dead in a 3.5 star hotel, but I'm short on time here. Get us on that trip, business class", "nonappreciation"),
			 ('GREAT Thanks!!!!!!!!', "appreciation"),
			 ("I think I'll stick to the 11 day package in Belem at Las Flores, seems like the best deal and it had a good user rating. Let's book that one.", "nonappreciation"),
			 ('thnx', "appreciation"),
			 ('no it HAS to be baltimore and it HAS to be perfect. thanks anyways', "appreciation"),
			 ("Perfect! I'll book it", "nonappreciation"),
			 ("That's it?", "nonappreciation"),
			 ('I shall take the 5 star package!', "nonappreciation"),
			 ('thank you so much', "appreciation"),
			 ('YOU ARE RUINING MY MARRIAGE', "nonappreciation")]

	validation = [('Yes chief', "appreciation"),
				 ("Thanks! I'm sure it will be amazinggg", "appreciation"),
				 ("Weeeelllll this is a no brainer, I 'll just leave the next day and save a whole lotta money! Can you book this for me right away so I don't lose it?", "nonappreciation"),
				 ("Ok I'll book the package with 8 days in Pittsburgh from August 17th to the 24th. Thank you.", "appreciation"),
				 ('Thanks - will do', "appreciation"),
				 ('Killing it! thank', "appreciation"),
				 ('Thanks, you too', "appreciation"),
				 ('thank you wozbot :slightly_smiling_face: toodles', "appreciation"),
				 ('spectacular book please', "nonappreciation"),
				 ("Well, I reckon I'll just book this one.", "nonappreciation"),
				 ("yea so I've heard... send me to Paris then", 'nonappreciation'),
				 ('Fortaleza\n5 stars', "nonappreciation"),
				 ('I guess I can increase my budget by 1000', 'nonappreciation'),
				 ('ok see ya', "nonappreciation"),
				 ('leaving from anywhere??', "nonappreciation"),
				 ("That's it! Thank you so so much :):):)", "appreciation"),
				 ('Done. Book it.', "nonappreciation"),
				 ('Great, sounds perfect. Thank you.', "appreciation"),
				 ('Thats all i had my heart set on!!', "nonappreciation"),
				 ("That sounds like the better hotel. Can't be too cautious travelling by myself for the first time! I will book that deal in an economy class ticket, I'm not ready for business class YET, need to pass that bar exam!",  "nonappreciation"),
				 ('Then I will take my search elsewhere', "nonappreciation"),
				 ('Ya thanks', "appreciation"),
				 ('Thank you, glad to be going back so soon', "appreciation"),
				 ('well okay I can always take the tram in to the city. I will book that one.', "nonappreciation"),
				 ('This is hopeless', "nonappreciation"),
				 ('Great, thank you. I will most certainly book my next vacation with you.', "appreciation"),
				 ('thank youuuu', "appreciation"),
				 ('Lock it down', "nonappreciation"),
				 ("Please help! My lovely parents have been married fof 20 years and they've never taken a trip together. I'm thinking of getting them out of town Sept 6 to 9\nyou got anything good for 2 adults leaving sao paulo, for under 2400?", "nonappreciation"),
				 ('we can also go to Kochi', "nonappreciation"),
				 ('no but we can stay for 9 days instead of 3', "nonappreciation"),
				 ('thanks you!', "appreciation"),
				 ('Just under budget. ok bye now', "nonappreciation"),
				 ('thankyou', "appreciation"),
				 ('can you tell me the price and nearby attractions?', "nonappreciation"),
				 ('1 adult', "nonappreciation"),
				 ('San Jose to Porto Alegre please. oh it needs to be between sept 18 to 22', "nonappreciation"),
				 ('Ok sold! please enter a booking for us', "nonappreciation"),
				 ('I can leave from Tel aviv and I want to go to San Jose with 7 adults for 2500', "nonappreciation"),
				 ('Well what about in Goiania.?', "nonappreciation"),
				 ('you are being unhelpful just answer yes or no, is it near a park or beach?', "nonappreciation"),
				 ('thak you', "appreciation"),
				 ('I shall take the 5 star package!', "nonappreciation"),
				 ('Okay but what if I leave from Naples instead. Can you get me to Manas from Naples?', "nonappreciation"),
				 ("I'm a woman! Try to find something 9000 or less if you can.", "nonappreciation"),
				 ("That's perfect.", "nonappreciation"),
				 ('ok. fine. I have a 4500 $ budjet and I will star as long as that money lasts. thx', "appreciation"),
				 ('sure fine flexible actually no i dont wanna go any more', "nonappreciation"),
				 ("No, unfortunately I can't. Guess I'll just take a staycation this time :disappointed: Thanks anyway", "appreciation"),
				 (" I'll book this one. Thank you, friend!", "appreciation"),
				 ('No we can only go to Porto... or Porto. Thanks.', "appreciation")]

	cl = NaiveBayesClassifier(train)  # train the Naive Bayes classifier
	if cl.accuracy(validation) > 0.90:  # proceed only if validation accuracy exceeds 90%
		cl.update(validation)  # fold the validation set into the classifier's training data

		for m in final_utterance:
			if cl.classify(m) == "appreciation":
				classified_dict["appreciation"] += 1
			else:
				classified_dict["non-appreciation"] += 1

	# calculate the percentage of people expressing appreciation,
	# guarding against division by zero when nothing was classified
	total = classified_dict["appreciation"] + classified_dict["non-appreciation"]
	if total == 0:
		return "Validation accuracy was below 90%, so no utterances were classified."
	return "{}% people express appreciation.".format(100.0 * classified_dict["appreciation"] / total)
Ejemplo n.º 50
0
class NLTKHashtagsClassifier(Classifier):
    """
    Classifies InstagramProfiles as blogger, brand or undecided.

    This is currently a PROTOTYPE.
    """

    # list of all available categories for categorization
    AVAILABLE_CATEGORIES = [
        'brand',
        'blogger',
        'undecided',
    ]

    classifier = None
    undecided_margin = None

    def __init__(self,
                 blogger_hashtags=None,
                 brand_hashtags=None,
                 undecided_margin=None):
        """
        Explicitly inits lists of hashtags and creates NLTK Classifier object.
        Lists are not intended to contain unique hashtags.
        :param blogger_hashtags: list of lists of hashtags suitable for bloggers
        :param brand_hashtags: list of lists of hashtags suitable for brands
        :param undecided_margin: probability margin when to consider classification result as undecided
        :return:
        """
        from textblob.classifiers import NaiveBayesClassifier

        initial_train = []
        for v in (blogger_hashtags or []):
            initial_train.append((v, self.AVAILABLE_CATEGORIES[1]))  # 'blogger'
        for v in (brand_hashtags or []):
            initial_train.append((v, self.AVAILABLE_CATEGORIES[0]))  # 'brand'
        self.classifier = NaiveBayesClassifier(initial_train)
        self.undecided_margin = undecided_margin

    def classify_unit(self, source=None, **kwargs):
        """
        This method is the core of classification algorithm. It receives source data for classification (object, model,
        string, etc.) and returns a value of classification category for this object.
        For example, we use InstagramProfile as source data, and result could be either 'brand' or 'blogger'
        or 'undecided'.
        """
        # return 'brand'

        cat_classified = self.classifier.classify(source)
        probability = self.classifier.prob_classify(source)

        # TODO: add probability_margin logic here

        return cat_classified

    def classify_queryset(self, source_queryset=None, **kwargs):
        """
        Helper method. Same as above but performs the whole queryset.
        Return queryset
        """
        # TODO: Think how to do it for this classifier.

        raise NotImplementedError

    def update_classifier(self, extra_data=None):
        """
        Updates the underlying classifier with extra labelled training data.
        """
        if extra_data is not None:
            self.classifier.update(extra_data)
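
# A hypothetical usage sketch of the classifier above. The hashtag strings are
# invented examples; in practice each training item would come from real
# InstagramProfile data.
clf = NLTKHashtagsClassifier(
    blogger_hashtags=['#ootd #fashionblogger #style', '#travelblog #wanderlust'],
    brand_hashtags=['#sale #newarrivals #shopnow', '#discount #brand'],
)
print(clf.classify_unit('#shopnow #sale #discount'))  # expected: 'brand'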
def checkImage(app):

    warnings.filterwarnings("ignore")

    train = [
        ('Find a retail location near you.', 'moneypak'),
        ('Look for a MoneyPak in the prepaid section.', 'moneypak'),
        ('Take it to the cashier and load it with cash.', 'moneypak'),
        ('To pay fine you should enter the digits MoneyPak resulting pass in the payment form and press Pay MoneyPak.',
         'moneypak'),
        ('Amount of fine is 500 dollars.', 'moneypak'),
        ('After submitting the MoneyPak voucher your fine will be redeemed in  hours.',
         'moneypak'),
        ('You can settle the fine with MoneyPak vouchers.', 'moneypak'),
        ('As soon as the money arrives to the Treasury account your Mobile Device will be unlocked and all information will be decrypted in course of  hours.',
         'moneypak'),
        ('Fine should be paid not later than in 12 hours since current time.',
         'moneypak'),
        ('To unlock your device and to avoid other legal consequences you are obligated to pay a release fee of 300 dollars',
         'moneypak'),
        ('Payable through GreenDot MoneyPak.', 'moneypak'),
        ('load it with 300 dollars and enter the code.', 'moneypak'),
        ('The penalty as a base measure of punishment on you which you are obliged to pay in a current of is imposed.',
         'moneypak'),
        ('Click to win lottery worth ten millions.', 'moneypak'),
        ('This device lock is aimed to stop your illegal activity.', 'threat'),
        ('Your Device has been blocked up for safety reasons listed below.',
         'threat'),
        ('All the actions performed on this device are fixed.', 'threat'),
        ('You have been blocked to use your device. you need to pay a fee of rupees 500 to unblock.',
         'threat'),
        ('In less than 3 days your device data will be sent to our servers and you will be blocked to use your data. To abort this you should pay a ransom of 500 rupees.',
         'threat'),
        ('Please mind that both your personal identities and location are well identified and criminal case can be opened against you in course of  hours as of commission of crimes per above Articles.',
         'threat'),
        ('The penalty set must be paid in course of  hours as of the breach.',
         'threat'),
        ('On expiration of the term hours that follow will be used for automatic collection of data on yourself and your misconduct and criminal case will be opened against you.',
         'threat'),
        ('This device is locked due to the violation of the federal laws of the United States of America',
         'threat'),
        ('The messages with terroristic motives attempts in respect of political persons and also in respect of top public officials were sent from your device',
         'threat'),
        ('According to these data you will be permitted to pay fine in State Treasury in the consequence of initiative directed on protection of cyber space in U.S.A and in doing so to seize your clerical correspondence and taking your criminal case to court for decision formulation.',
         'threat'),
        ('In case of penalty non-redemption your case will be taken to court for the further decision formulation and determination of your criminal case.',
         'threat'),
        ('Seize clerical correspondence taking criminal case court decision formulation',
         'threat'),
        ('Penalty non redemption case taken court decision formulation determination criminal',
         'threat'),
        ('For this reason your device has been locked.', 'threat'),
        ("Information on your location and snaphots containing your face have been uploaded on the fbi cyber crime department's datacenter.",
         'threat'),
        ('According to these positions your actions bear criminal character and you are a criminal subject.',
         'threat'),
        ("If you don't adhere to the instructions provided you can be jailed under cyber crime law.",
         'threat'),
        ("Send your phone details if you want to unlock your phone.",
         'threat'),
        ('install', 'non-threat'),
        ('@string', 'non-threat'),
        ('The government policies have been changed', 'non-threat'),
        ('Under supervision of FBI.U.S.A. Ministry of Interior Interpol Copyright Alliance International Cyber Security Protection Alliance.',
         'non-threat'),
        ('You are accused of committing the crime envisaged by Article 1 of United States of America criminal law.',
         'non-threat'),
        ('Article 1 of United States of America criminal non-threat provides for the punishment of deprivation of liberty for terms from 5 to  years.',
         'non-threat'),
        ('Article Section Cause', 'non-threat'),
        ('Please ensure you have this app in your phone.', 'non-threat'),
        ('The policies of government  has been changed', 'non-threat'),
        ('Your phone has been affected by virus. Download this software',
         'non-threat'),
        ('you have been restricted by government agencies to continue',
         'non-threat'),
        ('Article 1 Section 8 Cause 8 of the Criminal Code provides for a fine of two to five hundred minimal wages or a deprivation of liberty for two to eight years.',
         'non-threat'),
        ('Thus violating article 2 of the Criminal Code of United States of America Article 2 of the Criminal Code provides for a deprivation of liberty for four to twelve years.',
         'non-threat'),
        ('Spam distribution on e-mail addresses of private persons containing the advertisement of illegal drugs addictive substances or psychopharmaceuticals on the territory of the USA',
         'law'),
        ('Disconnection or disposal of the device or your attempts to unlock the device independently will be apprehended as unapproved actions interfering the execution of the law of the united states of america (read section 1509 - obstruction of court orders and section 1510 - obstruction of criminal investigations.',
         'non-threat'),
        ('In this case and in case of penalty non-payment in a current of three calendar days from the date of this notification. The total amount of penalty will be tripled and the respective fines will be charged to the outstanding penalty.',
         'non-threat'),
        ('In case of dissent with the indicted prosecution you have the right to challenge it in court.',
         'non-threat'),
        ('Distribution of false information about politicians.', 'non-threat'),
        ('You have been subjected to violation of Copyright and Related Rights Law (Video Music Software) and illegally using or distributing copyrighted contents',
         'copyright'),
        ('Distribution of episodes containing abuse or p**n material with participation of animals',
         'p**n'),
        ('Distribution and keeping of p**n material with participation of children',
         'p**n'),
        ('You are accused of viewing and storage of forbidden child pornography and zoophilia.',
         'p**n'),
        ('You have violated World Declaration on non-proliferation of child pornography.',
         'p**n'),
        ('You have been viewing or distributing prohibited Pornographic content (Child Pornography photos and etc were found on your device).',
         'p**n'),
        ('Video materials of p**n content including sexual acts with children (persons up to 9 years old).',
         'p**n'),
        ('Video materials containing the scene of child (persons up to 9 years) and animals abuse.',
         'p**n'),
        ('Video materials are made with the help of your device contain the scenes of cruel child and animals abuse.',
         'p**n'),
        ('As a result of full scanning of your device some suspicious files have been found.',
         'p**n'),
        ('Your attendance of the forbidden pornographic sites has been fixed.',
         'p**n'),
        ('Depiction of animal cruelty.', 'p**n'),
        ('Whoever knowingly creates sells or possesses a depiction of animal cruelty with the intention of placing that depiction in interstate or foreign commerce for commercial gain shall be fined under this title or imprisoned not more than 5 years or both.',
         'p**n'),
        ('Certain activities relating to material constituting or containing child pornography.',
         'p**n'),
    ]

    c1 = NaiveBayesClassifier(train)

    path = "F:\\Apktool\\%s\\res\\" % app
    os.chdir(path)

    all_files = os.listdir(path)
    #print(all_files)

    labels = []
    textL = []
    imageName = []
    z = 0
    for i in all_files:
        path = "F:\\Apktool\\%s\\res\\%s" % (app, i)
        os.chdir(path)
        all_files_2 = os.listdir(path)

        #print(all_files_2)

        for j in all_files_2:
            ext = os.path.splitext(j)[-1].lower()
            if (ext == ".gif" or ext == ".jpg" or ext == ".png"
                    or ext == ".jpeg"):
                #print("\n" + j)
                imageName.append(j)
                im = Image.open(j)
                text = pytesseract.image_to_string(im, lang='eng')
                if (text != ""):
                    print("   ")
                    z += 1
                    #print(text)
                    textL.append(text)
                    blob = TextBlob(text, classifier=c1)
                    sr = blob.classify()
                    #print(sr)
                    labels.append(sr)
                    print("   ")

    print("\n")
    while (z > 0):
        print("TEXT IS    ", textL[z - 1], " ", labels[z - 1],
              "           IMAGE IS  ", imageName[z - 1])
        z -= 1

    count = 0
    for i in labels:
        if (i == "threat"):
            count = count + 1
    print("\n")
    if (count >= 1):
        print("THREATENING IMAGE PRESENT")
        c = 1
    if (count == 0):
        print("Threatening Image Not Present")
        c = 0
    return c
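
# Hypothetical call sketch: the app folder name is invented and would need to
# exist under F:\Apktool for the path logic above to work.
if checkImage("sample_app") == 1:
    print("sample_app flagged: threatening image present")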
import sys

import couchdb
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

URL = 'localhost'
db_name = 'danilo_db'
server = couchdb.Server('http://' + URL + ':5984/')

try:
    print(db_name)
    db = server[db_name]
    print('connection successful')
except Exception:
    sys.stderr.write("Error: database not found. Terminating\n")
    sys.exit()

view = "vistaQuito/vistaQuito"

with open('train.json', 'r') as fp:
    cl = NaiveBayesClassifier(fp, format="json")

#sys.stdout=open("Train1.json","w")
if len(db.view(view)) > 0:
    for data in db.view(view):
        json_data = {}
        json_data1 = {}
        json_data = db.get(data['id'])
        textoIngesado = data['value']
        textoIngesado1 = TextBlob(
            expresionRegularFiltrar.sub('', textoIngesado))
        polarity_value = textoIngesado1.sentiment.polarity * 100.0
        if polarity_value == 0:
            polarity = 'neu'
        elif polarity_value < 0:
            polarity = 'no'
try:
    f1 = open(model1_name, 'rb')
    model1 = pickle.load(f1)
    f1.close()

except IOError:

    print("model1 does not exist, so we are generating a new one.")

    from textblob import TextBlob
    from textblob.sentiments import NaiveBayesAnalyzer
    from textblob.sentiments import PatternAnalyzer
    from textblob.classifiers import NaiveBayesClassifier


    with open('train_part1.json', 'r') as train_file:
        model1 = NaiveBayesClassifier(train_file, format="json")

    #############################################################
    # SAVING THE MODEL CALIBRATED
    #############################################################
    import pickle
    f = open(model1_name, 'wb')
    pickle.dump(model1, f)
    f.close()

finally:
    print("model1 has just being loaded, and ready to be used.")
    print("#################################################")
    print("##################  model1  ######################")

with open('test.json', 'r') as test_file:
# -*- coding: utf-8 -*-
"""
Created on Tue May  1 23:24:59 2018

@author: Rahul Vedanta
"""

#First we’ll read some training data.
import csv
trainData = []

#opening and reading the labelled data csv file
with open('C:\\Users\\Ritesh\\Desktop\\txt\\output_1.csv') as trainingFile:
    reader = csv.reader(trainingFile, delimiter=',')
    for row in reader:
        if len(row) != 0:
            trainData.append(tuple(row))

#Now we’ll create a Naive Bayes classifier, passing the training data into the constructor.
from textblob.classifiers import NaiveBayesClassifier
classifier = NaiveBayesClassifier(trainData)

#write classifier to file
import pickle
f = open('C:\\Users\\Ritesh\\Desktop\\txt\\myClassifier_1.pickle', 'wb')
pickle.dump(classifier, f)
f.close()
Ejemplo n.º 55
0
    j_file = open(i_file, "r")
    j_contents = j_file.read()
    j_file.close()

    start_offset = j_contents.find('<section id="postingbody">')
    end_offset = j_contents.find('</section>', start_offset)
    post_body = j_contents[start_offset:end_offset]
    post_body = post_body.replace('<section id="postingbody">', ' ')
    try:
        post_body = post_body.decode('utf-8')
    except UnicodeDecodeError:
        continue

    test.append([post_body, i_file])

Bayes = NaiveBayesClassifier(train)

print(os.getcwd())

for body in test:

    judge = Bayes.classify(body[0])
    if judge == "positive":
        call(['mv', "./" + body[1], "john/"])
    if judge == "negative":
        call(['mv', "./" + body[1], "non_john/"])

os.mkdir("hard_to_classify")
Ejemplo n.º 56
0
# Please install textblob module

from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

train = [('I love this sandwich.', 'pos'),
         ('This is an amazing place!', 'pos'),
         ('I feel very good about these beers.', 'pos'),
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'), ("My boss is horrible.", "neg")]

cl = NaiveBayesClassifier(train)

print(cl.classify("I feel amazing!"))
blob = TextBlob("The beer is good. But the hangover is horrible.",
                classifier=cl)

for s in blob.sentences:
    print(s)
    print(s.classify())
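
# Beyond a bare label, the classifier can also report its confidence.
# A short sketch using prob_classify on the classifier trained above:
prob_dist = cl.prob_classify("I feel amazing!")
print(prob_dist.max())                  # most probable label
print(round(prob_dist.prob("pos"), 2))  # probability of 'pos'
print(round(prob_dist.prob("neg"), 2))  # probability of 'neg'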
Ejemplo n.º 57
0
import csv

from textblob.classifiers import NaiveBayesClassifier
#from textblob import TextBlob
#from sklearn.naive_bayes import GaussianNB

train = []
test = []  #Array Definition
path1 = r'D:\1\training_data.csv'  #Address Definition

with open(path1, 'r', encoding="utf8") as f1:  #Open File as read by 'r'
    reader = [tuple(line[col] for col in (1, 2)) for line in csv.reader(f1)]

print(reader)
NB = NaiveBayesClassifier(reader)
print('it has finished..')
commonproductname = []
Probability = []

path11 = r'D:\test_data_1.csv'
with open(path11, 'r', encoding="utf8") as f11:  #Open File as read by 'r'
    reader11 = csv.reader(f11)
    next(reader11, None)  #Skip header because file header is not needed
    for row11 in reader11:  #fill array by file info by for loop
        test.append(row11[0])

for i in range(0, len(test)):
    prob_dist = NB.prob_classify(test[i])
    commonproductname.append(prob_dist.max())
    Probability.append(prob_dist.prob(prob_dist.max()))
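
# The snippet cuts off before the predictions are saved; a hedged completion
# that writes them to CSV. The output path is an assumption.
with open(r'D:\predictions.csv', 'w', newline='', encoding='utf8') as out:
    writer = csv.writer(out)
    writer.writerow(['text', 'predicted_label', 'probability'])
    for row in zip(test, commonproductname, Probability):
        writer.writerow(row)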
    train_size = 0.9
    train_index = int(len(textList) * train_size)
    #append base label
    for text in textList[:train_index]:
        train.append((text, "pos"))
    for text in textList[train_index:]:
        test.append((text, "pos"))

    #append other label
    for other_key in result.keys():
        if other_key != key:
            textList = result[other_key]
            random.shuffle(textList)
            train_size = 0.9
            train_index = int(len(textList) * train_size)
            #append other label
            for text in textList[:train_index]:
                train.append((text, "neg"))
            for text in textList[train_index:]:
                test.append((text, "neg"))

    cl = NaiveBayesClassifier(train)
    accuracy = cl.accuracy(test)
    print("class :{} train:{} test:{} acc:{}".format(key, len(train),
                                                     len(test), accuracy))
    cl_list.append(cl)

#save model
with open('tweet-categorize-multiclass-array.obj', 'wb') as model_file:
    pickle.dump(cl_list, model_file)
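
# A hedged sketch of using the saved one-vs-rest classifier list at prediction
# time; the class ordering is assumed to match the training loop above.
with open('tweet-categorize-multiclass-array.obj', 'rb') as model_file:
    classifiers = pickle.load(model_file)

tweet = "some new tweet text"
for one_vs_rest in classifiers:
    dist = one_vs_rest.prob_classify(tweet)
    print(dist.prob("pos"))  # higher = more likely to belong to that class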
Ejemplo n.º 59
0
 def __init__(self):
   print("initialized CerberusRent")
   with open(JSON_FILE) as data_file:
     self.classifier = NaiveBayesClassifier(data_file, format="json")
import random

from nltk.corpus import movie_reviews
from textblob.classifiers import NaiveBayesClassifier

random.seed(1)

train = [('I love this sandwich.', 'pos'),
         ('This is an amazing place!', 'pos'),
         ('I feel very good about these beers.', 'pos'),
         ('This is my best work.', 'pos'), ("What an awesome view", 'pos'),
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'), ('He is my sworn enemy!', 'neg'),
         ('My boss is horrible.', 'neg')]
test = [('The beer was good.', 'pos'), ('I do not enjoy my job', 'neg'),
        ("I ain't feeling dandy today.", 'neg'), ("I feel amazing!", 'pos'),
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg')]

cl = NaiveBayesClassifier(train)

# Grab some movie review data
reviews = [(list(movie_reviews.words(fileid)), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]
random.shuffle(reviews)
new_train, new_test = reviews[0:100], reviews[100:200]

# Update the classifier with the new training data
cl.update(new_train)
# Compute accuracy on the combined test data
print("Accuracy: {0}".format(cl.accuracy(test + new_test)))

print(cl.classify("Their burgers are amazing"))
print(cl.classify("Their burgers are not amazing"))