def decide_actionable_tweet(doc_standard):
    from textblob.classifiers import NaiveBayesClassifier as NBC

    actionable_tweet = []
    training_corpus = [('narendra modi is good politician', 'not_actionable'),
                       ('how congress become good opposition', 'actionable'),
                       ('python is popular language', 'not_actionable'),
                       ('here is new version of python available see it', 'actionable'),
                       ('retweet why india is poor country', 'actionable'),
                       ('Pro Kabaddi starting on 1 august 2017', 'not_actionable'),
                       ('book ticket for goa at reasonable cost', 'actionable')]

    test_corpus = [('here is new version of motorola see it', 'actionable'),
                   ('hello friends how are you', 'not_actionable')]

    model = NBC(training_corpus)
    print("model", model)

    try:
        for doc in doc_standard:  # for testing, use another list instead of doc_standard
            result = model.classify(doc)
            if result == 'actionable':  # '==', not 'is': string identity checks are unreliable
                actionable_tweet.append(doc)
    except Exception:
        print("error in classify")

    print("actionable_tweet", actionable_tweet)
    return actionable_tweet
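
A minimal call sketch, assuming doc_standard is a list of tweet strings (the tweets below are made up):

sample_tweets = ["see the new python release notes",
                 "what a sunny day"]  # illustrative tweets
print(decide_actionable_tweet(sample_tweets))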
    def investigate(self):

        data = self.scrape(self.base_urls)
        train_crud = CRUD("sqlite:///database.db", Ads, "ads")
        # dummy data generated with http://www.dummytextgenerator.com/#jump
        dummy_crud = CRUD("sqlite:///database.db", TrainData, "training_data")
        train = train_crud.get_all()
        dummy = dummy_crud.get_all()
        t_docs = [elem.text for elem in train]  # all documents with trafficking
        train = [(elem.text, "trafficking") for elem in train] + \
                [(elem.text, "not trafficking") for elem in dummy]
        cls = []
        # make use of tf-idf here
        # add in this example: http://scikit-learn.org/0.11/auto_examples/document_classification_20newsgroups.html
        cls.append(NBC(train))
        cls.append(DTC(train))
        for datum in data:
            # save once if any classifier flags the ad
            if any(cl.classify(datum["text_body"]) == "trafficking" for cl in cls):
                self.save_ads([datum])
            # so I don't have to eyeball things
            if self.doc_comparison(datum["text_body"], t_docs) == "trafficking":
                self.save_ads([datum])
        time.sleep(700)  # wait ~ 12 minutes
        self.investigate()  # this is an infinite loop, which I am okay with (though recursion will eventually hit Python's limit)
from textblob.classifiers import NaiveBayesClassifier as NBC


def train_classifier(training_corpus):
    """Build a Naive Bayes classifier model.

    :param training_corpus: List of training tuples containing phrase and class.
    :returns: Naive Bayes classifier model.
    """
    print("Training model...")
    return NBC(training_corpus)
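
A quick usage sketch with a toy corpus (the phrases and labels below are illustrative):

toy_corpus = [("ship the order today", "urgent"),
              ("just saying hi", "not_urgent")]  # made-up examples
clf = train_classifier(toy_corpus)
print(clf.classify("please ship my package"))  # returns one of the labels above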
Example #4
def nlp(corpus):
    from textblob.classifiers import NaiveBayesClassifier as NBC

    training_corpus = [
        ('What is the status of covid in Kerala?', 'Class_B'),
        ('Who is the latest celebrity to be tested positive?', 'Class_B'),
        ('When will vaccines be found?', 'Class_B'),
        ('When will colleges open in Karnataka', 'Class_B'),
        ('What is the active case count in Goa?', 'Class_A'),
        ('Total deaths in Bangalore', 'Class_A'),
        ('number of tested positive cases in the world', 'Class_A')
    ]
    test_corpus = [("Is joe tested positive?", 'Class_B'),
                   ("increasing cases in Kerala", 'Class_A'),
                   ("Chennai's active cases", 'Class_A'),
                   ("when will the vaccine be found", 'Class_B')]

    model = NBC(training_corpus)
    op = model.classify(corpus)
    # print(model.accuracy(test_corpus))
    return op
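
A quick call sketch (the query below is made up):

print(nlp("how many active cases in Mumbai"))  # returns 'Class_A' or 'Class_B'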
Example #5
from sklearn.metrics import f1_score
from textblob.classifiers import NaiveBayesClassifier as NBC


def naive_bayes_classify(data):
    class_to_predict = 'type'  # product importance
    all_data = [
        tuple(x)
        for x in data[['text', class_to_predict]].to_records(index=False)
    ]

    # count how many documents each word appears in
    text_counts = {}
    for item in all_data:
        for word in set(item[0].split()):
            if word in text_counts:
                text_counts[word] += 1
            else:
                text_counts[word] = 1

    # keep only words that occur in at least 5 documents
    for i in range(len(all_data)):
        new_text = ''
        for word in all_data[i][0].split():
            if text_counts[word] >= 5:
                new_text += ' ' + word
        all_data[i] = (new_text, all_data[i][1])

    print('Finished preprocessing!')

    test_corpus = all_data[3000:3600]
    training_corpus = all_data[:3000]

    model = NBC(training_corpus)
    print('Done training!')
    print('Accuracy: ' + str(model.accuracy(test_corpus)))

    y_pred = []
    y_true = []
    for test_item in test_corpus:
        y_pred.append(model.prob_classify(test_item[0]).max())
        y_true.append(test_item[1])

    print('F1 score: ' + str(f1_score(y_true, y_pred, average='weighted')))
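
For reference, prob_classify returns a probability distribution rather than a bare label; a short sketch of reading it (the input string is illustrative):

dist = model.prob_classify("some text")   # made-up input
print(dist.max(), dist.prob(dist.max()))  # most likely label and its probability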
Example #6
from nltk.stem import WordNetLemmatizer
from textblob.classifiers import NaiveBayesClassifier as NBC

lemmatizer = WordNetLemmatizer()


def _remove_noise(input_text):
    # minimal stand-in for the truncated helper: assumed simple stop-word removal
    noise_words = {'is', 'a', 'this', 'the', 'of'}
    noise_free_words = [w for w in input_text.split() if w not in noise_words]
    noise_free_text = ' '.join(noise_free_words)
    return noise_free_text


user_text = ""

noisy_text = _remove_noise(user_text.lower())

# note: lemmatize() expects a single word, so a multi-word string passes through largely unchanged
noisy_lemma = lemmatizer.lemmatize(noisy_text, "v")

training_corpus = [('I am exhausted of this work.', 'Class_B'),
                   ("I can't cooperate with this", 'Class_B'),
                   ('He is my worst enemy!', 'Class_B'),
                   ('My management is poor.', 'Class_B'),
                   ('I love this burger.', 'Class_A'),
                   ('This is a brilliant place!', 'Class_A'),
                   ('I feel very good about these dates.', 'Class_A'),
                   ('This is my best work.', 'Class_A'),
                   ("What an awesome view", 'Class_A'),
                   ('I do not like this dish', 'Class_B')]
test_corpus = [("I am not feeling well today.", 'Class_B'),
               ("I feel brilliant!", 'Class_A'),
               ('Gary is a friend of mine.', 'Class_A'),
               ("I can't believe I'm doing this.", 'Class_B'),
               ('The date was good.', 'Class_A'),
               ('I do not enjoy my job', 'Class_B')]

model = NBC(training_corpus)

print(model.classify(input()))

print(noisy_lemma)
Example #7
from textblob.classifiers import NaiveBayesClassifier as NBC


def naive_bayes(train_data):
    """Train a classifier; cl.classify("some new text") then returns a label."""
    return NBC(train_data)
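
A short usage sketch with a toy training set (the examples and labels are illustrative):

cl = naive_bayes([("free money now", "spam"),
                  ("meeting at noon", "ham")])  # made-up training pairs
print(cl.classify("free lunch meeting"))  # a label is returned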
Example #8
    def scrape(self, links=None, auto_learn=False, long_running=False, translator=False):
        responses = []
        data = []

        if not links:
            for base_url in self.base_urls:
                r = requests.get(base_url)
                text = unidecode(r.text)
                html = lxml.html.fromstring(text)

                links = html.xpath("//div[@class='cat']/a/@href")
                for link in links:
                    if len(self.base_urls) > 1 or len(self.base_urls[0]) > 3:
                        time.sleep(random.randint(1,2))
                        if long_running:
                            time.sleep(random.randint(5,27))
                    try:
                        responses.append(requests.get(link))
                        print(link)
                    except requests.exceptions.ConnectionError:
                        print("hitting connection error")
                        continue
        else:
            for link in links:
                if len(self.base_urls) > 1 or len(self.base_urls[0]) > 3:
                    time.sleep(random.randint(1,2))
                    if long_running:
                        time.sleep(random.randint(5,17))
                try:
                    responses.append(requests.get(link))
                    print(link)
                except requests.exceptions.ConnectionError:
                    print("hitting connection error")
                    continue

        for r in responses:
            values = {}  # fresh dict per response; reusing one dict would make every entry in data alias the same object
            text = r.text
            html = lxml.html.fromstring(text)
            values["title"] = html.xpath("//div[@id='postingTitle']/a/h1")[0].text_content()
            values["link"] = unidecode(r.url)
            values["new_keywords"] = []
            try:
                values["images"] = html.xpath("//img/@src")
            except IndexError:
                values["images"] = "weird index error"
            pre_decode_text = html.xpath("//div[@class='postingBody']")[0].text_content().replace("\n","").replace("\r","")  
            values["text_body"] = pre_decode_text 
            try:
                values["posted_at"] = html.xpath("//div[class='adInfo']")[0].text_content().replace("\n"," ").replace("\r","")
            except IndexError:
                values["posted_at"] = "not given"
            values["scraped_at"] = str(datetime.datetime.now())
            body_blob = TextBlob(values["text_body"])
            title_blob = TextBlob(values["title"])
            values["language"] = body_blob.detect_language() #requires the internet - makes use of google translate api
            values["polarity"] = body_blob.polarity
            values["subjectivity"] = body_blob.sentiment[1]
            translated = translator or values["language"] == "es"
            if translated:
                values["translated_body"] = body_blob.translate(from_lang="es")
                values["translated_title"] = title_blob.translate(from_lang="es")
            else:
                values["translated_body"] = "none"
                values["translated_title"] = "none"
            text_body = values["text_body"]
            title = values["title"]

            if translated:
                text_body = values["translated_body"]
                title = values["translated_title"]

            if auto_learn:
                train = pickle.load(open("train.p","rb"))
                cls = []
                cls.append(NBC(train))
                cls.append(DTC(train))
                #increase this number
                trk_count = 0
                for cl in cls:
                    if cl.classify(text_body) == "trafficking":
                        trk_count += 1

                if float(trk_count)/len(cls) > 0.5:
                    train = pickle.load(open("train.p","rb"))
                    train.append((values["text_body"],"trafficking") )
                    pickle.dump(train,open("train.p","wb"))
                    values["trafficking"] = "found"
                else:
                    values["trafficking"] = "not_found"
                #To do set up postmark here.
                #Documentation: https://devcenter.heroku.com/articles/postmark
                #even more docs: https://postmarkapp.com/servers/645009/get_started
                
            else:
                values["trafficking"] = "not_found"
                           
            values["child_urls"] = []
            for keyword in self.child_keywords:
                if keyword in text_body:
                    values["child_urls"].append(values["link"])
                elif keyword in title:
                    values["child_urls"].append(values["link"])

            values["trafficking_urls"] = []
            for keyword in self.trafficking_keywords:
                if keyword in text_body:
                    values["trafficking_urls"].append(values["link"])
                elif keyword in title:
                    values["trafficking_urls"].append(values["link"])

            values["new_keywords"].append(self.pull_keywords(text_body))
            values["new_keywords"].append(self.pull_keywords(title))
            values = self.phone_number_parse(values)
            numbers = pickle.load(open("numbers.p","rb"))
            values["network"] = []
            for network in numbers.keys():
                if values["phone_number"] in numbers[network]:
                    values["network"].append(network)
            data.append(values)
        self.save_ads(data)
        return data
Example #9
import json

import spacy
from flask import Flask, render_template
from textblob.classifiers import NaiveBayesClassifier as NBC

nlp = spacy.load('en_core_web_sm')
#project_root = os.path.dirname(os.path.realpath(os.path.join(__file__, '..', '..')))
misswords = [
    "Sorry I didn't get you", "I apologize I couldn't decipher you",
    "Sorry I am dumb", "Can you repeat it please?", "Pardon?",
    "I am not a human so please cooperate sir", "I didn't get you"
]
with open('C:\\Users\\Mayukh\\Desktop\\ChatBOT\\ChatTemplate\\file.json',
          'r') as json2:
    data = json.load(json2)
training = []
for i in data:
    for sentence in data[i]['train']:
        training.append((sentence, i))
model = NBC(training)
app = Flask(__name__, static_url_path='', static_folder='static')


@app.route('/')
def chat():
    return render_template('chat.html')


@app.route('/chat')
def index():
    return render_template('index.html')


@app.route('/about')
def about():
    return render_template('about.html')  # assumed template name, matching the routes above
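
A typical local entry point for the app, as a sketch (not shown in the original snippet):

if __name__ == '__main__':
    app.run(debug=True)  # standard Flask development server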
Example #10
#### Naive Bayes ####
#####################
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from textblob.classifiers import NaiveBayesClassifier as NBC

# trainLabels and trainData are assumed to be defined earlier as parallel lists
df = pd.DataFrame({"labels": trainLabels, "trainData": trainData})

train, test = train_test_split(df, test_size = 0.25)

# materialize the pairs: in Python 3, zip() returns a one-shot iterator
training = list(zip(train["trainData"].tolist(), train["labels"].tolist()))
testing = list(zip(test["trainData"].tolist(), test["labels"].tolist()))

## training model
%time model = NBC(training)

%time print(model.accuracy(training))
## getting accuracy of 90%

## Shows important features for detecting intent; prints directly, so no print() needed
model.show_informative_features()
#Most Informative Features
#        contains(please) = True              Yes : No     =     10.0 : 1.0
#            contains(ve) = True               No : Yes    =      9.9 : 1.0
#        contains(verifi) = True              Yes : No     =      9.3 : 1.0
#          contains(sale) = True               No : Yes    =      9.3 : 1.0
#        contains(moment) = True              Yes : No     =      8.6 : 1.0
#       contains(compani) = True               No : Yes    =      7.3 : 1.0
#      contains(deliveri) = True               No : Yes    =      6.9 : 1.0
#    contains(unsubscrib) = True               No : Yes    =      6.9 : 1.0
Example #11
from textblob.classifiers import NaiveBayesClassifier as NBC

with open('train_ticketdata.csv', 'r') as training_corpus:
    model = NBC(training_corpus, format='csv')

with open('test_ticketdata.csv', 'r') as test_corpus:
    accuracy = model.accuracy(test_corpus)

def apply_nlp(data):
    print("start nlp")
    print(data)
    print("end nlp")
    return model.classify(data)


def get_accuracy():
    return accuracy
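
A quick call sketch (the ticket text below is made up):

print(apply_nlp("cannot log into my account"))  # illustrative ticket text
print(get_accuracy())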
Example #12
from textblob.classifiers import NaiveBayesClassifier as NBC
import glob

files = glob.glob("/home/aravind/AI-Nielit/NLP/Day3_Nov14/A4/Data/*/*")
trainData = []
for path in files:
    cls = path.split('/')[-2]  # class name is the parent directory
    with open(path, "r") as f:
        text = f.read().strip()
    trainData.append((text, cls))
print("Data Importing Completed")
classifier = NBC(trainData)
print("Training Complete")
with open('Test', "r") as f:
    f1 = f.read()
print("Test Data\n-------------------------------------")
print(f1)
print("-------------------------------------")
c = classifier.classify(f1)  # classify() takes a string, not a list
print(c)
Example #13
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/7/3 4:00 PM
# @Author  : Kaiyu
# @Site    :
# @File    : test.py

from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob
import json

if __name__ == '__main__':
    with open('okoo-merged-labels.json', encoding='utf-8') as f:
        data = json.load(f)['all']
        data = [(item['text'], str(item['merged_label'])) for item in data]
        train_data = data[:-1000]
        test_data = data[-1000:]  # hold out the last 1000 items
        model = NBC(train_data)
        for test_item in test_data:
            label_ = model.classify(test_item[0])
            print('True: {} predict: {}'.format(str(test_item[1]), label_))
        print(model.accuracy(test_data))
Example #14
import pickle

from textblob.classifiers import NaiveBayesClassifier as NBC

train = pickle.load(open("train.p", "rb"))

cl = NBC(train)
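
Once the pickled training set is loaded and the classifier is trained, it can be queried directly (the text below is made up):

print(cl.classify("some ad text"))  # illustrative input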
Example #15
from textblob.classifiers import NaiveBayesClassifier as NBC

with open('train.json', 'r') as fp:
    cl = NBC(fp, format="json")

with open('test.json', 'r') as fp:
    print(str(cl.accuracy(fp, format="json") * 100) + ' %')

with open('file.txt', 'r') as fp:
    for i in fp.readlines():
        print(i[:-4] + ': ' + cl.classify(i))
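
For reference, textblob's JSON loader expects a list of objects with "text" and "label" keys; a minimal sketch of writing a compatible train.json (contents are illustrative):

import json

sample = [{"text": "I love this sandwich.", "label": "pos"},
          {"text": "I do not like this restaurant.", "label": "neg"}]
with open('train.json', 'w') as fp:
    json.dump(sample, fp)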