Code Example #1
class Classifier:

  def __init__(self):
    with open("./data/train.csv") as fp:
      self.cl = NaiveBayesClassifier(fp, format="csv")

  def test(self):
    return self.cl.classify("This is a test sentence")

  def classify(self, text):
    return self.cl.classify(text)

  def n_classify(self, text):
    dist = self.cl.prob_classify(text)

    probs = {"sentiments": []}
    for s in dist.samples():
      if dist.prob(s) >= .10:
        probs["sentiments"].append({s: dist.prob(s)})

    return json.dumps(probs)

  def accuracy(self):
    with open('./data/train.csv') as fp:
      train_accuracy = self.cl.accuracy(fp, format="csv")
    with open('./data/test.csv') as fp:
      test_accuracy = self.cl.accuracy(fp, format="csv")
    return json.dumps({"train_accuracy": train_accuracy, "test_accuracy": test_accuracy})

  def labels(self):
    return json.dumps({"labels": self.cl.labels()})
Code Example #2
File: server.py Project: sanchitghai24/ChatBot
def classifier(something):
    speech = something

    train = []
    test = []

    with open("training.csv") as csvfile:
        reader = csv.reader(csvfile)  # change contents to floats
        for row in reader:  # each row is a list
            train.append(row)

        with open("test.csv") as csvfile:
            reader = csv.reader(csvfile)  # change contents to floats
            for row in reader:  # each row is a list
                test.append(row)

    cl = NaiveBayesClassifier(train)
    cl.classify("This is an amazing library!")
    prob_dist = cl.prob_classify("This one's a doozy.")
    prob_dist.max()
    round(prob_dist.prob("machine"), 2)
    round(prob_dist.prob("no machine"), 2)
    blob = TextBlob(speech, classifier=cl)
    blob.classify()
    for s in blob.sentences:
        print("\n\n\n" + str(s))
        print("\n" + str(s.classify()))
        return (s.classify())  # note: returns after the first sentence only
Code Example #3
class ExpenseClassifier:

    def __init__(self):
        training_data = self._load_data("data")
        self.category_classifier  = NaiveBayesClassifier([(x[0], x[1]) for x in  training_data])
        self.avoidability_classifier = NaiveBayesClassifier([(x[0], x[2]) for x in  training_data])
        self.ordinary_classifier =  NaiveBayesClassifier([(x[0], x[3]) for x in  training_data])

    def classify(self, description):
        res = {}
        res['category'] = self.category_classifier.classify(description)
        res['avoidable'] = self.avoidability_classifier.classify(description)
        res['ordinary'] = self.ordinary_classifier.classify(description)
        return res

    def accuracy(self):
        test_data = self._load_data("test")
        res = {}
        res['category'] = self.category_classifier.accuracy([(x[0], x[1]) for x in test_data])
        res['avoidable'] = self.avoidability_classifier.accuracy([(x[0], x[2]) for x in test_data])
        res['ordinary'] = self.ordinary_classifier.accuracy([(x[0], x[3]) for x in test_data])
        return res

    def _load_data(self, folder):
        data = []
        for f in glob.glob(folder + "/*.csv"):
            with open(f) as csvfile:
                spamreader = csv.reader(csvfile, delimiter=',')
                for row in spamreader:
                    if row[DESCRIPTION] and row[CATEGORY] and row[AVOIDABLE] and row[ORDINARY]:
                        data.append((norm(row[DESCRIPTION]), row[CATEGORY], row[AVOIDABLE], row[ORDINARY]))
        return data
Code Example #4
def main():
    json_path = raw_input("Where is the json training set?")  # renamed from 'json' to avoid shadowing the module
    print "Program start", time.ctime() #debug
    with open(json_path, 'r') as fp:
        classifier = NaiveBayesClassifier(fp, format='json')
        print "Classifier done!", time.ctime() #debug
    test = raw_input("Where is the test eml_folder?")
    print "Testing...", time.ctime()
    for emails in dir_list(test):
        print classifier.classify(emails)
    print "Testing done", time.ctime()
Code Example #5
File: time_adapter.py Project: Endika/ChatterBot
class TimeLogicAdapter(LogicAdapter):

    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)

        training_data = [
            ("what time is it", 1),
            ("do you know the time", 1),
            ("do you know what time it is", 1),
            ("what is the time", 1),
            ("do you know the time", 0),
            ("it is time to go to sleep", 0),
            ("what is your favorite color", 0),
            ("i had a great time", 0),
            ("what is", 0)
        ]

        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        now = datetime.now()

        confidence = self.classifier.classify(statement.text.lower())
        response = Statement("The current time is " + now.strftime("%I:%M %p"))

        return confidence, response
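Note that classifier.classify() returns the bare training label, so the "confidence" here is always exactly 0 or 1, and the training data even lists "do you know the time" under both labels. A graded confidence could be derived from prob_classify(); a minimal sketch, not the ChatterBot implementation:

def time_confidence(classifier, text):
    # Probability distribution over the labels 0 and 1.
    dist = classifier.prob_classify(text.lower())
    return dist.prob(1)  # P(the input is a time question)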
Code Example #6
File: chatbot.py Project: sachushaji/chatbot
def reply_engine(sentence,train):
    cl = NaiveBayesClassifier(train)
    k = str(cl.classify((sentence)))
    if k == 'pos':
        return random.choice(POSITIVE_RESPONSE)
    elif k == 'neg':
        return random.choice(NEGATIVE_RESPONSE)
Code Example #7
def detecting_fake_news(var):

    train = [
    ('15 september is the day when we go back to school.', 'true'),
    ("Corona isn't deadly.", 'false'),
    ('Tunisian next elections is in 2024 .', 'true'),
    ('Says the Annies List political group supports third-trimester abortions on demand.', 'false'),
    ('Donald Trump is against marriage equality. He wants to go back.', 'true'),
    ('Says nearly half of Oregons children are poor.', 'true'),
    ('State revenue projections have missed the mark month after month.', 'true'),
    ("In the month of January, Canada created more new jobs than we did.", 'true'),
    ('If people work and make more money, they lose more in benefits than they would earn in salary.', 'false'),
    ('Originally, Democrats promised that if you liked your health care plan, you could keep it. One year later we know that you need a waiver to keep your plan.', 'false'),
    ("We spend more money on antacids than we do on politics.", 'false'),
    ('Barack Obama and Joe Biden oppose new drilling at home and oppose nuclear power.', 'false'),
    ('President Obama once said he wants everybody in America to go to college.', 'false')
    ]
    test = [
    ('Because of the steps we took, there are about 2 million Americans working right now who would otherwise be unemployed.', 'true'),
    ('Scientists project that the Arctic will be ice-free in the summer of 2018', 'false'),
    ("You cannot build a little guy up by tearing a big guy down -- Abraham Lincoln said it.", 'false'),
    ("One man opposed a flawed strategy in Iraq. One man had the courage to call for change. One man didn't play politics with the truth.", 'true'),
    ('When I was governor, not only did test scores improve we also narrowed the achievement gap.', 'true'),
    ("Ukraine was a nuclear-armed state. They gave away their nuclear arms with the understanding that we would protect them.", 'false')
    ]   

    cl = NaiveBayesClassifier(train)
    result = cl.classify(var) 

    return result
Code Example #8
File: time_adapter.py Project: zzl0/ChatterBot
class TimeLogicAdapter(LogicAdapter):
    """
    The TimeLogicAdapter returns the current time.
    """

    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)

        training_data = [
            ('what time is it', 1),
            ('do you know the time', 1),
            ('do you know what time it is', 1),
            ('what is the time', 1),
            ('it is time to go to sleep', 0),
            ('what is your favorite color', 0),
            ('i had a great time', 0),
            ('what is', 0)
        ]

        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        now = datetime.now()

        confidence = self.classifier.classify(statement.text.lower())
        response = Statement('The current time is ' + now.strftime('%I:%M %p'))

        return confidence, response
Code Example #9
def train_data(ticker):

    df = pd.read_csv('../tmp/training_data/' + ticker + '2015-2016_data1.csv')
    train_df = df[['snippet', 'price change']]
    print "Training News Dataset"
    print train_df.head(5)
    cl = NaiveBayesClassifier(train_df.as_matrix(columns=None))

    df = pd.read_csv('../tmp/training_data/' + ticker + '2016-2017_data1.csv')
    dataset = df[['snippet', 'price change']]

    classified = []
    right = 0
    #print dataset.head(n=5)
    print "\nClassifying dataset\n"
    for index, row in dataset.iterrows():
        classified.append(cl.classify(row[0]))
        right += 1 if row[1] == classified[index] else 0

    dataset['News Sent.'] = classified
    path = '../tmp/results/News/' + ticker + '_results.csv'
    dataset.to_csv(path, encoding='utf-8', index=False)
    #dataset['Price Sent.'] = real_sent
    print dataset[['snippet', 'price change', 'News Sent.']].head(n=20)
    total = len(dataset['snippet'])
    print "\nCalculating "
    print "\nRight %d, Total %d" % (right, total)
    print "Correct percentage %.2f %%" % ((1.0 * right / total) * 100)
    #print cl.classify(dataset.as_matrix(columns=None))
    cl.show_informative_features(10)  # prints directly; returns None
Code Example #10
def qa1():
    trainData = [
        ('Augmentation mentoplasty using Mersilene mesh.  Many different materials are available for augmentation mentoplasty.  However, the optimal implant material for chin implantation has yet to be found.  During the past several years, a number of experienced surgeons have turned to the use of Mersilene mesh.  Mersilene mesh is a non-absorbable Dacron polyester fiber that can be conformed easily into layers to achieve tailored dimensions and shape.  At the McCollough Plastic Surgery Clinic PA, Birmingham, Ala, 277 patients over a 10-year period underwent chin augmentation with Mersilene mesh implants.  The material provides excellent tensile strength, durability, and surgical adaptability.  The overall complication rate was 3.2% (nine patients); infection rate, 2.5% (seven patients); and removal secondary to infection, 1.7% (five patients).  Based on this 10-year experience, Mersilene mesh remains our material of choice for chin augmentation.',
         'C01'),
        ('Multiple intracranial mucoceles associated with phaeohyphomycosis of the paranasal sinuses.  The purpose of this article is to alert clinicians to a new pathogenic fungus of the paranasal sinuses called Exserohilum rostratum.  Exserohilum species are one of the etiologic agents of phaeohyphomycosis, a constellation of entities caused by dematiaceous fungi.  This class of fungal sinus infection has emerged only in the past decade; it occurs primarily in immunocompetent individuals and produces a tenacious, progressive pansinusitis.  To our knowledge, this study describes the first case of multiple intracranial mucoceles secondary to E rostratum.  The diagnostic workup includes computed tomography and magnetic resonance imaging followed by direct microscopic examination of tissue biopsy specimens.  A craniotomy followed by a bilateral external ethmoidectomy was necessary for complete extirpation of the infected mucoceles.  Aggressive surgical management of this mycotic infection is described',
         'C01'),
        ('Laser photodynamic therapy for papilloma viral lesions.  Photodynamic therapy was tested for its therapeutic efficacy in eradicating rabbit papilloma warts. The wild-type viral warts suspension was used to induce treatable papilloma warts in the cutaneous tissue of Dutch Belted rabbits. The photosensitizing agents used intravenously were Photofrin II at 10 mg/kg of body weight and Chlorin e6 monoethylene diamine monohydrochloric acid (Chlorin e6 med HCl) at 1 mg/kg of body weight.  The lasers used were an argon-dye laser at 628 and 655 nm and a gold vapor laser at 628 nm.   The irradiances of 25 to 180 mW/cm2 were applied topically with an end-on lens optical fiber with total radiant doses of 7.5 to 54 J/cm2.  Photofrin II and the argon-dye laser at the highest light dosage (54 J/cm2) and Chlorin e6 monoethylene diamine monohydrochloride administered 2 hours before argon-dye laser irradiation at 655 nm at the highest light dosage (54 J/cm2) produced wart regression.  Total wart regression without recurrence was achieved with Photofrin II and the gold vapor laser at all light dosages.  The difference observed between the argon-dye laser and the gold vapor laser might be explained by the pulsed nature of the gold vapor laser, with its high-peak powers, some 5000 x the average measured light dose.  In this model, the smaller, less cornified lesions were more effectively treated with photodynamic therapy.',
         'C02'),
        ('Role of the monocyte-macrophage in influenza virus infection of lymphocytes: implications for HIV infection.  Knowledge of the pathogenesis of viruses which are less virulent than human immunodeficiency virus (HIV) may provide valuable insights into the pathogenesis of HIV infection.  Influenza virus, an enveloped RNA virus, infects monocyte-macrophages, although the infection is brief and abortive.  Isolated purified lymphocytes are completely resistant to infection.  In contrast, mixtures of lymphocytes and macrophages can synthesize all virus proteins.  Infection requires physical association of monocyte-macrophages and lymphocytes in "clusters." These studies with influenza virus suggest that the pathogenesis of virus infections in mixed cell cultures may be very different from that observed in purified cell populations, and they suggest that similar studies should be performed with HIV.',
         'C01'),
        ('Use of polymerase chain reaction for successful identification of asymptomatic genital infection with herpes simplex virus in pregnant women at delivery.  The polymerase chain reaction was adapted to the amplification of a herpes simplex virus (HSV) DNA sequence, common to HSV types 1 and 2 (HSV-1, HSV-2).  The amplified product was detectable by ethidium-bromide staining or Southern hybridization of gels and by dot hybridization.  The HSV polymerase chain reaction detected HSV DNA in samples obtained from eight patients with genital lesions from which HSV-2 was isolated in tissue culture and from four patients with labial lesions from which HSV-1 was isolated.  The HSV polymerase chain reaction identified HSV in clinical specimens obtained from 11 women who had asymptomatic genital HSV infections at delivery.  None of 11 samples obtained at delivery from women who had antibodies to HSV-2, but whose delivery cultures were negative, were positive by polymerase chain reaction and no false-positive reactions were obtained when the reaction mixture contained human cell DNA or varicella-zoster virus, cytomegalovirus, Epstein-Barr virus, or human papillomavirus DNA.',
         'C02')
    ]
    classifier = NaiveBayesClassifier(trainData)
    #str1 = "A school blood drive before a measles outbreak permitted correlation of preexposure measles antibody titers with clinical protection using the plaque reduction neutralization (PRN) test and an EIA."
    # print(classifier.classify(str1))
    testdata = [
        'A school blood drive before a measles outbreak permitted correlation of preexposure measles antibody titers with clinical protection using the plaque reduction neutralization (PRN) test and an EIA.',
        'Of 9 donors with detectable preexposure PRN titer less than or equal to 120, 8 met the clinical criteria for measles (7 seroconfirmed) compared with none of 71 with preexposure PRN titers greater than 120 (P less than .0001).',
        'Seven of 11 donors with preexposure PRN titers of 216-874 had a greater than or equal to 4-fold rise in antibody titer (mean, 43-fold) compared with none of 7 with a preexposure PRN titer greater than or equal to 1052 (P less than .02).',
        'Of 37 noncases with preexposure PRN titer less than 1052, 26 (70%) reported one or more symptoms compared with 11 (31%) of 35 donors with preexposure PRN titers greater than or equal to 1052 (P less than .002).',
        'By EIA, no case had detectable preexposure antibody; the preexposure geometric mean titer of asymptomatic donors (220) was not significantly higher than that of symptomatic donors who did not meet the clinical criteria for measles (153) (P = .10).',
        'The study suggests that PRN titers less than or equal to 120 were not protective against measles disease and illness without rash due to measles may occur in persons with PRN titers above this level.',
        'Use of polymerase chain reaction for successful identification of asymptomatic genital infection with herpes simplex virus in pregnant women at delivery.  The polymerase chain reaction was adapted to the amplification of a herpes simplex virus (HSV) DNA sequence, common to HSV types 1 and 2 (HSV-1, HSV-2).  The amplified product was detectable by ethidium-bromide staining or Southern hybridization of gels and by dot hybridization.  The HSV polymerase chain reaction detected HSV DNA in samples obtained from eight patients with genital lesions from which HSV-2 was isolated in tissue culture and from four patients with labial lesions from which HSV-1 was isolated.  The HSV polymerase chain reaction identified HSV in clinical specimens obtained from 11 women who had asymptomatic genital HSV infections at delivery.  None of 11 samples obtained at delivery from women who had antibodies to HSV-2, but whose delivery cultures were negative, were positive by polymerase chain reaction and no false-positive reactions were obtained when the reaction mixture contained human cell DNA or varicella-zoster virus, cytomegalovirus, Epstein-Barr virus, or human papillomavirus DNA.'
    ]
    for stmt in testdata:
        print(classifier.classify(stmt))
Code Example #11
def decide_actionable_tweet(doc_standard):
 actionable_tweet = []
 from textblob.classifiers import NaiveBayesClassifier as NBC
 from textblob import TextBlob
 training_corpus = [ ('naredra modi is good politician','not_actionable'),
                    ('how congress become good oppositor','actionable'),
                    ('python is popular language','not_actionable'),
                    ('here is new version of python available see it','actionable'),
                    ('retweet why india is poor country','actionable'),
                    ('Pro cubbadi startion on 1 august 2017 ','not_actionable'),
                    ('book ticket for goa at reasonable cost','actionable')]

 test_corpus = [('here is new version of motorola see it','actionable'),
               ('hellow friends how are you','not_actionable')]

 model = NBC(training_corpus)

 print("model",model)
 try:
  for doc in doc_standard:         # for testing use other list instead of doc_standard
    result = model.classify(doc)

    if result == 'actionable':  # '==' not 'is': compare value, not identity
        actionable_tweet.append(doc)
 except:
    print("error in classify")

 print("actionable_tweet", actionable_tweet)
 return actionable_tweet
Code Example #12
    def classify_text(self):

        cl = NaiveBayesClassifier(self.train)

        result = cl.classify("love sandwich!")

        print(result)
Code Example #13
class Model(object):
    """docstring for Model"""
    def __init__(self, name='Guess', config=None):
        self.name = name
        self.config = config if config is not None else {}  # avoid a shared mutable default
        self.clf = NaiveBayesClassifier([])

    def train(self, training_data):

        safe_training = []

        for example in training_data:
            safe_training.append((example.get('text'), example.get('label')))

        self.clf.update(safe_training)

    def evaluate(self, text):
        label = self.clf.classify(text)
        prob_dist = self.clf.prob_classify(text)
        label_prob = prob_dist.prob(label)
        return label, label_prob

    def get_classes(self):
        return self.clf.labels()

    def save(self):
        pass

    def load(self):
        pass
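A hypothetical usage of the Model wrapper above. Seeding NaiveBayesClassifier([]) works because textblob only trains when the classifier is first used, so update() inside train() supplies the real data:

# Hypothetical usage of the Model wrapper above.
model = Model(name='SpamGuess')
model.train([
    {'text': 'win a free prize now', 'label': 'spam'},
    {'text': 'lunch at noon?', 'label': 'ham'},
])
label, prob = model.evaluate('claim your free prize')
print(label, round(prob, 2))
print(model.get_classes())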
Code Example #14
File: guy.py Project: ppsdatta/al
    class AL_Guy:
        "Secret inner class"

        def __init__(self):
            self.classifier = None
            self.action_map = dict()
            self.reveries = None
            self.jokes = None
            self.training_data = None
            self.converted_training_data = []

        def train(self, training_data):
            self.training_data = training_data
            for k in self.training_data.keys():
                tag = k
                tag_data = self.training_data[tag]
                self.action_map[tag] = tag_data['action']
                for d in tag_data['training_data']:
                    self.converted_training_data.append((d, tag))
            self.classifier = NaiveBayesClassifier(
                self.converted_training_data
            )  #, feature_extractor=cmd_extractor)

        def respond(self, data):
            action_class = self.classifier.classify(data)
            #self.classifier.show_informative_features(5)
            if action_class in self.action_map:
                return self.action_map[action_class](data)

            return 'Don\'t look like anything to me.... (Let me call Bernard, head of programming WW)'
Code Example #15
File: views.py Project: thasniabbas55/stepone
def view_video(request, id=0):
    person = last_login.objects.all().last()
    id1 = User.objects.get(username=person.username)
    videos = Video.objects.get(id=id)
    context = {'video': videos}
    df = pd.read_csv("data/feedback.csv", header=0, encoding='unicode_escape')
    df = df.dropna()
    data = []
    for index, rows in df.iterrows():
        a = (rows['statement'], rows['overview'])
        data.append(a)
    cl = NaiveBayesClassifier(data)
    if request.method == 'POST':
        form = feedback_Form(request.POST)
        if form.is_valid():
            feedback = form.cleaned_data['feedback']
            pred1 = cl.classify(feedback)
            g = feedback_model(feedback=pred1)
            g.save()
            h = user_feedback(username=id1,
                              feedback=feedback_model.objects.all().last(),
                              Date=datetime.datetime.today(),
                              video=videos)
            h.save()
            context = {'video': videos, 'msg': 'successfully stored'}
            return render(request, 'video.html', context)
    return render(request, 'video.html', context)
Code Example #16
def bayes():
    train = [
        ('Extraordinaria reunion Al Gore y AMLO , me dio un gusto enorme conocer sus profundas coincidencias sobre Accion Climatica',
         'pos'), ('AMLO fue el unico que estuvo a la altura.', 'pos'),
        ('AMLO a sabido ganarse nuestra confianza y por eso ahora somos muchos los que lo apoyamos. ',
         'pos'),
        ('No hay otra mejor opcion en Mexico que AMLO. Los demas partidos han demostrado que trabajan para su bolsillo ',
         'pos'),
        ('Al final queda la imagen de un AMLO prudente, inteligente ', 'pos'),
        ('VOTO MASIVO POR  AMLO  aun que les arda ', 'pos'),
        ('Arriba papa AMLO ', 'pos'),
        ('prometio trabajar junto con algore para hacer frente al cambio climatico',
         'pos'),
        ('Hasta Mitofsky senala que AMLO continua creciendo, Anaya y Meade van a la baja',
         'pos'),
        ('Segun El Financiero, AMLO continua siendo el preferido', 'pos'),
        ('convence a mas mexicanos y despega 4 puntos arriba en las encuestas. Inalcanzable',
         'pos'),
        ('Lopitos,somos 70% de Mexicanos q no te creemos.Enganas a medio mundo.Tienes genes de dictador despistado.',
         'neg'), ('Las contradicciones de AMLO', 'neg'),
        ('Refuta Penia a AMLO; defiende reforma energetica', 'neg'),
        ('hablan mucho de Suecia y Noriega... pero quieren votar por el que le mira hacia Venezuela',
         'neg'), ('las propuestas de AMLO me dan pesadillas', 'pos'),
        ('El video viral que muestra la ignorancia de AMLO esta manipulado',
         'neg')
    ]
    cl = NaiveBayesClassifier(train)
    print(
        cl.classify("Las propuestas de AMLO parecieron razonables y viables"))

    return "OK", 200
Code Example #17
    def classify_r1(self):

        lists = first_level  #main categories
        cfy = NaiveBayesClassifier(lists)
        tweet = self.get_tweet()
        one = cfy.classify(tweet)
        return one
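first_level is not defined in this snippet; judging from classify_r2 in Code Example #28, it is presumably a list of (text, label) pairs over the top-level categories. A hypothetical shape:

# Hypothetical shape of the assumed first_level training list.
first_level = [
    ('learn python online', 'EdTech'),
    ('weekend trip to the beach', 'Leisure'),
    ('best cafes in town', 'Places'),
]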
Code Example #18
def get_analysis(s):

    train = [
        ('I love this sandwich.', 'pos'),
        ('This is an amazing place!', 'pos'),
        ('I feel very good about these beers.', 'pos'),
        ('This is my best work.', 'pos'),
        ("What an awesome view", 'pos'),
        ('I do not like this restaurant', 'neg'),
        ('I am tired of this stuff.', 'neg'),
        ("I can't deal with this", 'neg'),
        ('He is my sworn enemy!', 'neg'),
        ('My boss is horrible.', 'neg')
    ]


    cl = NaiveBayesClassifier(train)

    tweets = Tweet.objects.filter(search_term = s)

    result = []

    for t in tweets:
        d = {}
        c = cl.classify(t.tw_text)
        d['text'] = t.tw_text
        d['res'] = c
        result.append(d)

    return result
Code Example #20
File: HelloPeterSentiments.py Project: dlunga/serViz
def main():
    data = []
    train = []
    test = []
    with open('hellopeter_labelled.csv', 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        spamreader = list(spamreader)
        for row in spamreader:
            if (row[13] == 'strongly positive'):
                data.append((row[8], 'pos'))
            if (row[13] == 'positive'):
                data.append((row[8], 'pos'))
            if (row[13] == 'neutral'):
                data.append((row[8], 'neu'))
            if (row[13] == 'negative'):
                data.append((row[8], 'neg'))
            if (row[13] == 'strongly negative'):
                data.append((row[8], 'neg'))

    train = data[:1000]
    test = data[1001:]

    for innf in test:
        print innf

    cl = NaiveBayesClassifier(train)

    for tnew in test:
        print '%%%%%%%'
        print ' '
        print tnew[0]
        print tnew[1]
        print '%%%%%%%'
        print '#######'
        cl.classify(tnew[0])
        prob_class = cl.prob_classify(tnew[0])
        print '----max prob---'
        print prob_class.max()
        print '-----+ve-----'
        print prob_class.prob("pos")
        print '-----neutral-----'
        print prob_class.prob("neu")
        print '------ve-----'
        print prob_class.prob("neg")

    cl.accuracy(test)
Code Example #21
def qaTest():
    train = [('I love this sandwich.', 'pos'),
             ('this is an amazing place!', 'pos'),
             ('I feel very good about these beers.', 'pos'),
             ('this is my best work.', 'pos'), ("what an awesome view", 'pos'),
             ('I do not like this restaurant', 'neg'),
             ('I am tired of this stuff.', 'neg'),
             ("I can't deal with this", 'neg'),
             ('he is my sworn enemy!', 'neg'), ('my boss is horrible.', 'neg')]

    test = [('the beer was good.', 'pos'), ('I do not enjoy my job', 'neg'),
            ("I ain't feeling dandy today.", 'neg'),
            ("I feel amazing!", 'pos'), ('Gary is a friend of mine.', 'pos'),
            ("I can't believe I'm doing this.", 'neg')]
    cl = NaiveBayesClassifier(train)
    for stmt in test:
        print cl.classify(stmt[0])
Code Example #22
def main():
	dataset = []
	with open(DATA_FILE) as f:
		dataset = json.load(f)['data']

	dataset = dataset[:850]

	# dividing the dataset into two pos and neg parts
	pos_all = [{'sentence': item['sentence'], 'label': 'pos'} for item in dataset if item['label'] == 1]
	neg_all = [{'sentence': item['sentence'], 'label': 'neg'} for item in dataset if item['label'] == 0]

	# building the trainset from the entire dataset
	pos_train = pos_all[:math.floor(len(pos_all)/5) * 4]
	neg_train = neg_all[:math.floor(len(neg_all)/5) * 4]
	train_set =  pos_train + neg_train

	# preparing train_set to be fed to the classifier
	train_set = [(item['sentence'], item['label']) for item in train_set]


	# preparing the test set
	pos_test = pos_all[math.floor(len(pos_all)/5) * 4:]
	neg_test = neg_all[math.floor(len(neg_all)/5) * 4:]
	test_set = pos_test + neg_test


	print("Train set: {}, Pos train: {}, Neg train : {}".format(len(train_set), len(pos_train), len(neg_train)))
	print("Test set: {}, Pos test: {}, Neg test: {}".format(len(test_set), len(pos_test), len(neg_test)))

	#wpdb.set_trace()


	# training the classifier
	model = NaiveBayesClassifier(train_set)

	correct = 0
	TP = 0
	TN = 0
	FP = 0
	FN = 0


	for item in test_set:
		classification = model.classify(item['sentence'])
		if classification == item['label']:
			if classification == "pos":
				TP += 1
			else:
				TN += 1

		else:
			if classification == "pos":
				FP += 1
			else:
				FN += 1


	calculate_performace(TP, FP, TN, FN)
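calculate_performace is not defined in the example. A plausible implementation, under the standard definitions of accuracy, precision, recall and F1, might be:

def calculate_performace(TP, FP, TN, FN):
    # Guard the divisions so an empty class does not raise ZeroDivisionError.
    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total else 0.0
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    print("accuracy={:.3f} precision={:.3f} recall={:.3f} f1={:.3f}".format(
        accuracy, precision, recall, f1))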
Code Example #23
File: train.py Project: dnaport22/dodgybot
class DodgyBot():

    __msg = None

    def __init__(self, ResponseHandler):
        self.recogniser = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.handler = ResponseHandler
        self.chatbot = None
        self.loadIntent()

    def listen(self, listen_freq=0):
        while listen_freq == 0:
            with self.microphone as source:

                self.audio = self.recogniser.listen(source)

            return True

    def recogniseAudio(self):
        print 'Recognising...'
        response = self.recogniser.recognize_google(self.audio)

        return self.setMessage(response)

    def loadIntent(self):
        # self.chatbot = ChatBot(
        #   'Dodgy Bot',
        #   trainer= 'chatterbot.trainers.ChatterBotCorpusTrainer'
        # )
        # self.chatbot.train("chatterbot.corpus.english")

        with open("intent_data.json", "r") as fp:
            self.cl = NaiveBayesClassifier(fp, format="json")

    def getIntent(self):
        return self.cl.classify(self.getMessage())

    def loadResponse(self, intent):
        if (intent == "greeting"):
            return os.system("say '%s'" %
                             (self.handler.greetingHandler(self.getMessage())))
        if (intent == "question"):
            return os.system("say '%s'" %
                             (self.handler.questionHandler(self.getMessage())))

        return os.system("say '%s'" %
                         (self.handler.unknownResponse(self.getMessage())))

    def setMessage(self, msg):
        DodgyBot.__msg = msg

    def getMessage(self):
        return DodgyBot.__msg

    def getChatBot(self):
        return self.chatbot
Code Example #24
    def get_tweet_sentiment(self, tweet):
        '''
        Utility function to classify sentiment of passed tweet
        using textblob's sentiment method
        '''
        # create TextBlob object of passed tweet text

        analysis = self.clean_tweet(tweet)
        # set sentiment
        with open('train.json', 'r') as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        # set sentiment
        if cl.classify(analysis) == "Pos":
            return 'positive'
        elif cl.classify(analysis) == "Neg":
            return 'negative'
        else:
            return 'neutral'
Code Example #25
class TimeLogicAdapter(LogicAdapter):
    """
    The TimeLogicAdapter returns the current time.
    """

    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)

        training_data = [
            ("what time is it", 1),
            ("do you know the time", 1),
            ("do you know what time it is", 1),
            ("what is the time", 1),
            ("do you know the time", 0),
            ("it is time to go to sleep", 0),
            ("what is your favorite color", 0),
            ("i had a great time", 0),
            ("what is", 0)
        ]

        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement, tag_processing = None):

        user_input = statement.text.lower()
        if "time" not in user_input:
            return 0, Statement("")

        try:
            # Find the time zone of the user based on latitude and longitude to get the correct time
            g          = geocoders.GoogleV3()
            user       = tag_processing.user
            lat,lon    = user.get_latitude_longitude()
            timezone   = g.timezone((lat,lon))

            now = datetime.now(timezone)

            confidence = self.classifier.classify(user_input)
            response = Statement("The current time is " + now.strftime("%I:%M %p"))
        except:
            confidence = self.classifier.classify(user_input)
            response = Statement("Sorry. I cannot find the current time. Possible bad user location based on latitude and longitude. Please try again later")

        return confidence, response
Code Example #26
def classify(desc):
    try:
        fp = open(os.path.join(os.path.abspath('.'), 'classifier/train.json'),
                  'r')
    except Exception:
        fp = open('train.json', 'r')
    cl = NaiveBayesClassifier(fp, format='json')
    classification = cl.classify(desc)

    return classification
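For reference, the "json" training format used here (and in several examples above) expects a list of objects with "text" and "label" keys. A minimal train.json could be produced like this:

# Writing a train.json in the shape textblob's "json" format expects.
import json

sample = [
    {"text": "I love this sandwich.", "label": "pos"},
    {"text": "My boss is horrible.", "label": "neg"},
]
with open("train.json", "w") as fp:
    json.dump(sample, fp, indent=2)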
Code Example #27
File: sentiment.py Project: ayhun/MMDS
class NaiveBayesAnalyzer:
    cl = None

    def __init__(self):
        with open("training_data.json", "r") as f:
            self.cl = NaiveBayesClassifier(f, format="json")
        self.cl.show_informative_features(20)

    def analyze(self, text):
        return self.cl.classify(text)
Code Example #28
    def classify_r2(self, one):  #second level

        if (one == 'EdTech'):
            lists = Edtech
            cfy = NaiveBayesClassifier(lists)
            tweet = self.get_tweet()
            two = cfy.classify(tweet)
            return two
        elif (one == 'Leisure'):
            lists = Leisure
            cfy = NaiveBayesClassifier(lists)
            tweet = self.get_tweet()
            two = cfy.classify(tweet)
            return two
        else:
            lists = Places
            cfy = NaiveBayesClassifier(lists)
            tweet = self.get_tweet()
            two = cfy.classify(tweet)
            return two
Code Example #29
 def classify_intent(self):
     with open(self.file, 'r') as fp:
         cl = NaiveBayesClassifier(fp, format='json')
     intent = cl.classify(self.utterance)
     test = [(self.utterance, intent)]
     accuracy = cl.accuracy(test)  # always 1.0: it scores the classifier's own prediction
     # print(accuracy, '<< Accuracy')
     # nb_clf_result = {
     #     'intent': intent,
     #     'accuracy': accuracy
     # }
     return intent
Code Example #30
    def classify_r1(self):

        lists = first_level

        cfy = NaiveBayesClassifier(lists)
        tfile = open("tweets.txt", 'r')
        for line in tfile:
            tweet = line.strip()  # note: only the file's last line is kept
        one = cfy.classify(tweet)

        tfile.close()

        return one
Code Example #31
def respond():
    train = [
        ('Says the Annies List political group supports third-trimester abortions on demand.',
         'false'),
        ('Donald Trump is against marriage equality. He wants to go back.',
         'true'), ('Says nearly half of Oregons children are poor.', 'true'),
        ('State revenue projections have missed the mark month after month.',
         'true'),
        ("In the month of January, Canada created more new jobs than we did.",
         'true'),
        ('If people work and make more money, they lose more in benefits than they would earn in salary.',
         'false'),
        ('Originally, Democrats promised that if you liked your health care plan, you could keep it. One year later we know that you need a waiver to keep your plan.',
         'false'),
        ("We spend more money on antacids than we do on politics.", 'false'),
        ('Barack Obama and Joe Biden oppose new drilling at home and oppose nuclear power.',
         'false'),
        ('President Obama once said he wants everybody in America to go to college.',
         'false')
    ]
    test = [
        ('Because of the steps we took, there are about 2 million Americans working right now who would otherwise be unemployed.',
         'true'),
        ('Scientists project that the Arctic will be ice-free in the summer of 2018',
         'false'),
        ("You cannot build a little guy up by tearing a big guy down -- Abraham Lincoln said it.",
         'false'),
        ("One man opposed a flawed strategy in Iraq. One man had the courage to call for change. One man didn't play politics with the truth.",
         'true'),
        ('When I was governor, not only did test scores improve we also narrowed the achievement gap.',
         'true'),
        ("Ukraine was a nuclear-armed state. They gave away their nuclear arms with the understanding that we would protect them.",
         'false')
    ]
    cl = NaiveBayesClassifier(train)
    print("your test accuracy is ", cl.accuracy(test))
    # Retrieve the message from the url parameter
    message = request.args.get("message", None)
    response = {}
    # Check if the user sent a message at all
    if not message:
        response["ERROR"] = "no user input found, please send a message."
        return jsonify(response)
    # The user sent a valid message
    classified_text = cl.classify(message)
    # Return the response in json format
    return jsonify({"Message": f"{message}", "result": f"{classified_text}"})
Code Example #32
class TwitterTrendAdapter(LogicAdapter):
    def __init__(self, **kwargs):
        super(TwitterTrendAdapter, self).__init__(**kwargs)

        training_data = [
            ("what's trending in ", 1),
            ('what is trending in', 1),
            ('what is', 0),
            ('who is', 0),
            ('who was', 0),
            ('what can you tell me about', 0),
            ('what do you know about', 0),
            ('any clue about', 0),
            ('where is',0),
            ('located', 0),
            ('what is happening', 1)
        ]

        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        confidence = self.classifier.classify(statement.text.lower())
        tokens = nltk.word_tokenize(str(statement))
        tagged = nltk.pos_tag(tokens)
        nouns = [word for word, pos in tagged if (pos == 'NN' or pos == 'NNP' or pos =='JJ' or pos == 'NNS' or pos == 'NNPS')]
        auth = OAuthHandler(twitter_consumer_key, twitter_consumer_secret)
        auth.set_access_token(twitter_access_key, twitter_access_secret)
        api = tweepy.API(auth)
        trendsName = ""
        for noun in nouns:
            try:
                html = urllib.urlopen(
                    'http://where.yahooapis.com/v1/places.q(' + noun + ')?appid=' + yahoo_client_Id).read()
                soup = BeautifulSoup(html, 'html.parser')
                woeids = soup.find('woeid').contents
                for woeid in woeids:
                    id = ' '.join(woeid.string.split())
                    trends1 = api.trends_place(str(id))
                    data = trends1[0]
                    # grab the trends
                    trends = data['trends']
                    names1 = [trend['name'] for trend in trends]
                    trendsName += ' '.join(names1)
            except:
                pass
        if len(nouns) != 0 and len(trendsName)!=0:
            response = Statement("Jarvis: "+trendsName)
        else:
            response = Statement("")
            confidence=0
        return confidence, response
Code Example #33
    def classify_r2(self, one):
        if (one == 'Technology'):

            lists = Technology

            cfy = NaiveBayesClassifier(lists)
            tfile = open("tweets.txt", 'r')
            for line in tfile:
                tweet = line.strip()
            two = cfy.classify(tweet)

            tfile.close()

            return two
Code Example #34
File: Algorithms.py Project: jorgejmt94/TFG
def text_classification_with_naive_bayes(text):
    from textblob.classifiers import NaiveBayesClassifier
    #key words
    dictionary = DB.GET_dictionary_from_DB()
    train = []

    for type in dictionary:
        for word in type.key_words:
            to_add = (word.lower(), Utils.get_data_id_lower(type.type_name))
            train.append(to_add)
    cl = NaiveBayesClassifier(train)

    result = cl.classify(text.lower())
    print('According to key words:', Utils.get_data_name(result))

    #prob_dist = cl.prob_classify(0)
    # prob_dist.max()
    # print(round(prob_dist.prob(0), 12))

    #secondary words
    dictionary = DB.GET_dictionary_from_DB()
    train = []
    to_add_list = []
    aux = []
    import random
    # for the secondary words, take a random sample to balance the classes (otherwise everything comes out as futbol)
    for type in dictionary:
        for word in type.secondary_words:
            to_add_list.append((word.lower(), type.type_name))
        aux = random.sample(to_add_list, 50)  # take 50 at random
        to_add_list = []
        for add in aux:
            train.append(add)

    cl = NaiveBayesClassifier(train)
    result = cl.classify(text.lower())
    print('According to secondary words:', result)
Code Example #35
def classify_v1(text):
    #<str> is passed to func
    text = bc.basic_cleanning(text)  #returned value is in <list> format
    #print(text)
    if text != []:
        with open('train_dataset.csv') as csv_file:
            cl = NaiveBayesClassifier(csv_file, format="csv")
            #cl = NaiveBayesClassifier() #pass dataset as list
            result = cl.classify(text)
            #print (type(result))  # <str> format
            prob_dist = cl.prob_classify(text)
            pos_result = round(prob_dist.prob("pos"), 2)
            neg_result = round(prob_dist.prob("neg"), 2)

            return result
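classify_v1 rebuilds, and therefore retrains, the classifier on every call. A common optimisation is to construct it once and reuse it; a sketch under the same train_dataset.csv assumption:

_classifier = None

def get_classifier():
    # Train once on first use, then reuse the cached instance.
    global _classifier
    if _classifier is None:
        with open('train_dataset.csv') as csv_file:
            _classifier = NaiveBayesClassifier(csv_file, format="csv")
    return _classifier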
Code Example #36
def textblob_naivebayes(train_data, test_data, classify_filter):
    cl = NaiveBayesClassifier(train_data)
    print(
        "\nF1 score:",
        metrics.classification_report(
            [i[1] for i in test_data], [cl.classify(i[0]) for i in test_data]
            # , pos_label="account_api"
            # , average=None
        ))

    classify_data = classify_filter['SUBJECT'].unique()
    temp1 = []
    temp2 = []
    for i in classify_data:
        # print(i, cl.classify(i))
        temp1.append(i)
        temp2.append(cl.classify(i))

    classify_dataframe = pandas.DataFrame(
        numpy.column_stack([temp1, temp2]),
        columns=['subject', 'predicted_label'])

    with open('classifier.csv', 'w') as f:
        classify_dataframe.to_csv(f)
Code Example #37
File: libtraffic.py Project: ferdhika31/smarttraffic
def klasify(text):
    hasil = 'mct'
    train = [
        ('lancar', 'lcr'),
        ('peningkatan volume kendaraan', 'mct'),
        ('ramai', 'mct'),
        ('normal', 'lcr'),
        ('padat merayap', 'mct'),
        ('meriah', 'mct'),
        ('antrian', 'mct'),
        ('penyempitan', 'mct'),
        ('sepi', 'lcr'),
        ('gabisa lewat', 'mct'),
        ('macet', 'mct'),
    ]
    cl = NaiveBayesClassifier(train)
    cl.classify("macet")
    'mct'
    blob = TextBlob(text, classifier=cl)
    for s in blob.sentences:
        hasil = s.classify()
        # print(s.classify())

    return hasil
Code Example #38
File: tweet_tag.py Project: anirudh24/self_aware_bot
class TwitterTagAdapter(LogicAdapter):
    def __init__(self, **kwargs):
        super(TwitterTagAdapter, self).__init__(**kwargs)

        training_data = [
            ('what are people talking about', 1),
            ("what's trending in", 0),
            ('what is going on with', 1),
            ('what are reviews', 1),
            ('what is going on',1),
            ('tweetind',1),
            ('what can you tell me about', 0),
            ('what do you know about', 0),
            ('any clue about', 0),
            ('where is',0),
            ('located', 0),
            ('what is happening', 0)
        ]

        self.classifier = NaiveBayesClassifier(training_data)

    def process(self, statement):
        confidence = self.classifier.classify(statement.text.lower())
        tokens = nltk.word_tokenize(str(statement))
        tagged = nltk.pos_tag(tokens)
        nouns = [word for word, pos in tagged if
                 (pos == 'NN' or pos == 'NNP' or pos =='JJ' or pos == 'NNS' or pos == 'NNPS')]
        downcased = [x.lower() for x in nouns]
        searchTerm = " ".join(downcased).encode('utf-8')
        #"http://where.yahooapis.com/v1/places.q('Place name')?appid=yourappidhere"
        st=""
        if len(nouns) != 0:
            auth = OAuthHandler(twitter_consumer_key, twitter_consumer_secret)
            auth.set_access_token(twitter_access_key, twitter_access_secret)

            api = tweepy.API(auth)
            for status in tweepy.Cursor(api.search, q='#'+searchTerm).items(20):
                st = st+status.text
            response = Statement("Jarvis: "+st)
        else:
            response = Statement("Jarvis: "+"Sorry sir, Nothing Found")
        return confidence, response


#what's trending in city
#movie reviews
#people talking about some topic
Code Example #39
class Scraper:
    def __init__(self, training_data):
        self.cl = NaiveBayesClassifier(training_data)

    def classifier(self, data):
        return self.cl.classify(data)

    def fetch_data(self):
        BASEURL = "https://news.ycombinator.com/news?p="
        for n in range(1):
            r = requests.get(BASEURL + str(n))
            soup = BeautifulSoup(r.content, "html.parser")
            for title in soup.findAll('tr', {'class': 'athing'}):  # Fetch Title
                for t in title.findAll('a', text=True):
                    art_title = t.text.encode("utf8")
                    art_link = t['href']
                    print (self.classifier(art_title), art_title)
Code Example #40
File: core.py Project: Axilent/talkback
class App(DictMixin):
    """ 
    Aggregation of intents.
    """
    def __init__(self,name,greeting):
        self.name = name
        self.greeting = greeting
        self.intents = {}
        self.classifier = None
    
    def __getitem__(self,key):
        return self.intents[key]
    
    def __setitem__(self,key,value):
        self.intents[key] = value
        
        # train classifier
        phrase_file = file(value.phrases,'r')
        phrase_data = yaml.safe_load(phrase_file)
        phrases = [(phrase,value.name) for phrase in phrase_data['Phrases']]
        
        if self.classifier:
            self.classifier.update(phrases)
        else:
            self.classifier = Classifier(phrases)
    
    def __delitem__(self,key):
        del self.intents[key]
    
    def keys(self):
        return self.intents.keys()
    
    def intent_for(self,phrase):
        """ 
        Attempt to match an intent to the supplied phrase, using the onboard classifier.
        """
        if not self.classifier:
            # has not been properly initialized
            raise IntentNotFound('Classifier not initialized')
        
        try:
            return self.intents[self.classifier.classify(phrase)]
        except KeyError:
            raise IntentNotFound
Code Example #41
File: util.py Project: mkumar87/DartsVDSIHackathon
def nayebayesreport(fileFullPath):
    print  "nayebayesreport came"
    print (fileFullPath)
    sentimentDtls = []
    patternCountMap = {
                       "Negative" : 0,
                       "Positive" : 0,
                       "Neutral" : 0,
                       "Total" : 0,
                       }
    
    
    cl = NaiveBayesClassifier(getTrainData())

    print "train data loaded"
    with open(fileFullPath, 'r') as f:
        for line in f:
            try:
                print line
                if line and len(line.strip()) > 0:
                    trainedResult = cl.classify(line)
                        
                    patternResult = "Negative"
                    if "pos" == trainedResult:
                        patternResult = "Positive"
                    
                    patternCountMap[patternResult] = patternCountMap[patternResult] + 1
                    patternCountMap["Total"] = patternCountMap["Total"] + 1
                    
                    sentimentDtls.append({
                                          "sentiment" : patternResult,
                                          "feedback" : line
                                         })
            except Exception:
                print(traceback.format_exc())
                print(line)
    
    addBayesClassifierResult(sentimentDtls)
    return
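getTrainData and addBayesClassifierResult are helpers not shown here. Given how the result is compared against "pos", getTrainData presumably returns (text, 'pos'/'neg') pairs; a hypothetical stand-in:

# Hypothetical stand-in for the assumed getTrainData helper.
def getTrainData():
    return [
        ('I love this product.', 'pos'),
        ('This is terrible.', 'neg'),
    ]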
Code Example #42
File: Classifier.py Project: yjlo123/CViA
class Classifier:
    def __init__(self):
        self.cachedStopWords = stopwords.words("english")
        self.path = os.path.dirname(os.path.abspath(__file__))

    def train(self, train_set):
        train_data = []
        for t in train_set:
            train_data.append((self._cvobj_to_string(t[0]),t[1]))
        print "Training model..."
        #print train_data
        self.cl = NaiveBayesClassifier(train_data)
        #print self._cvobj_to_string(train_set[0][0])

    def _cvobj_to_string(self, cv):
        text = ""  # renamed from 'str' to avoid shadowing the builtin
        for exp in cv['experience']:
            text += (exp['description']+" ")
        for proj in cv['project']:
            text += (proj['title']+" ")
            text += (proj['description']+" ")
        for skill in cv['skill']:
            text += (skill+" ")
        text = text.decode("utf-8", "replace")
        text = ' '.join([word for word in text.split() if word not in self.cachedStopWords])
        return text

    def classify(self, cv):
        return self.cl.classify(self._cvobj_to_string(cv))

    def save(self):
        pickle.dump( self.cl, open( self.path+"/cv_model.cvm", "wb" ) )
        print "CV classifier saved."

    def load(self):
        self.cl = pickle.load( open( self.path+"/cv_model.cvm", "rb" ) )
        print "CV classifier loaded."
Code Example #43
File: events.py Project: nimmaj/sentiment-trading
def NaiveBayesAnalyzerParser(text):
    
    train =[('creates jobs', 'pos'),
            ('create jobs', 'pos'),
            ('created jobs', 'pos'),
            ('new jobs', 'pos'),
            ('jobs wanted', 'pos'),
            ('jobs needed', 'pos'),
            ('jobs call by', 'pos'),
            ('unemployment falls', 'pos'),
            ('bring jobs', 'pos'),
            ('jobs comming', 'pos'),
            ('unemployment drops', 'pos'),
            ('cut jobs', 'neg'),
            ('cutting jobs', 'neg'),
            ('cuts jobs', 'neg'),
            ('lost jobs', 'neg'),
            ('job loss', 'neg'),
            ('losing jobs', 'neg'),
            ('lose jobs', 'neg'),
            ('jobs not kept', 'neg'),
            ('jobs trim', 'neg'),
            ('unemployment rises', 'neg'),
            ('drops', 'neg'),
            ('drop', 'neg'),
            ('dollar falls', 'neg'),
        ]
    cl = NaiveBayesClassifier(train)
    sentiment = TextBlob(text, analyzer=NaiveBayesAnalyzer()).sentiment
    #Sentiment(classification='pos', p_pos=0.6023632501327671, p_neg=0.3976367498672331)
    #print(sentiment)
    subjectivity = 1 - (max(sentiment.p_pos,sentiment.p_neg) - min(sentiment.p_pos,sentiment.p_neg))
    if cl.classify(text) == 'pos':
        return (sentiment.p_pos, subjectivity)
    else:
        return (sentiment.p_neg*-1, subjectivity)
Code Example #44
File: test_classifiers.py Project: Anhmike/TextBlob
class TestNaiveBayesClassifier(unittest.TestCase):

    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                        self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        class MockRedisFormat(formats.BaseFormat):
            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''

        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_accuracy_on_json_file(self):
        with open(JSON_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
            lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
コード例 #45
0
ファイル: classification.py プロジェクト: dshvets/EAPSI_2016
# Train the classifier on the training data set
with open("training_data.tsv") as train:
    cl = NaiveBayesClassifier(train, format="tsv")

testing = open("testing_data.txt",'r')
classify = open("classified_data.txt",'w')

for line in testing:
    fields = line.rstrip('\n').split('\t')
    if len(fields) < 2:
        continue  # skip malformed lines
    pmid, sent = fields[0], fields[1]
    try:
        classify_result = cl.classify(sent)  # classify the sentence into one of four groups
        classify.write(pmid + '\t' + sent + '\t' + classify_result + '\n')
    except Exception:
        pass  # skip sentences that fail to classify
    #prob_dist = cl.prob_classify(line)  Might use this later for a future network graph.
    #assoc = round(prob_dist.prob("assoc"),2) For now will just use the highest probability match for each sentence.
    #found = round(prob_dist.prob("found"),2)
    #isA = round(prob_dist.prob("is"),2)
    #involve = round(prob_dist.prob("involve"),2)
    #newLine = line+'\tAssociation\t'+str(assoc)+'\tFoundIn\t'+str(found)+'\tIsA\t'+str(isA)+'\tInvolve\t'+str(involve)+'\n'


testing.close()
classify.close()
コード例 #46
0
from textblob.classifiers import NaiveBayesClassifier

train = [
    ('I love this sandwich.', 'pos'),
    ('This is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('This is my best work.', 'pos'),
    ("What an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg'),
    ("I like big butts and I cannot lie","butts")
]
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

cl = NaiveBayesClassifier(train)
print cl.accuracy(test)
print cl.classify("Their burgers are amazing")  # "pos"
print cl.classify("I don't hate you.")  # "neg"
コード例 #47
0
# (fragment: the enclosing loop that reads each HTML file's contents into
#  post_body as i_file, and the code that builds `train`, are not shown)
        post_body = post_body.decode("utf-8")
    except UnicodeDecodeError:
        continue

    test.append([post_body, i_file])


Bayes = NaiveBayesClassifier(train)

print os.getcwd()

for body in test:
    judge = Bayes.classify(body[0])
    if judge == "positive":
        call(["mv", "./" + body[1], "john/"])
    elif judge == "negative":
        call(["mv", "./" + body[1], "non_john/"])

os.mkdir("hard_to_classify")
remaining = glob.glob("*.html")
for doc in remaining:
    call(["mv", "./" + doc, "hard_to_classify/"])

# print Bayes.accuracy(test)
print Bayes.show_informative_features(10)
コード例 #48
0
ファイル: script.py プロジェクト: AbhijeetSri/TextMining
files = glob.glob(Propath)  # (reconstructed: mirrors the non-pro loop below)
for name in files:
    with open(name) as f:
        text = f.read()
        text = text.replace("\n", " ")
        text = unicode(text, "utf-8", errors="ignore")
        data.append((text, "pro"))
        i += 1


files = glob.glob(NonPropath)
for name in files:
    with open(name) as f:
        text = f.read()
        text = text.replace("\n", " ")
        text = unicode(text, "utf-8", errors="ignore")
        data.append((text, "non-pro"))


random.shuffle(data)
number_of_elements = len(data)
split = (number_of_elements / 3) * 2
train = data[:split]
test = data[split:]

# print 'content of line 5 ' , train[4]

cl = NaiveBayesClassifier(train)
print 'accuracy:', cl.accuracy(test)
print cl.classify(
    "Your symptoms may be caused due to a musculo-skeletal strain. I would advise you to take OTC pain-killers/NSAIDS and see if it helps. Rest and ice will also help to relieve the symptoms. If the pain does not get better, you may need to visit your doctor for a physical examination. X-rays will usually be normal in most cases."
)
コード例 #49
0
        exit(error)  # (fragment: the argument check that defines `error` is not shown)

    print("Importing...")
    a = time.time()
    data_tuples = get_training_tuples(sys.argv[1])
    print("Data import time: {0}s".format(time.time()-a))
    # Shuffle data: the first 250 items form the training set; the next 250, the test set
    random.seed(1)
    random.shuffle(data_tuples)
    training = data_tuples[:250]
    test = data_tuples[250:500]

    # Train classifier
    print("Training...")
    a = time.time()
    cl = NaiveBayesClassifier(training)
    print("Training time: {0}s".format(time.time()-a))

    # Test classifier
    print("Accuracy: {0}".format(str(cl.accuracy(test))))

    # Classify stuff
    while True:
        text = input("Enter text to classify or 'q' to quit: ")
        if text == 'q':
            print("Exiting")
            break
        else:
            print("Class: {0}".format(cl.classify(text)))
    
コード例 #50
0
ファイル: naivebayes.py プロジェクト: lakeesh10/MrFavorite
if os.path.exists('/home/lakeesh10/Documents/projectdemo/naivebayes_classifier.pickle'):
	print "file exist"
	naive = load_naiveclassifier()
else:
	naive = NaiveBayesClassifier(train)
	save_naiveclassifier(naive)
print "Naive Bayes Trained"

if os.path.exists('/home/lakeesh10/Documents/projectdemo/decisiontree_classifier.pickle'):
	decision = load_decisionclassifier()
else:
	decision = DecisionTreeClassifier(train)
	save_decisionclassifier(decision)
print "Decision Tree Trained"

print("Naive Bayes : ",naive.classify("fried chip good and crunchy dig thattaco tropical omg so eyeopening"))
#print(decision.classify("fried chip good and crunchy dig thattaco tropical omg so eyeopening"))
cl=NaiveBayesAnalyzer()
print (cl.analyze("fried chip good and crunchy dig thattaco tropical omg so eyeopening"))
blob = TextBlob("fried chip good and crunchy dig thattaco tropical omg so eyeopening")
polarity=0
i=0
for sentence in blob.sentences:
	polarity=polarity+sentence.sentiment.polarity
	i=i+1
polarity=polarity/i 
print(polarity)

negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
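The save/load classifier helpers referenced above are not shown in this fragment; a plausible pickle-based sketch, with paths assumed to match the file names tested above:

import pickle

def save_naiveclassifier(clf, path='naivebayes_classifier.pickle'):
	with open(path, 'wb') as f:
		pickle.dump(clf, f)

def load_naiveclassifier(path='naivebayes_classifier.pickle'):
	with open(path, 'rb') as f:
		return pickle.load(f)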
 
コード例 #51
0
#Training the classifier on the headline dataset
with open("dataset1.json", 'r', encoding="utf-8-sig") as fp1:
	cl1 = NaiveBayesClassifier(fp1, format="json")

#Training the classifier on the body dataset
with open("dataset2.json", 'r', encoding="utf-8-sig") as fp2:
	cl2 = NaiveBayesClassifier(fp2, format="json")

#Taking the string values
str1 = str(headline)
headline = TextBlob(str1)
body = str(body)
tb_body = TextBlob(body)
subjectivity = tb_body.sentiment.subjectivity
subjectivity = float(subjectivity) * 100
body_classify = str(cl2.classify(body))
body = body.lower()

#Stripping 'was'/'Was' and stray right quotes from the headline
headline = headline.replace('Was', '')
headline = headline.replace('was', '')
headline = headline.replace('’', '')

#Finding the tags in the sentence
array = headline.tags
array1 = []

#Finding the hot words
for ii in array:
	name, tag = ii
	name = str(name)
コード例 #52
0
ファイル: classify.py プロジェクト: mersanuzun/alsam_mi_ki
#!/usr/bin/python
# -*- coding: utf-8 -*-
from textblob.classifiers import NaiveBayesClassifier
import codecs

train_data = [("bu ürün çok güzel".decode('utf8'), 'pos'),
              ('çok memnunum'.decode('utf8'), 'pos'),
              ('Çok uygun fiyata çok güzel ürün'.decode('utf8'), 'pos'),
              ('Tek kelimeyle harika', 'pos'),
              ('beğenmedim'.decode('utf-8'), 'neg'),
              ('hiç iyi bir ürün değil'.decode('utf8'), 'neg'),
              ('almayın bence'.decode('utf-8'), 'neg')]
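# Rough English glosses of the Turkish strings above: "bu ürün çok güzel" =
# "this product is very nice"; "çok memnunum" = "I am very satisfied";
# "Çok uygun fiyata çok güzel ürün" = "a very nice product at a very
# affordable price"; "Tek kelimeyle harika" = "simply wonderful";
# "beğenmedim" = "I didn't like it"; "hiç iyi bir ürün değil" = "not a good
# product at all"; "almayın bence" = "I suggest you don't buy it".
# Note: train_data is defined but unused; the classifier below is trained on
# the reviews read from disk.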

reviews = []
with open('../scraping_reviews/pos.txt') as f:
    for line in f:
        reviews.append((line.decode('utf8'), 'pos'))
with open('../scraping_reviews/neg.txt') as f:
    for line in f:
        reviews.append((line.decode('utf8'), 'neg'))
print reviews[-2]  # sanity check

cl = NaiveBayesClassifier(reviews)
print cl.classify('ürünü gerçekten beğenmedim'.decode('utf8'))  # "I really didn't like the product"

コード例 #53
0
"Korban diajak tersangka ke musala di dekat pondok. Saat kondisi sepi dan hanya berdua dengan korban, tersangka mencabuli korban," kata Wahyu kepada wartawan, Minggu (20/3/2016).

Lantaran menganggap Nurul sebagai Gus, korban pun tak berani menolak permintaan tersangka. Terlebih lagi, tersangka membujuk korban bahwa perbuatan cabul itu untuk memasukkan ilmu kebatinan ke tubuh korban.

"Tersangka berdalih untuk mengajari korban ilmu tasawuf. Nyatanya itu hanya untuk memuluskan niat tersangka agar bisa mencabuli korban," ungkapnya.

Menurut Wahyu, perbuatan cabul itu dilakukan tersangka kepada korban berulang kali selama 2 tahun terakhir. Bahkan korban diminta membayar uang kepada tersangka setiap kali usai melakukan pencabulan. Nilainya antara Rp 200.000 hingga jutaan rupiah.

"Tersangka juga meminta uang dari korban berulang kali. Total kerugian korban Rp 40 juta," sebutnya.

Tak tahan dengan perbuatan Nurul, lanjut Wahyu, korban pun memutuskan buka mulut ke teman sesama santri. Mendapat dukungan dari teman-temannya, korban memberanikan diri melapor ke Polres Jombang, Kamis (17/3).

Pada hari yang sama, polisi memutuskan menjebak tersangka. "Saat korban menyerahkan uang yang terakhir kepada tersangka, saat itu tersangka langsung kami tangkap," jelasnya.

Akibat perbuatannya, kini Nurul harus mendekam di Rutan Polres Jombang. Tersangka dijerat dengan Pasal 80 ayat (1) juncto Pasal 82 ayat (1) UU RI No 35 Tahun 2014 tentang Perlindungan Anak dengan ancaman pidana maksimal 15 tahun penjara.

"Kalau ada yang merasa menjadi korban perbuatan tersangka ini, jangan malu melapor, akan kami jaga identitasnya. Karena itu bisa memberatkan tersangka," pungkasnya. """

# (fragment: `cl` and stem_words() are defined earlier; the triple-quoted text
#  above is an Indonesian news article used as classification input)
tic = timeit.default_timer()
renum = ''.join([i for i in text if not i.isdigit()])  # drop digits
text = stem_words(renum)
print("text diatas setelah diklasifikasi yaitu %s\n" % cl.classify(text))  # "the text above is classified as %s"
toc = timeit.default_timer()
print("waktu klasifikasi : ")  # "classification time:"
print(toc - tic)

print(cl.show_informative_features(20))
# classifier = TextBlob(stemstop_output, classifier=cl)
# print(classifier.classify())

コード例 #54
0
ファイル: testBlobber.py プロジェクト: weifengli001/pythondev


def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

featuresets = [(extract_features(d), c) for (d, c) in tweets]
print('featuresets: ', len(featuresets))
train_set, test_set = featuresets[:80], featuresets[80:]

blob = TextBlob("It's not the worst.", analyzer=NaiveBayesAnalyzer())
print(blob.sentiment)
blob = TextBlob("It's not the worst")
print(blob.sentiment)


cl = NaiveBayesClassifier(train)  # `train` is presumably built earlier in the file (not shown)
print(cl.classify("It's not the worst"))



#tb = Blobber(analyzer=NaiveBayesAnalyzer())


#print(tb("DonaldTrump under fire for comments about women  weigh in on").sentiment)

コード例 #55
0
ファイル: BankClassify.py プロジェクト: robintw/BankClassify
class BankClassify():

    def __init__(self, data="AllData.csv"):
        """Load in the previous data (by default from AllData.csv) and initialise the classifier"""
        if os.path.exists(data):
            self.prev_data = pd.read_csv(data)
        else:
            self.prev_data = pd.DataFrame(columns=['date', 'desc', 'amount', 'cat'])

        self.classifier = NaiveBayesClassifier(self._get_training(self.prev_data), self._extractor)

    def add_data(self, filename):
        """Add new data and interactively classify it.

        Arguments:
         - filename: filename of Santander-format file
        """
        self.new_data = self._read_santander_file(filename)

        self._ask_with_guess(self.new_data)

        self.prev_data = pd.concat([self.prev_data, self.new_data])
        self.prev_data.to_csv("AllData.csv", index=False)

    def _prep_for_analysis(self):
        """Prepare data for analysis in pandas, setting index types and subsetting"""
        self.prev_data = self._make_date_index(self.prev_data)

        self.prev_data['cat'] = self.prev_data['cat'].str.strip()

        self.inc = self.prev_data[self.prev_data.amount > 0]
        self.out = self.prev_data[self.prev_data.amount < 0]
        self.out.amount = self.out.amount.abs()

        self.inc_noignore = self.inc[self.inc.cat != 'Ignore']
        self.inc_noexpignore = self.inc[(self.inc.cat != 'Ignore') & (self.inc.cat != 'Expenses')]

        self.out_noignore = self.out[self.out.cat != 'Ignore']
        self.out_noexpignore = self.out[(self.out.cat != 'Ignore') & (self.out.cat != 'Expenses')]

    def _read_categories(self):
        """Read list of categories from categories.txt"""
        categories = {}

        with open('categories.txt') as f:
            for i, line in enumerate(f.readlines()):
                categories[i] = line.strip()

        return categories

    def _add_new_category(self, category):
        """Add a new category to categories.txt"""
        with open('categories.txt', 'a') as f:
            f.write('\n' + category)

    def _ask_with_guess(self, df):
        """Interactively guess categories for each transaction in df, asking each time if the guess
        is correct"""
        # Initialise colorama
        init()

        df['cat'] = ""

        categories = self._read_categories()

        for index, row in df.iterrows():

            # Generate the category numbers table from the list of categories
            cats_list = [[idnum, cat] for idnum, cat in categories.items()]
            cats_table = tabulate(cats_list)

            stripped_text = self._strip_numbers(row['desc'])

            # Guess a category using the classifier (only if there is data in the classifier)
            if len(self.classifier.train_set) > 1:
                guess = self.classifier.classify(stripped_text)
            else:
                guess = ""


            # Clear the screen (ANSI escape), then print the list of categories
            print(chr(27) + "[2J")
            print(cats_table)
            print("\n\n")
            # Print transaction
            print("On: %s\t %.2f\n%s" % (row['date'], row['amount'], row['desc']))
            print(Fore.RED  + Style.BRIGHT + "My guess is: " + str(guess) + Fore.RESET)

            input_value = input("> ")

            if input_value.lower() == 'q':
                # If the input was 'q' then quit
                return df
            if input_value == "":
                # If the input was blank then our guess was right!
                df.loc[index, 'cat'] = guess  # .loc replaces the deprecated .ix indexer
                self.classifier.update([(stripped_text, guess)])
            else:
                # Otherwise, our guess was wrong
                try:
                    # Try converting the input to an integer category number
                    # If it works then we've entered a category
                    category_number = int(input_value)
                    category = categories[category_number]
                except ValueError:
                    # Otherwise, we've entered a new category, so add it to the list of
                    # categories
                    category = input_value
                    self._add_new_category(category)
                    categories = self._read_categories()

                # Write correct answer
                df.loc[index, 'cat'] = category
                # Update classifier
                self.classifier.update([(stripped_text, category)])

        return df

    def _make_date_index(self, df):
        """Make the index of df a Datetime index"""
        df.index = pd.DatetimeIndex(df.date.apply(dateutil.parser.parse, dayfirst=True))

        return df

    def _read_santander_file(self, filename):
        """Read a file in the plain text format that Santander provides downloads in.

        Returns a pd.DataFrame with columns of 'date', 'desc' and 'amount'."""
        with open(filename, errors='replace') as f:
            lines = f.readlines()

        dates = []
        descs = []
        amounts = []

        for line in lines[4:]:

            line = "".join(i for i in line if ord(i)<128)
            if line.strip() == '':
                continue

            splitted = line.split(":")

            category = splitted[0]
            data = ":".join(splitted[1:])

            if category == 'Date':
                dates.append(data.strip())
            elif category == 'Description':
                descs.append(data.strip())
            elif category == 'Amount':
                just_numbers = re.sub(r"[^0-9.-]", "", data)
                amounts.append(just_numbers.strip())

        df = pd.DataFrame({'date':dates, 'desc':descs, 'amount':amounts})

        df['amount'] = df.amount.astype(float)
        df['desc'] = df.desc.astype(str)
        df['date'] = df.date.astype(str)

        return df

    def _get_training(self, df):
        """Get training data for the classifier, consisting of tuples of
        (text, category)"""
        train = []
        subset = df[df['cat'] != '']
        for i in subset.index:
            row = subset.loc[i]
            new_desc = self._strip_numbers(row['desc'])
            train.append((new_desc, row['cat']))

        return train

    def _extractor(self, doc):
        """Extract tokens from a given string"""
        # TODO: Extend to extract words within words
        # For example, MUSICROOM should give MUSIC and ROOM
        tokens = self._split_by_multiple_delims(doc, [' ', '/'])

        features = {}

        for token in tokens:
            if token == "":
                continue
            features[token] = True

        return features

    def _strip_numbers(self, s):
        """Strip everything except uppercase letters and spaces from the given string"""
        return re.sub("[^A-Z ]", "", s)

    def _split_by_multiple_delims(self, string, delims):
        """Split the given string by the list of delimiters given"""
        regexp = "|".join(delims)

        return re.split(regexp, string)
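A minimal usage sketch for the class above (file names assumed; the statement file is expected in Santander's plain-text download format):

bc = BankClassify(data="AllData.csv")
bc.add_data("statement.txt")  # interactively confirm or correct each guessed category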
コード例 #57
0
# (fragment: the opening of get_list_tuples, which reads dataset.txt and splits
# each tab-separated line into label tabsep[0] and its words, is not shown)
			for word in words:
				if word not in stopwords.words() and not word.isdigit():
					list_tuples.append((word.lower(), tabsep[0]))
			c += 1
			if c == 500:
				break
	return list_tuples
print 'importing data...'
a = time.time()
entire_data = get_list_tuples("dataset.txt")
print "It took "+str(time.time()-a)+" seconds to import data"
print 'data imported'
random.seed(1)
random.shuffle(entire_data)
train = entire_data[:750]
test = entire_data[750:1500]
print 'training data'
a = time.time()
cl = NaiveBayesClassifier(train)
print "It took "+str(time.time()-a)+" seconds to train data"
print 'data trained, now checking accuracy:'
accuracy = cl.accuracy(test)
print "accuracy: "+str(accuracy)
cl.show_informative_features(5)

x = ""
while (x != "exit"):
	x = raw_input("enter a email to check if it is a spam email or not , type exit to exit \n")
	print cl.classify(x)
	