Example #1
    def __call__(self, text):
        context = self.context
        request = self.request
        response = request.response
        catalog = context.portal_catalog

        bayesFilter = api.portal.get_registry_record(
            'i8d.content.browser.coverSetting.ICoverSetting.bayesFilter')

        trainingSet = []
        for line in bayesFilter.split('\n'):
            trainingSet.append({
                'category': 'hasKey',
                'text': safe_unicode(line)
            })

        trainer = Trainer(tokenizer)
        for record in trainingSet:
            trainer.train(record['text'], record['category'])
        classifier = Classifier(trainer.data, tokenizer)

        result = classifier.classify(safe_unicode(text))

        return result
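Note that the training set above contains only the single 'hasKey' category, so classify() can only ever rank that one label. A minimal sketch of also training a contrasting category before the Classifier is built, assuming a second registry field; the 'noKey' label and the negativeFilter record are illustrative assumptions, not part of the original ICoverSetting:

        # Illustrative only: give the classifier a second category to compare
        # against. 'negativeFilter' and 'noKey' are assumed names.
        negativeFilter = api.portal.get_registry_record(
            'i8d.content.browser.coverSetting.ICoverSetting.negativeFilter')
        for line in negativeFilter.split('\n'):
            trainer.train(safe_unicode(line), 'noKey')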
Example #2
    def getKeywords(self, html):

        text = self.getHtml2Text(html)
        text = self.zhsJieba(text)

        # get the registry record
        reg = api.portal.get_registry_record('mingjing.content.browser.mjnetSetting.IMJNetSetting.catDict')
        trainSet = []
        for item in reg:
            key = item.split('|||')[0]
            for line in reg[item].split('\n'):
                zhsString = self.zhsJieba(line)
                trainSet.append({'category': key, 'text': zhsString})

        # classify the article with naive Bayes
        newsTrainer = Trainer(tokenizer)
        for news in trainSet:
            newsTrainer.train(news['text'].encode('utf-8'), news['category'])
        newsClassifier = Classifier(newsTrainer.data, tokenizer)
        classification = newsClassifier.classify(text)
        print classification
        if classification[0][1] == 0.0:
            classification.insert(0, (u'n99', 0.0))
        result = []
        for item in classification:
            result.append(item[0])
        return result
Example #3
  def train_spam_texts():
    # Reading dataset file
    dataset_lang = "ru"
    dataset_file = codecs.open(os.path.abspath(os.curdir) + "/data/assets/spam_texts.json", "r", "utf_8_sig")
    dataset_data = json.load(dataset_file)

    # Preparing adverts spam dataset
    prepared_dataset = []
    for idx, item in enumerate(dataset_data[dataset_lang]["adverts"]):
      prepared_dataset.append({
        "text": item["text"],
        "category": "adverts"
      })
    
    # Training
    # (Will be replaced by another library soon)
    advertsTrainer = Trainer(tokenizer)
    for one_dataset_item in prepared_dataset:
      advertsTrainer.train(one_dataset_item["text"], one_dataset_item["category"])
    adverts_classifier = Classifier(advertsTrainer.data, tokenizer)

    # Usage
    # classification = adverts_classifier.classify("рассылка")
    # category_chance = classification[0][1]
    # print(category_chance)
def classify(filename, size):

    trainingSet, testingSet = make_chronological_sets.create_sets(
        filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])

    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    for sample in testingSet:

        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']

        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    prop_caught = float(mal_mal) / float(mal_mal + clean_mal)
    prop_missed = float(clean_mal) / float(mal_mal + clean_mal)

    ## Stuff to get proportions:

    # size = float(size)

    # mal_mal = float(mal_mal)/size
    # mal_clean = float(mal_clean)/size
    # clean_mal = float(clean_mal)/size
    # clean_clean = float(clean_clean)/size

    ## Confusion matrix stuff:

    # confusionMatrix = [['Actually malicious', mal_mal, clean_mal], ['Actually clean', mal_clean, clean_clean]]

    # print tabulate(confusionMatrix, headers=['', 'Predicted malicious', 'Predicted clean'])

    print "Total: " + str(mal_mal + mal_clean + clean_mal + clean_clean)
    print "Malware: " + str(mal_mal + clean_mal)
    print "Clean: " + str(mal_clean + clean_clean)
    print "Caught: " + str(mal_mal) + " (" + "{:.1%}".format(prop_caught) + ")"
    print "Missed: " + str(clean_mal) + " (" + "{:.1%}".format(
        prop_missed) + ")"
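The four counters above form a complete confusion matrix, so the usual metrics fall out directly. A minimal sketch that could extend the function, assuming the same count names and guarding against empty denominators (which the original does not):

    # Sketch: derived metrics from the confusion counts above.
    total = mal_mal + mal_clean + clean_mal + clean_clean
    accuracy = float(mal_mal + clean_clean) / total if total else 0.0
    precision = float(mal_mal) / (mal_mal + mal_clean) if (mal_mal + mal_clean) else 0.0
    recall = float(mal_mal) / (mal_mal + clean_mal) if (mal_mal + clean_mal) else 0.0
    print "Accuracy: {:.1%}  Precision: {:.1%}  Recall: {:.1%}".format(
        accuracy, precision, recall)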
Example #5
def get_classer():
    newsTrainer = Trainer(tokenizer)

    for news in newsSet:
        newsTrainer.train(news['text'], news['category'])

    # When you have sufficient trained data, you are almost done and can start to use
    # a classifier.
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    return newsClassifier
Example #6
class NaiveBayesClassifier:
    def __init__(self):
        jieba.set_dictionary('dict.big.txt')
        self.articleTrainer = Trainer(tokenizer)

    def train(self):
        # Training
        articles = article.create_articles_from_file("data/HatePoliticsdata.json")
        p_train = articles[0:3001]
        p_test = articles[3001:3031]

        for a in p_train:
            doc = a.body
            #seg_list = jieba.lcut(doc, cut_all=False)
            seg_list = jieba.analyse.extract_tags(doc)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'politics')

        articles = article.create_articles_from_file("data/Gossipingdata.json")
        g_train = articles[0:3000]
        g_test = articles[3001:3301]

        for a in g_train:
            doc = a.body
            #seg_list = jieba.lcut(doc, cut_all=False)
            seg_list = jieba.analyse.extract_tags(doc)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'gossiping')
        f = open('data/docCountOfClasses.json', 'w', -1, 'utf-8')
        f.write(json.dumps(self.articleTrainer.data.docCountOfClasses))
        f.close()
        f = open('data/frequencies.json', 'w', -1, 'utf-8')
        f.write(json.dumps(self.articleTrainer.data.frequencies))
        f.close()
        

    def classify(self, article):
        self.data = TrainedData()
        f = open('data/docCountOfClasses.json', 'r', -1, 'utf-8')
        self.data.docCountOfClasses = json.load(f)
        f.close()
        f = open('data/frequencies.json', 'r', -1, 'utf-8')
        self.data.frequencies = json.load(f)
        f.close()
        #Testing
        self.articleClassifier = Classifier(self.data, tokenizer)
        doc = article.body
        #seg_list = jieba.lcut(doc, cut_all=False)
        seg_list = jieba.analyse.extract_tags(doc)
        doc = " ".join(seg_list)
        classification = self.articleClassifier.classify(doc)
        return classification[0][0]
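A minimal usage sketch for the class above, assuming the data/ JSON files referenced in train() exist; the variable names are illustrative, and classify() expects any object exposing a .body string:

# Usage sketch (illustrative names, not part of the original class):
# nbc = NaiveBayesClassifier()
# nbc.train()                         # writes data/docCountOfClasses.json and data/frequencies.json
# label = nbc.classify(some_article)  # some_article must expose a .body string
# print(label)                        # 'politics' or 'gossiping'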
def classify(filename, size, url, result):

    trainingSet = make_training_set.create_set(filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words = [], signs_to_remove = [""]))

    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])

    classifier = Classifier(trainer.data, tokenizer.Tokenizer(stop_words = [], signs_to_remove = [""]))

    print "Expected: " + result
    print classifier.classify(url)
Example #8
def generate(mongourl, database, collection, lang):

    c = MongoClient(mongourl)
    tweets = c[database][collection].find()
    trainer = Trainer(tokenizer)
    
    for tweet in tweets:
        trainer.train(tweet['tweet'], tweet['gender'])
    
    modelFileName = 'model_{}.txt'.format(lang)
    with open(modelFileName, 'wb') as modelFile:
        cPickle.dump(trainer.data, modelFile, cPickle.HIGHEST_PROTOCOL)
        print('OK : generated trained data has been written in the file "{}"'.
            format(modelFileName))
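The file written above stores only trainer.data; a minimal sketch of the matching load step, assuming the same naiveBayesClassifier imports as the example (load_model is a hypothetical helper name):

def load_model(lang):
    # Hypothetical counterpart to generate(): rebuild a Classifier from the
    # pickled trainer data written above.
    with open('model_{}.txt'.format(lang), 'rb') as modelFile:
        data = cPickle.load(modelFile)
    return Classifier(data, tokenizer)

# e.g. classifier = load_model('en'); print(classifier.classify('some tweet text'))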
Example #10
def post_logfile():
    if request.method == 'GET':
        log_file = request.args['symptom']
        print(log_file)
        diseaseclassifier = Trainer(tokenizer)  #STARTS CLASSIFIER
        with open("Dataset.csv", "r") as file:  #OPENS DATASET
            for i in file:  #FOR EACH LINE
                lines = i.split(",")  #PARSE CSV <DISEASE> <SYMPTOM>
                diseaseclassifier.train(lines[1], lines[0])  #TRAINING
        diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
        classification = diseaseclassifier.classify(log_file)  #CLASSIFY INPUT
        print classification

        return json.dumps(dict(classification))
Example #11
def create_naive_bayes_classifier(training_examples, training_annotations):
    print("creating naive bayes classifier")
    annotations = [categories[x] for x in training_annotations]

    news_trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for example, annotation in zip(training_examples, annotations):
        news_trainer.train(example, annotation)
    classifier = Classifier(
        news_trainer.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    print("\t->done")
    return classifier
Example #12
def post_logfile():
    if request.method == 'POST':
        log_file = request.args['symptom']
        print(log_file)
        diseaseclassifier = Trainer(tokenizer)  #STARTS CLASSIFIER
        with open("Dataset.csv", "r") as file:  #OPENS DATASET
            for i in file:  #FOR EACH LINE
                lines = i.split(",")  #PARSE CSV <DISEASE> <SYMPTOM>
                diseaseclassifier.train(lines[1], lines[0])  #TRAINING
        diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
        classification = diseaseclassifier.classify(log_file)  #CLASSIFY INPUT
        print classification
        result = []
        for item in classification:
            obj = CustomType(item[0], item[1])
            result.append(json.loads(obj.toJSON()))
        # return json.dumps(OrderedDict(classification))
        return json.dumps(result, indent=4)
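Examples #10 and #12 retrain the classifier from Dataset.csv on every request. A minimal sketch of training once at module load and reusing the classifier inside the handler, assuming the same Flask request object, CSV layout, and imports as above (the module-level names and the omitted route registration are illustrative):

# Illustrative: train once at import time instead of per request.
_trainer = Trainer(tokenizer)
with open("Dataset.csv", "r") as f:
    for line in f:
        disease, symptom = line.strip().split(",", 1)
        _trainer.train(symptom, disease)
_classifier = Classifier(_trainer.data, tokenizer)

def post_logfile_cached():
    # Hypothetical variant of post_logfile() that reuses the shared classifier.
    log_file = request.args['symptom']
    return json.dumps(dict(_classifier.classify(log_file)))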
def classify(filename, size):

    trainingSet, testingSet = make_balanced_sets.create_sets(filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])

    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    for sample in testingSet:

        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']

        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    size = float(size)

    mal_mal = float(mal_mal) / size
    mal_clean = float(mal_clean) / size
    clean_mal = float(clean_mal) / size
    clean_clean = float(clean_clean) / size

    confusionMatrix = [[mal_mal, clean_mal], [mal_clean, clean_clean]]

    pprint(confusionMatrix)
    print "Accuracy: " + str(mal_mal + clean_clean)
    print "False positives (predicted clean when malicious): " + str(clean_mal)
    print "False negatives (predicted malicious when clean): " + str(mal_clean)
Example #14
 def neyronka(self, _str):
     newsTrainer = Trainer(tokenizer)
     with open('o', 'rt', encoding='utf8') as csvfile:
         newsSet = []
         for i in csvfile.readlines():
             if i == '\n':
                 continue
             theme, text = i.split('***')
             newsSet.append({'text': text.strip(), 'category': theme})
         for news in newsSet:
             newsTrainer.train(news['text'], news['category'])
         newsClassifier = Classifier(newsTrainer.data, tokenizer)
         unknownInstance = _str
         classification = newsClassifier.classify(unknownInstance)
         return (sorted(classification, key=(lambda x: -x[1])))
Example #15
def create_nbc_nb_classifier(training_dataset):
    training_examples, training_annotations = training_dataset
    # training_annotations = [int(not bool(annotation)) for annotation in training_annotations]
    parsed_training_examples = [
        set(tokenize(example)) for example in training_examples
    ]

    tr = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for example, annotation in zip(parsed_training_examples,
                                   training_annotations):
        tr.train(example, annotation)

    print("number of tokens seen: %s" % len(tr.data.frequencies.keys()))
    return tr, Classifier(
        tr.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
def tweet_classification(unknownInstance):
    newsTrainer = Trainer(tokenizer)
    with open("train.txt") as f:
        for line in f:
            parts = line.split(' ', 1)
            newsTrainer.train(parts[1], parts[0])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    # Now you have a classifier which you can try on news text whose
    # category is not yet known.
    classification = newsClassifier.classify(unknownInstance)
    # the classification variable holds the possible categories sorted by
    # their probability value
    ans = dict()
    for i in range(3):
        if classification[0][1] != 0.0:
            ans[classification[i][0]] = classification[i][1] / classification[0][1]
            #print classification
    #print ans
    return ans
def determine(sentence):
    newsTrainer = Trainer(tokenizer)
    newsSet = []

    with open('data.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            newsSet.append({'fact': row['Fact'], 'decision': row['Decision']})

    for news in newsSet:
        newsTrainer.train(news['fact'], news['decision'])

    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(sentence)
    # False
    false = classification[0][1]
    false = str(false).split('.')[0]
    # True
    true = classification[1][1]
    true = str(true).split('.')[0]
    data = [true, false]
    return data
class NaiveBayesClassifier:
    def __init__(self):
        jieba.set_dictionary('dict.big.txt')
        self.articleTrainer = Trainer(tokenizer)

    def train(self):
        # Training
        articles = article.create_articles_from_file("data/HatePoliticsdata.json")
        p_train = articles[0:3001]
        p_test = articles[3001:3031]

        for a in p_train:
            doc = a.body
            #seg_list = jieba.lcut(doc, cut_all=False)
            seg_list = jieba.analyse.extract_tags(doc)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'politics')

        articles = article.create_articles_from_file("data/Gossipingdata.json")
        g_train = articles[0:3000]
        g_test = articles[3001:3301]

        for a in g_train:
            doc = a.body
            #seg_list = jieba.lcut(doc, cut_all=False)
            seg_list = jieba.analyse.extract_tags(doc)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'gossiping')

    def classify(self, article):
        #Testing
        self.articleClassifier = Classifier(self.articleTrainer.data, tokenizer)
        doc = article.body
        #seg_list = jieba.lcut(doc, cut_all=False)
        seg_list = jieba.analyse.extract_tags(doc)
        doc = " ".join(seg_list)
        classification = self.articleClassifier.classify(doc)
        return classification[0][0]
Example #19
#         'answer': 'Yes, we have a 24 Hour Customer Contact Centre where you can get support related to your banking enquiries. You can call the numbers: +263 772 244 788, +263...'},
#     {'question': 'Is there a 24 hour Customer Contact Centre?',
#         'answer': 'Yes, we have a 24 Hour Customer Contact Centre where you can get support related to your banking enquiries. You can call the numbers: +263 772 244 788, +263...'},
#     {'question': 'Is there a way I can check my account balance other than contacting the branch?',
#         'answer': 'Yes, you can check your balance through our ATM network, NMBMobile App or Internet Banking.'},
#     {'question': 'What is an e-Statement?',
#         'answer': 'An e-Statement is an electronic version of your paper bank statement which is emailed directly to your registered email address in a password protected PDF ...'},
#     {'question': 'How can I transfer money to a bank account abroad?',
#         'answer': 'This service is currently available for Corporate clients only, subject to availability of funds and the RBZ priority payments list'},
#     {'question': 'How do I get internal funds transfer forms?',
#         'answer': 'We are no longer accepting paper transfer forms, please register for Mobile Banking at your nearest branch or enrol for Internet Banking here.'},
#     {'question': 'How can I get a Point of Sale Machine for my business?',
#         'answer': 'You can submit an application letter detailing the following : Business name Bank account number Nature of business Contact person & number Number...'}
# ]
for news in newsSet:
    newsTrainer.train(news['question'], news['answer'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, token)

# print(newsClassifier.accuracy(newsSet))

# Now you have a classifier which you can try on text whose
# category is not yet known.
classification = newsClassifier.classify("f**k")


# the classification variable holds the detected categories sorted
for cl in classification[:5]:
    print(cl)
def classify(input):
    twitter = Twitter()

    f = open("data.txt", "r")

    data = json.loads(f.read())

    gradeTrainer = Trainer(tokenizer)
    loadTrainer = Trainer(tokenizer)
    lectureTrainer = Trainer(tokenizer)

    print("Training grade ...")
    for subject in data:
        if subject["grade"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    gradeTrainer.train(li, subject["grade"])

    print("Training load ...")
    for subject in data:
        if subject["load"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    loadTrainer.train(li, subject["load"])

    print("Training lecture ...")
    for subject in data:
        if subject["lecture"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    lectureTrainer.train(li, subject["lecture"])

    gradeClassifier = Classifier(gradeTrainer.data, tokenizer)
    loadClassifier = Classifier(loadTrainer.data, tokenizer)
    lectureClassifier = Classifier(lectureTrainer.data, tokenizer)

    input = u"" + input
    classify_input = []

    for element in twitter.pos(input):
        if element[1] in ('Noun', 'Verb', 'Adjective', 'Adverb',
                          'Exclamation', 'Alpha', 'KoreanParticle'):
            classify_input.append(element[0])

    text = " ".join(classify_input)

    print(text)

    gradeClassification = gradeClassifier.classify(text)
    loadClassification = loadClassifier.classify(text)
    lectureClassification = lectureClassifier.classify(text)

    print(
        "\n________________________________________GRADE________________________________________\n"
    )
    print(gradeClassification)
    print(
        "\n________________________________________LOAD_________________________________________\n"
    )
    print(loadClassification)
    print(
        "\n________________________________________LECTURE______________________________________\n"
    )
    print(lectureClassification)

    return gradeClassification, loadClassification, lectureClassification
Example #21
tosTrainer = Trainer(tokenizer)

def get_corp(read_file):
	with open(read_file,"r") as r:
		corpus = []
		for line in r:
			tabsep = line.decode('utf-8').strip().split('\t')
			a = {}
			a['text'] = tabsep[0]
			a['rating'] = tabsep[1]
			corpus.append(a)
		return corpus

# get the corpus from a training set - using copyright clauses here as an example (a subset of the csv generated by the getpointsdata.py script)
tosSet = get_corp("tosdr.org/copyrighttrainset.txt")

# You need to train the system passing each text one by one to the trainer module.
for corpi in tosSet:
    tosTrainer.train(corpi['text'], corpi['rating'])

# When you have sufficient trained data, you are almost done and can start to use a classifier.
tosClassifier = Classifier(tosTrainer.data, tokenizer)

# Now you have a classifier which you can try on policy clauses whose rating is not yet known. Example here drawn from the test set
unknownInstance = "You are free to choose your own copyright license for your content in your account settings: Public Domain Creative Commons non commercial or free licenses but also classic copyright if you wish so."
classification = tosClassifier.classify(unknownInstance)

# the classification variable holds the possible categories sorted by their probability value
print classification
Example #22
class Classifier(object):
    """Custom class to implement naive bayes classification using
    naiveBayesClassifier.

    Attributes:
        classifier (Classifier object): Object of class `Classifier` for
            classifying transactions based on existing journal.
    """

    class NotImplemented(Exception):
        pass

    def __init__(self, journal=None):
        """Classifer initialization.

        Parameters:
            journal (str): Journal file string to import.
        """
        self._tknizer = tokenizer.Tokenizer(signs_to_remove=['?!%.'])
        self._trainer = Trainer(self._tknizer)
        if journal is not None:
            journal_data = train_journal(journal)

            for group in journal_data:
                # 0: Allocation account.
                # 1: List of transactions.
                # 2: Greatest common multiple of values in transactions.
                for transaction in group[1]:
                    # 0: Transaction payee string.
                    # 1: Allocation account.
                    self._trainer.train(transaction[0], transaction[1])

            self._classifier = BayesClassifier(
                self._trainer.data,
                self._tknizer
            )
        else:
            self._classifier = None

    def update(self, text, category):
        """Update training data with new examples.

        Adds new data to the trainer then generates a new classifier. Can be
        useful for updating on the fly if performing an interactive data import.

        Parameters:
            text (str): New text to classify.
            category (str): Classification of `text`.
        """
        self._trainer.train(text, category)
        self._classifier = BayesClassifier(
            self._trainer.data,
            self._tknizer
        )

    def classify(self, text, method='bayes'):
        """Give classifcation for a text string using bayes classification.

        Parameters:
            text (str): Text to classify.
            method (str): Type of classification to use. Default to `bayes`.
        Returns:
            list: Available categories and their probabilities.
        """

        if method == 'bayes':
            if self._classifier is not None:
                return self._classifier.classify(text)
            else:
                return None

        elif method == 'rules':
            raise NotImplementedError(
                'Classification based on rules file not yet implemented'
            )
        else:
            raise NotImplemented('The method `{}` is not valid'.format(method))
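A minimal usage sketch for the wrapper above, constructed without a journal and trained interactively through update(); the payee strings and account names are illustrative:

# Usage sketch (illustrative data, not from the original module):
# c = Classifier()                                   # no journal: starts untrained
# c.update("SUPERMARKET 123", "Expenses:Groceries")  # train and rebuild the classifier
# c.update("MONTHLY RENT", "Expenses:Rent")
# print(c.classify("SUPERMARKET 456"))               # [(account, probability), ...]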
Example #23
    def get(self):
        try:
            print "  "
            print "TestClassifier start"
            print "  "
            # load the stop words from the file into a list
            with open("stop_words.txt", "r") as ins:
                array = []
                for line in ins:
                    array.append((line.rstrip('\n')).decode('unicode-escape'))
            #print array
            newsTrainer = Trainer(
                tokenizer.Tokenizer(stop_words=array,
                                    signs_to_remove=["?!#%&_"]))

            hoy = date.today()

            query = News3.query(News3.date == hoy,
                                News3.news_from.IN([
                                    "uy_press",
                                ]), News3.category == "Política")

            # You need to train the system passing each text one by one to the trainer module.
            #newsSet =[
            #    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
            #    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
            #    {'text': 'do not neglect exercise', 'category': 'health'},
            #    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
            #    {'text': 'eat to lose weight', 'category': 'health'},
            #    {'text': 'you should not eat much', 'category': 'health'}
            #]

            query2 = News3.query(News3.date == hoy,
                                 News3.news_from == "uy_press",
                                 News3.category == "deportes")

            query4 = News3.query(News3.date == hoy,
                                 News3.news_from == "uy_press",
                                 News3.category == "salud")

            #for news in newsSet:
            #    newsTrainer.train(news['text'], news['category'])
            c = 0
            #print query
            for i in query:
                print "  "
                print i.category
                newsTrainer.train(i.html, 'politica')
                #if c == 10: break
                c += 1

            #for i in query2:
            #	newsTrainer.train(i.html, 'deportes')
            #raise Exception('I know Python!')

            #for i in query4:
            #	newsTrainer.train(i.html, 'salud')

            # When you have sufficient trained data, you are almost done and can start to use
            # a classifier.

            # Now you have a classifier which you can try on news text whose
            # category is not yet known.
            query3 = News3.query(
                News3.date == hoy,
                News3.news_from.IN([
                    "el_pais",
                ]),
                News3.id.IN([0]),
            )

            ###
            newsClassifier = Classifier(
                newsTrainer.data,
                tokenizer.Tokenizer(stop_words=array,
                                    signs_to_remove=["?!#%&"]))
            #print unknownInstance
            classification = newsClassifier.classify(
                "Vidalín: No quiero que me llamen para saber qué tramite hay que hacer para poner un prostíbulo"
            )

            # the classification variable holds the detected categories sorted
            print " classification "
            print(classification)
        except:
            print traceback.format_exc()
Example #24
def get_model(trump, cruz, kasich, clinton, sanders):
    trainer = Trainer(tokenizer)

    twiSet = []
    for address in trump:
        with open(
                "/Users/Helicopter/Desktop/Logan_s/Courses/Stat_AI_ML/election/election_twitters/trump/"
                + address, "r") as text_file:
            content = ""
            for line in text_file:
                for word in line.split():
                    content = content + word + " "
            struct = {'text': content, 'category': 'trump'}
        twiSet.append(struct)
        text_file.close()

    for address in cruz:
        with open(
                "/Users/Helicopter/Desktop/Logan_s/Courses/Stat_AI_ML/election/election_twitters/cruz/"
                + address, "r") as text_file:
            content = ""
            for line in text_file:
                for word in line.split():
                    content = content + word + " "
            struct = {'text': content, 'category': 'cruz'}
        twiSet.append(struct)
        text_file.close()

    for address in kasich:
        with open(
                "/Users/Helicopter/Desktop/Logan_s/Courses/Stat_AI_ML/election/election_twitters/kasich/"
                + address, "r") as text_file:
            content = ""
            for line in text_file:
                for word in line.split():
                    content = content + word + " "
            struct = {'text': content, 'category': 'kasich'}
        twiSet.append(struct)
        text_file.close()

    for address in clinton:
        with open(
                "/Users/Helicopter/Desktop/Logan_s/Courses/Stat_AI_ML/election/election_twitters/clinton/"
                + address, "r") as text_file:
            content = ""
            for line in text_file:
                for word in line.split():
                    content = content + word + " "
            struct = {'text': content, 'category': 'clinton'}
        twiSet.append(struct)
        text_file.close()

    for address in sanders:
        with open(
                "/Users/Helicopter/Desktop/Logan_s/Courses/Stat_AI_ML/election/election_twitters/sanders/"
                + address, "r") as text_file:
            content = ""
            for line in text_file:
                for word in line.split():
                    content = content + word + " "
            struct = {'text': content, 'category': 'sanders'}
        twiSet.append(struct)
        text_file.close()

    for twi in twiSet:
        trainer.train(twi['text'], twi['category'])

    newclassifier = Classifier(trainer.data, tokenizer)

    return newclassifier
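The five per-candidate loops above differ only in the directory name and the category label. A minimal consolidated sketch under the same directory layout; get_model_compact and BASE_DIR are hypothetical names:

from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

BASE_DIR = "/Users/Helicopter/Desktop/Logan_s/Courses/Stat_AI_ML/election/election_twitters/"

def get_model_compact(**candidates):
    # candidates maps a category name to its list of file names,
    # e.g. get_model_compact(trump=trump, cruz=cruz, kasich=kasich,
    #                        clinton=clinton, sanders=sanders)
    trainer = Trainer(tokenizer)
    for category, addresses in candidates.items():
        for address in addresses:
            with open(BASE_DIR + category + "/" + address, "r") as text_file:
                content = " ".join(word for line in text_file
                                   for word in line.split())
            trainer.train(content, category)
    return Classifier(trainer.data, tokenizer)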
Example #25
def article_keywords(article):
    keys = Keywords.objects.get(article=article)
    print keys
    l = [k.keyword for k in keys.keywords.all()]
    print " ".join(l)
    keyset = {'keyword': " ".join(l)}
    return keyset


if __name__ == '__main__':
    print "Starting testing of Bayes Classifer"
    labeled_articles = [
        (a, a.relevant)
        for a in Article.objects.all()[:(len(Article.objects.all()))]
    ]
    print labeled_articles
    featuresets = []
    for (article, relevant) in labeled_articles:
        r = article_keywords(article)
        featuresets.append((r, relevant))
    print featuresets
    train_set, test_set = featuresets[:(len(featuresets))], featuresets[(
        len(featuresets) - 2):]
    print train_set
    newsTrainer = Trainer(tokenizer)
    for f in train_set:
        newsTrainer.train(f[0]['keyword'], f[1])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    url = raw_input("Enter the url: ")
    testurl(url, newsClassifier)
Example #26
    ['Cabbage Loopers', 'holes on leaves'],
    [
        'Cutworms',
        'fat caterpillars, basically gray, brown, or black with 41 to 51 mm long when fully grown'
    ], ['Cutworms', 'damaged stem'],
    [
        'Bacterial Leaf Spot',
        'small water-soaked spots on older leaves then quickly turn black'
    ], ['Bacterial Leaf Spot', 'holes on leaves'],
    ['Lettuce Drop', 'older leaves wilt'],
    ['Lettuce Drop', 'older leaves collapse'],
    ['Lettuce Drop',
     'brown crown tissue'], ['Lettuce Drop', 'holes on leaves'],
    ['Anthracnose', 'water-soaked spots that turn yellow'],
    [
        'Anthracnose',
        'white to pink spore masses of the fungus in the center of the lesions'
    ], ['Anthracnose', 'damaged leaf becomes papery'],
    ['Anthracnose', 'holes on leaves'], ['Tipburn', 'browning of leaf margins'],
    ['Tipburn', 'brown veins']
]
disease_classifier = Trainer(tokenizer)
for data in dataset:
    disease_classifier.train(data[1], data[0])
disease_classifier = Classifier(disease_classifier.data, tokenizer)
classifications = disease_classifier.classify(sys.argv[1])
classifications_list = []
for classification in classifications:
    classifications_list.append(classification[0])
print json.dumps({'classifications': classifications_list})
documentTrainer = Trainer(tokenizer)

documentSet = []


def getTextBasedOnDocumentID(documentID):
    ID = int(documentID.split('_')[1])
    line = linecache.getline('../2.document_set/document_set.csv', ID + 2)
    text = line.split(',"')[1]
    return text


for i in range(0, len(traincsv)):
    documentSet.append({
        'text': getTextBasedOnDocumentID(traincsv[i][0]),
        'category': traincsv[i][1]
    })

for documents in documentSet:
    documentTrainer.train(documents['text'], documents['category'])

newsClassifier = Classifier(documentTrainer.data, tokenizer)

for i in range(0, len(testcsv)):
    data = getTextBasedOnDocumentID(testcsv[i][0])
    classification = newsClassifier.classify(data)
    testcsv[i][1] = int(classification[0][0])
df = pd.DataFrame(testcsv)
df.to_csv("../5.evaluation_file/predicted_cat.csv", index=False)
#np.savetxt("./5.evaluation_file/predicted_cat.csv", testcsv,header="document_id,category" ,delimiter=",")
You want to train a system with these pre-categorized/pre-classified
texts, so you had better call this data your training set.
"""
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

newsTrainer = Trainer(tokenizer)

# You need to train the system passing each text one by one to the trainer module.
newsSet =[
    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
    {'text': 'do not neglect exercise', 'category': 'health'},
    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
    {'text': 'eat to lose weight', 'category': 'health'},
    {'text': 'you should not eat much', 'category': 'health'}
]
for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer)

# Now you have a classifier which you can try on news text whose
# category is not yet known.
classification = newsClassifier.classify("Obama is")

# the classification variable holds the detected categories sorted
print(classification)
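Since classification is sorted by probability, the predicted label is simply its first entry; a small follow-on sketch:

# The first element is the most probable category.
best_category, best_probability = classification[0]
print("predicted: {} ({:.6f})".format(best_category, best_probability))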
Example #29
from naiveBayesClassifier.classifier import Classifier

dir_path = os.path.dirname(os.path.realpath(__file__))
diseaseclassifier = Trainer(tokenizer)

disease_symptom_dict = dict()
with open(dir_path+"/priors.csv", 'r') as p:
    for prior in p:
        className, count = prior.split(',')
        count = int(count)
        diseaseclassifier.setPriors(className, count)

with open(dir_path+"/Dataset.csv", "r+", encoding="utf-8") as file:
    for i in file:
        lines = i.split(",")
        diseaseclassifier.train(lines[1].strip('\n'),  lines[0])
        if lines[0] in disease_symptom_dict:
            disease_symptom_dict[lines[0]].append(lines[1].strip('\n'))
        else:
            disease_symptom_dict[lines[0]] = [lines[1].strip('\n')]


diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
initial_submitted_symptoms = sys.argv[1].split(',')


def formatted_print(symptoms, disease_predictions):
    print('\n'+'*'*20)
    print('Symptoms:')
    print(','.join(symptoms))
    print('\n'+'*'*20)
    {'symptoms': 'unresponsiveness', 'disease': 'dementia'},
    {'symptoms': 'lethargy', 'disease': 'dementia'},
    {'symptoms': 'agitation', 'disease': 'dementia'},
    {'symptoms': 'ecchymosis', 'disease': 'dementia'},
    {'symptoms': 'syncope', 'disease': 'dementia'},
    {'symptoms': 'rale', 'disease': 'dementia'},
    {'symptoms': 'unconscious state', 'disease': 'dementia'},
    {'symptoms': 'cough', 'disease': 'dementia'},
    {'symptoms': 'bedridden', 'disease': 'dementia'},
    {'symptoms': 'unsteady gait', 'disease': 'dementia'},
    

    ]



for news in newsSet:
    newsTrainer.train(news['symptoms'], news['disease'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))

# Now you have a classifier which you can try on text whose
# category is not yet known.
unknownInstance = "pain fever coughing"
classification = newsClassifier.classify(unknownInstance)

# the classification variable holds the possible categories sorted by 
# their probability value
print(classification)
def article_keywords(article):
    keys=Keywords.objects.get(article=article)
    print keys
    l=[k.keyword for k in keys.keywords.all()]
    print " ".join(l)
    keyset={'keyword':" ".join(l)}
    return keyset

if __name__ == '__main__':
    print "Starting testing of Bayes Classifer"
    labeled_articles = [(a, a.relevant) for a in Article.objects.all()[:(len(Article.objects.all()))]]
    print labeled_articles
    featuresets=[]
    for (article, relevant) in labeled_articles:
        r = article_keywords(article)
        featuresets.append((r, relevant))
    print featuresets
    train_set, test_set = featuresets[:(len(featuresets))], featuresets[(len(featuresets)-2):]
    print train_set
    newsTrainer = Trainer(tokenizer)
    for f in train_set:
        newsTrainer.train(f[0]['keyword'],f[1])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    url=raw_input("Enter the url: ")
    testurl(url,newsClassifier)




import json, os, sys, re
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

diseaseclassifier = Trainer(tokenizer)
with open("Dataset.csv", "r") as file:
    for i in file:  #FOR EACH LINE
        lines = i.split(",")
        diseaseclassifier.train(lines[1], lines[0])
diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
txt = input("enter symptomA symptomB symptomC")
classification = diseaseclassifier.classify(txt)
print(classification[0])
documents = []
classes = []

names = ['AI', 'CL', 'DS', 'GR', 'CR', 'PF']
for n in names:
    filename = "ncs-" + n + ".csv"
    docsf = csv.reader(open(filename, "rb"))
    dataset = list(docsf)
    for i in dataset:
        docSet.append({'text': i[1], 'category': i[2]})

# You need to train the system passing each text one by one to the trainer module.

for doc in docSet:
    newsTrainer.train(doc['text'], doc['category'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer)

# Now you have a classifier which you can try on text whose
# category is not yet known.
#unknownInstance = "Even if I eat too much, is not it possible to lose some weight"
unknownInstance = "This paper describes a system, called PLP, for compiling ordered logic programs into standard logic programs under the answer set semantics. In an ordered logic program, rules are named by unique terms, and preferences among rules are given by a set of dedicated atoms. An ordered logic program is transformed into a second, regular, extended logic program wherein the preferences are respected, in that the answer sets obtained in the transformed theory correspond with the preferred answer sets of the original theory. Since the result of the translation is an extended logic program, existing logic programming systems can be used as underlying reasoning engine. In particular, PLP is conceived as a front-end to the logic programming systems dlv and smodels."

classification = newsClassifier.classify(unknownInstance)
# the classification variable holds the possible categories sorted by
# their probability value
print classification
Example #34
for no in range(12000):

    if data[no][1] == 'negative':
        cls = data[no][3]
    else:
        cls = data[no][1]

    twi_cont = str_pre_process(data[no][10])

    struct = {'text': twi_cont, 'category': cls}
    #print twi_cont, cls
    train_twi.append(struct)

for twi in train_twi:
    trainer.train(twi['text'], twi['category'])

model = Classifier(trainer.data, tokenizer)

print "Testing..."

for no in range(12000, num_twi):
    twi_cont = str_pre_process(data[no][10])
    classification = model.classify(twi_cont)
    #print classification,
    test_twi.append(classification)

    if data[no][1] == 'negative':
        cls = data[no][3]
    else:
        cls = data[no][1]
    'text':
    'the person you are calling is  busy call again later aap jis vyakti se sampark karna chahte hain wo abhi vyast hai kripya thodi der baad call karen',
    'category': 'Busy'
}, {
    'text':
    'the airtel subscriber you have called is speaking to someone else you can wait or call again later aap jis airtel subscriber ko call kiya hai woh abhi dusri call pe vyast hai kripya pratiksha karein ya kuch',
    'category': 'Waiting'
}, {
    'text':
    'this call cannot be completed at this moment please try again later this call cannot be completed at this moment please try again later',
    'category': 'Cannot be Completed'
}, {
    'text':
    'the number you have dialled could not to count check number dial kiya gaya number maujood nahi hai kripya number check kar raha hoon',
    'category': 'Invalid'
}]
for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer)

# Now you have a classifier which can give a try to classifiy text of news whose
# category is unknown, yet.
classification = newsClassifier.classify(
    "please check the number you have dial dial kiya hua number kripya jaanch")

# the classification variable holds the detected categories sorted
print(classification)
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

# Training
articleTrainer = Trainer(tokenizer)

articles = article.create_articles_from_file("data/HatePoliticsdata.json")
p_train = articles[0:3001]
p_test = articles[3001:3031]

for a in p_train:
    doc = a.body
    #seg_list = jieba.lcut(doc, cut_all=False)
    seg_list = jieba.analyse.extract_tags(doc)
    doc = " ".join(seg_list)
    articleTrainer.train(doc, 'politics')

articles = article.create_articles_from_file("data/Gossipingdata.json")
g_train = articles[0:3000]
g_test = articles[3001:3301]

for a in g_train:
    doc = a.body
    #seg_list = jieba.lcut(doc, cut_all=False)
    seg_list = jieba.analyse.extract_tags(doc)
    doc = " ".join(seg_list)
    articleTrainer.train(doc, 'gossiping')

#Testing
articleClassifier = Classifier(articleTrainer.data, tokenizer)
p_gossiping = 0
import json, os, sys, re
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier
##IMPORTS
'''
Usage:

    python GuessDisease.py "symptomA symptomB symptomC"
Example INPUT:
    python GuessDisease.py "agitation exhaustion vomit"
Example OUTPUT:

    {
    "disease": "influenza"
    }


'''

##SETTING UP
diseaseclassifier = Trainer(tokenizer)  #STARTS CLASSIFIER
with open("Dataset.csv", "r") as file:  #OPENS DATASET
    for i in file:  #FOR EACH LINE
        lines = i.split(",")  #PARSE CSV <DISEASE> <SYMPTOM>
        diseaseclassifier.train(lines[1], lines[0])  #TRAINING
diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
classification = diseaseclassifier.classify(sys.argv[1])  #CLASSIFY INPUT
print classification[0]  #PRINT CLASSIFICATION
Example #38
from naiveBayesClassifier.classifier import Classifier

sentimentTrainer = Trainer(tokenizer)

# Get the training dataset.
with open('training.csv', 'r') as f:
    data = f.read()
trainset = data.splitlines()

for line in trainset:
    pos1 = line.find(',"')
    pos2 = line.find('",', pos1)
    if pos1 == -1:
        pos1 = line.find(',')
        pos2 = line.find(',', pos1 + 1)
        comment = line[pos1 + 1:pos2]
        sentiment = line[pos2 + 1:]
    else:
        comment = line[pos1 + 2:pos2 - 2]
        sentiment = line[pos2 + 2:]
    sentimentTrainer.train(comment, sentiment)

# Use the classifier.
sentimentClassifier = Classifier(sentimentTrainer.data, tokenizer)

# Classify an unknown review.
unknownInstance = "I don't like the app. It crashes everytime."
classification = sentimentClassifier.classify(unknownInstance)

print classification
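As in the other examples, classification is sorted by probability, so the predicted sentiment is its first entry; a short follow-on in the same Python 2 style as the snippet above:

# The top-ranked (sentiment, probability) pair is the prediction.
predicted_sentiment = classification[0][0]
print "Predicted sentiment: " + predicted_sentiment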