def classify(filename, size, url, result):
    trainingSet = make_training_set.create_set(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    print "Expected: " + result
    print classifier.classify(url)
def __call__(self, text):
    context = self.context
    request = self.request
    response = request.response
    catalog = context.portal_catalog
    bayesFilter = api.portal.get_registry_record(
        'i8d.content.browser.coverSetting.ICoverSetting.bayesFilter')
    trainingSet = []
    for line in bayesFilter.split('\n'):
        trainingSet.append({
            'category': 'hasKey',
            'text': safe_unicode(line)
        })
    trainer = Trainer(tokenizer)
    for record in trainingSet:
        trainer.train(record['text'], record['category'])
    classifier = Classifier(trainer.data, tokenizer)
    result = classifier.classify(safe_unicode(text))
    # import pdb; pdb.set_trace()
    return result
def classifyNonClusteredJira(self):
    columnName = 'C'
    for index, row in self.df.iterrows():
        clusterName = row['Labels']
        keyWords = row['KeyWords']
        if clusterName in constantsObj.INITIAL_CLUSTERS:
            self.issueSet.append({"class": row['Labels'], "sentence": keyWords})
    for issue in self.issueSet:
        self.jiraTrainer.train(issue['sentence'], issue['class'])
    jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)
    for index, row in self.df.iterrows():
        clusterName = row['Labels']
        keyWords = row['KeyWords']
        if clusterName not in constantsObj.INITIAL_CLUSTERS:
            # classify() returns (category, probability) pairs; take the top category.
            identifiedCluster = jiraClassifier.classify(row['KeyWords'])[0][0]
            self.issueSet.append({"class": identifiedCluster, "sentence": keyWords})
            self.nonClusteredJirasAfterClusteringFile.write(
                "%s --- %s\n" % (keyWords, identifiedCluster))
            # writeIndex = columnName + str(index - 2)
            # self.activeWorkSheet[writeIndex] = identifiedCluster
    self.nonClusteredJirasAfterClusteringFile.close()
    return self.issueSet
def getKeywords(self, html):
    text = self.getHtml2Text(html)
    # print text
    text = self.zhsJieba(text)
    # Get the registry record
    reg = api.portal.get_registry_record(
        'mingjing.content.browser.mjnetSetting.IMJNetSetting.catDict')
    trainSet = []
    for item in reg:
        key = item.split('|||')[0]
        for line in reg[item].split('\n'):
            zhsString = self.zhsJieba(line)
            trainSet.append({'category': key, 'text': zhsString})
    # Classify the article with naive Bayes
    newsTrainer = Trainer(tokenizer)
    for news in trainSet:
        newsTrainer.train(news['text'].encode('utf-8'), news['category'])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(text)
    print classification
    # import pdb; pdb.set_trace()
    if classification[0][1] == 0.0:
        classification.insert(0, (u'n99', 0.0))
    result = []
    for item in classification:
        result.append(item[0])
    return result
def classifyNewJiraToOneOfTheClusters(self, inputTrainingData, inputJira):
    for item in inputTrainingData:
        self.jiraTrainer.train(item['sentence'], item['class'])
    jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)
    clusterForInputJira = jiraClassifier.classify(inputJira)
    return clusterForInputJira
class DomainModel:
    training_data = []
    newsTrainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
    newsClassifier = None

    def __init__(self):
        self.train()

    # TODO: Train on FB data too
    def train(self):
        with open('src/URL.csv', 'r') as csv_file:
            reader = csv_file.readlines()
            for line in reader:
                read_dict = {}
                line_split = line.split(',')
                if len(line_split) < 2 or len(line_split[0]) == 0:
                    continue
                read_dict['text'] = line_split[0].strip()
                read_dict['class'] = line_split[1].strip()
                self.training_data.append(read_dict)
        # print training_data
        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])
        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
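# A hedged usage sketch for the DomainModel class above. It assumes src/URL.csv
# exists in the layout the class expects; the URL passed to classify() is purely
# illustrative and not from the original project.
model = DomainModel()  # trains on src/URL.csv during construction
print(model.classify("http://example.com/login.php"))  # (class, probability) pairs, sorted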
def classify(filename, size):
    trainingSet, testingSet = make_chronological_sets.create_sets(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0
    for sample in testingSet:
        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']
        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    prop_caught = float(mal_mal) / float(mal_mal + clean_mal)
    prop_missed = float(clean_mal) / float(mal_mal + clean_mal)

    ## Stuff to get proportions:
    # size = float(size)
    # mal_mal = float(mal_mal)/size
    # mal_clean = float(mal_clean)/size
    # clean_mal = float(clean_mal)/size
    # clean_clean = float(clean_clean)/size

    ## Confusion matrix stuff:
    # confusionMatrix = [['Actually malicious', mal_mal, clean_mal],
    #                    ['Actually clean', mal_clean, clean_clean]]
    # print tabulate(confusionMatrix, headers=['', 'Predicted malicious', 'Predicted clean'])

    print "Total: " + str(mal_mal + mal_clean + clean_mal + clean_clean)
    print "Malware: " + str(mal_mal + clean_mal)
    print "Clean: " + str(mal_clean + clean_clean)
    print "Caught: " + str(mal_mal) + " (" + "{:.1%}".format(prop_caught) + ")"
    print "Missed: " + str(clean_mal) + " (" + "{:.1%}".format(prop_missed) + ")"
class NaiveBayesClassifier:
    def __init__(self):
        jieba.set_dictionary('dict.big.txt')
        self.articleTrainer = Trainer(tokenizer)

    def train(self):
        # Training
        articles = article.create_articles_from_file("data/HatePoliticsdata.json")
        p_train = articles[0:3001]
        p_test = articles[3001:3031]
        for a in p_train:
            doc = a.body
            # seg_list = jieba.lcut(doc, cut_all=False)
            seg_list = jieba.analyse.extract_tags(doc)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'politics')

        articles = article.create_articles_from_file("data/Gossipingdata.json")
        g_train = articles[0:3000]
        g_test = articles[3001:3301]
        for a in g_train:
            doc = a.body
            # seg_list = jieba.lcut(doc, cut_all=False)
            seg_list = jieba.analyse.extract_tags(doc)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'gossiping')

        f = open('data/docCountOfClasses.json', 'w', -1, 'utf-8')
        f.write(json.dumps(self.articleTrainer.data.docCountOfClasses))
        f.close()
        f = open('data/frequencies.json', 'w', -1, 'utf-8')
        f.write(json.dumps(self.articleTrainer.data.frequencies))
        f.close()

    def classify(self, article):
        self.data = TrainedData()
        f = open('data/docCountOfClasses.json', 'r', -1, 'utf-8')
        self.data.docCountOfClasses = json.load(f)
        f.close()
        f = open('data/frequencies.json', 'r', -1, 'utf-8')
        self.data.frequencies = json.load(f)
        f.close()

        # Testing
        self.articleClassifier = Classifier(self.data, tokenizer)
        doc = article.body
        # seg_list = jieba.lcut(doc, cut_all=False)
        seg_list = jieba.analyse.extract_tags(doc)
        doc = " ".join(seg_list)
        classification = self.articleClassifier.classify(doc)
        return classification[0][0]
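# A minimal usage sketch for the wrapper above, assuming the data files and the
# `article` helper module referenced in the class are available; the index [0]
# below is only illustrative.
nb = NaiveBayesClassifier()
nb.train()  # writes data/docCountOfClasses.json and data/frequencies.json

# Classify a single article object; the wrapper returns the top category label.
sample_article = article.create_articles_from_file("data/HatePoliticsdata.json")[0]
print(nb.classify(sample_article))  # e.g. 'politics' or 'gossiping'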
def classify(filename, size):
    trainingSet, testingSet = make_balanced_sets.create_sets(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0
    for sample in testingSet:
        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']
        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    size = float(size)
    mal_mal = float(mal_mal) / size
    mal_clean = float(mal_clean) / size
    clean_mal = float(clean_mal) / size
    clean_clean = float(clean_clean) / size

    confusionMatrix = [[mal_mal, clean_mal], [mal_clean, clean_clean]]
    pprint(confusionMatrix)
    print "Accuracy: " + str(mal_mal + clean_clean)
    print "False negatives (predicted clean when malicious): " + str(clean_mal)
    print "False positives (predicted malicious when clean): " + str(mal_clean)
def neyronka(self, _str):
    newsTrainer = Trainer(tokenizer)
    # Build the training set directly instead of eval()-ing a constructed string.
    newsSet = []
    with open('o', 'rt', encoding='utf8') as csvfile:
        for i in csvfile.readlines():
            if i == '\n':
                continue
            theme, text = i.split('***')
            newsSet.append({'text': text.strip(), 'category': str(theme)})
    for news in newsSet:
        newsTrainer.train(news['text'], news['category'])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    unknownInstance = _str
    classification = newsClassifier.classify(unknownInstance)
    return sorted(classification, key=lambda x: -x[1])
def tweet_classification(unknownInstance):
    newsTrainer = Trainer(tokenizer)
    with open("train.txt") as f:
        for line in f:
            # Each line is "<category> <text>"; avoid shadowing the builtin str.
            parts = line.split(' ', 1)
            newsTrainer.train(parts[1], parts[0])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    # Now you have a classifier which can give a try to classify text of news whose
    # category is unknown, yet.
    classification = newsClassifier.classify(unknownInstance)
    # the classification variable holds the possible categories sorted by
    # their probability value
    ans = dict()
    for i in range(3):
        if classification[0][1] != 0.0:
            ans[classification[i][0]] = classification[i][1] / classification[0][1]
    # print classification
    # print ans
    return ans
def determine(sentence):
    newsTrainer = Trainer(tokenizer)
    newsSet = []
    with open('data.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            newsSet.append({'fact': row['Fact'], 'decision': row['Decision']})
    for news in newsSet:
        newsTrainer.train(news['fact'], news['decision'])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(sentence)
    # False
    false = classification[0][1]
    false = str(false).split('.')[0]
    # True
    true = classification[1][1]
    true = str(true).split('.')[0]
    data = [true, false]
    return data
class DomainModel:
    data_interface = []
    newsTrainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
    newsClassifier = None

    def __init__(self, data_interface):
        """Constructor: store the data interface on creation.

        Don't train yet; let the parent decide when.
        """
        if not isinstance(data_interface, Data):
            raise ValueError(
                "Data is not properly interfaced through class Data")
        self.data_interface = data_interface

    def train(self):
        """Train on base and FB data."""
        # Run through each training example in the data interface and
        # feed it into the model.
        for data_point in self.data_interface.arr:
            data_class = data_point[2].strip()  # Class is "Credibility"
            data_text = data_point[4].strip()   # Text is "Content URL"
            self.newsTrainer.train(data_text, data_class)
        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
def classify(input):
    twitter = Twitter()
    with open("data.txt", "r") as f:
        data = json.loads(f.read())

    gradeTrainer = Trainer(tokenizer)
    loadTrainer = Trainer(tokenizer)
    lectureTrainer = Trainer(tokenizer)

    print("Training grade ...")
    for subject in data:
        if subject["grade"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    gradeTrainer.train(li, subject["grade"])

    print("Training load ...")
    for subject in data:
        if subject["load"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    loadTrainer.train(li, subject["load"])

    print("Training lecture ...")
    for subject in data:
        if subject["lecture"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    lectureTrainer.train(li, subject["lecture"])

    gradeClassifier = Classifier(gradeTrainer.data, tokenizer)
    loadClassifier = Classifier(loadTrainer.data, tokenizer)
    lectureClassifier = Classifier(lectureTrainer.data, tokenizer)

    input = u"" + input
    # Keep only content-bearing parts of speech from the Twitter (KoNLPy) tagger.
    classify_input = []
    for element in twitter.pos(input):
        if element[1] in ('Noun', 'Verb', 'Adjective', 'Adverb',
                          'Exclamation', 'Alpha', 'KoreanParticle'):
            classify_input.append(element[0])
    text = " ".join(classify_input)
    print(text)

    gradeClassification = gradeClassifier.classify(text)
    loadClassification = loadClassifier.classify(text)
    lectureClassification = lectureClassifier.classify(text)

    print("\n________________________________________GRADE________________________________________\n")
    print(gradeClassification)
    print("\n________________________________________LOAD_________________________________________\n")
    print(loadClassification)
    print("\n________________________________________LECTURE______________________________________\n")
    print(lectureClassification)

    return gradeClassification, loadClassification, lectureClassification
def get(self):
    try:
        print " "
        print "TestClassifier start"
        print " "
        # Load the stop words from the file into a list.
        with open("stop_words.txt", "r") as ins:
            array = []
            for line in ins:
                array.append((line.rstrip('\n')).decode('unicode-escape'))
        # print array

        newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=array, signs_to_remove=["?!#%&_"]))

        hoy = date.today()
        query = News3.query(News3.date == hoy,
                            News3.news_from.IN(["uy_press", ]),
                            News3.category == "Política")

        # You need to train the system passing each text one by one to the trainer module.
        # newsSet = [
        #     {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
        #     {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
        #     {'text': 'do not neglect exercise', 'category': 'health'},
        #     {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
        #     {'text': 'eat to lose weight', 'category': 'health'},
        #     {'text': 'you should not eat much', 'category': 'health'}
        # ]
        query2 = News3.query(News3.date == hoy, News3.news_from == "uy_press",
                             News3.category == "deportes")
        query4 = News3.query(News3.date == hoy, News3.news_from == "uy_press",
                             News3.category == "salud")

        # for news in newsSet:
        #     newsTrainer.train(news['text'], news['category'])

        c = 0
        # print query
        for i in query:
            print " "
            print i.category
            newsTrainer.train(i.html, 'politica')
            # if c == 10: break
            c += 1

        # for i in query2:
        #     newsTrainer.train(i.html, 'deportes')
        # raise Exception('I know Python!')
        # for i in query4:
        #     newsTrainer.train(i.html, 'salud')

        # When you have sufficient trained data, you are almost done and can start to use
        # a classifier.
        # Now you have a classifier which can give a try to classify text of news whose
        # category is unknown, yet.
        query3 = News3.query(
            News3.date == hoy,
            News3.news_from.IN(["el_pais", ]),
            News3.id.IN([0]),
        )
        newsClassifier = Classifier(
            newsTrainer.data,
            tokenizer.Tokenizer(stop_words=array, signs_to_remove=["?!#%&"]))
        # print unknownInstance
        classification = newsClassifier.classify(
            "Vidalín: No quiero que me llamen para saber qué tramite hay que hacer para poner un prostíbulo")
        # the classification variable holds the detected categories sorted
        print " classification "
        print(classification)
    except:
        print traceback.format_exc()
doc = " ".join(seg_list) articleTrainer.train(doc, 'gossiping') #Testing articleClassifier = Classifier(articleTrainer.data, tokenizer) p_gossiping = 0 p_politics = 0 g_gossiping = 0 g_politics = 0 for a in p_test: doc = a.body #seg_list = jieba.lcut(doc, cut_all=False) seg_list = jieba.analyse.extract_tags(doc) doc = " ".join(seg_list) classification = articleClassifier.classify(doc) if classification[0][0] == 'gossiping': p_gossiping += 1 else: p_politics += 1 for a in g_test: doc = a.body #seg_list = jieba.lcut(doc, cut_all=False) seg_list = jieba.analyse.extract_tags(doc) doc = " ".join(seg_list) classification = articleClassifier.classify(doc) if classification[0][0] == 'gossiping': g_gossiping += 1 else: g_politics += 1
    'text': 'the person you are calling is busy call again later aap jis vyakti se sampark karna chahte hain wo abhi vyast hai kripya thodi der baad call karen',
    'category': 'Busy'
}, {
    'text': 'the airtel subscriber you have called is speaking to someone else you can wait or call again later aap jis airtel subscriber ko call kiya hai woh abhi dusri call pe vyast hai kripya pratiksha karein ya kuch',
    'category': 'Waiting'
}, {
    'text': 'this call cannot be completed at this moment please try again later this call cannot be completed at this moment please try again later',
    'category': 'Cannot be Completed'
}, {
    'text': 'the number you have dialled could not to count check number dial kiya gaya number maujood nahi hai kripya number check kar raha hoon',
    'category': 'Invalid'
}]

for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer)

# Now you have a classifier which can give a try to classify text of news whose
# category is unknown, yet.
classification = newsClassifier.classify(
    "please check the number you have dial dial kiya hua number kripya jaanch")

# the classification variable holds the detected categories sorted
print(classification)
#     {'question': 'Is there a 24 hour Customer Contact Centre?',
#      'answer': 'Yes, we have a 24 Hour Customer Contact Centre where you can get support related to your banking enquiries. You can call the numbers: +263 772 244 788, +263...'},
#     {'question': 'Is there a way I can check my account balance other than contacting the branch?',
#      'answer': 'Yes, you can check your balance through our ATM network, NMBMobile App or Internet Banking.'},
#     {'question': 'What is an e-Statement?',
#      'answer': 'An e-Statement is an electronic version of your paper bank statement which is emailed directly to your registered email address in a password protected PDF ...'},
#     {'question': 'How can I transfer money to a bank account abroad?',
#      'answer': 'This service is currently available for Corporate clients only, subject to availability of funds and the RBZ priority payments list'},
#     {'question': 'How do I get internal funds transfer forms?',
#      'answer': 'We are no longer accepting paper transfer forms, please register for Mobile Banking at your nearest branch or enrol for Internet Banking here.'},
#     {'question': 'How can I get a Point of Sale Machine for my business?',
#      'answer': 'You can submit an application letter detailing the following : Business name Bank account number Nature of business Contact person & number Number...'}
# ]

for news in newsSet:
    newsTrainer.train(news['question'], news['answer'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, token)
# print(newsClassifier.accuracy(newsSet))

# Now you have a classifier which can give a try to classify text of news whose
# category is unknown, yet.
classification = newsClassifier.classify("f**k")

# the classification variable holds the detected categories sorted
for cl in classification[:5]:
    print(cl)
def classify(filename, size):
    trainingSet = make_training_set.create_set(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    trainer.train(trainingSet[0]['url'], trainingSet[0]['result'])
    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    out = open("mislabeled.txt", "w")
    for sample in trainingSet[1:]:
        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']
        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            out.write(sample['url'] + '\n')
            clean_mal += 1
        trainer.train(sample['url'], sample['result'])
        classifier = Classifier(
            trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    total = float(mal_mal + mal_clean + clean_mal + clean_clean)
    prop_caught = float(mal_mal + clean_clean) / total
    prop_missed = float(clean_mal + mal_clean) / total
    # Fraction of actually malicious samples that were predicted clean.
    prop_mal_missed = float(clean_mal) / float(mal_mal + clean_mal)

    ## Stuff to get proportions:
    # size = float(size)
    # mal_mal = float(mal_mal)/size
    # mal_clean = float(mal_clean)/size
    # clean_mal = float(clean_mal)/size
    # clean_clean = float(clean_clean)/size

    ## Confusion matrix stuff:
    # confusionMatrix = [['Actually malicious', mal_mal, clean_mal],
    #                    ['Actually clean', mal_clean, clean_clean]]
    # print tabulate(confusionMatrix, headers=['', 'Predicted malicious', 'Predicted clean'])

    print "Total: " + str(int(total))
    print "Malware: " + str(mal_mal + clean_mal)
    print "Clean: " + str(mal_clean + clean_clean)
    print "Caught: " + str(mal_mal + clean_clean) + " (" + "{:.1%}".format(prop_caught) + " of all samples)"
    print "Missed: " + str(clean_mal + mal_clean) + " (" + "{:.1%}".format(prop_missed) + " of all samples)"
    print "Malicious missed: " + str(clean_mal) + " (" + "{:.1%}".format(prop_mal_missed) + " of all malicious samples)"
}, {
    'text': 'Hate',
    'category': 'negative'
}]

countPos = 0
countNeg = 0
for string, klass in stringList:
    if klass == '1':
        countPos += 1
        newsSet.append({'text': string, 'category': 'positive'})
    if klass == '0':
        countNeg += 1
        newsSet.append({'text': string, 'category': 'negative'})

# Train Model
for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# Build the classifier from the trained data
newsClassifier = Classifier(newsTrainer.data, tokenizer)

# Classify Unknown Data
str1 = "amazing good stuff"
classification = newsClassifier.classify(str1)
print(str1, classification)

str2 = "hate this shit"
classification = newsClassifier.classify(str2)
print(str2, classification)
    'text': 'not to eat too much is not enough to lose weight',
    'category': 'health'
}, {
    'text': 'Russia try to invade Ukraine',
    'category': 'politics'
}, {
    'text': 'do not neglect exercise',
    'category': 'health'
}, {
    'text': 'Syria is the main issue, Obama says',
    'category': 'politics'
}, {
    'text': 'eat to lose weight',
    'category': 'health'
}, {
    'text': 'you should not eat much',
    'category': 'health'
}]

for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer)

# Now you have a classifier which can give a try to classify text of news whose
# category is unknown, yet.
classification = newsClassifier.classify("Obama is")

# the classification variable holds the detected categories sorted
print(classification)
def api_echo():
    if request.method == 'POST':
        # Create the stemmer.
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        factory = StopWordRemoverFactory()
        more_stopword = []
        # Add extra stop words from file.
        with open('dataset/stopword.csv') as csvfile:
            readCSV = csv.reader(csvfile, delimiter=',')
            for row in readCSV:
                more_stopword.append(row[0])
        dictionary = ArrayDictionary(more_stopword)
        # Renamed to avoid shadowing the builtin str.
        stopword_remover = StopWordRemover(dictionary)

        newsTrainer = Trainer(tokenizer)

        kesehatan = []
        konsultasi = []
        marketing = []
        with open("dataset/kesehatan.txt", "r") as ins:
            for line in ins:
                kesehatan.append({'text': line.rstrip(), 'category': 'kesehatan'})
        with open("dataset/konsultasi.txt", "r") as ins:
            for line in ins:
                konsultasi.append({'text': line.rstrip(), 'category': 'konsultasi'})
        with open("dataset/marketing.txt", "r") as ins:
            for line in ins:
                marketing.append({'text': line.rstrip(), 'category': 'marketing'})

        # You need to train the system passing each text one by one to the trainer module.
        newsSet = kesehatan + konsultasi + marketing
        for news in newsSet:
            newsTrainer.train(news['text'], news['category'])

        # When you have sufficient trained data, you are almost done and can start to use
        # a classifier.
        newsClassifier = Classifier(newsTrainer.data, tokenizer)

        query = request.form['query'].encode("utf8")
        # query = "Apa saja level bonus yang didapat bagi seorang agen?"

        # Stemming and stop-word removal on the query.
        out = stemmer.stem(query)
        out = stopword_remover.remove(out)

        classification = newsClassifier.classify(out)

        # the classification variable holds the detected categories sorted
        # return classification[0][0]
        return jsonify(classification)
tosTrainer = Trainer(tokenizer)

def get_corp(read_file):
    with open(read_file, "r") as r:
        corpus = []
        for line in r:
            tabsep = line.decode('utf-8').strip().split('\t')
            a = {}
            a['text'] = tabsep[0]
            a['rating'] = tabsep[1]
            corpus.append(a)
    return corpus

# Get the corpus from a training set - using copyright clauses here as an example
# (a subset of the csv generated by the getpointsdata.py script).
tosSet = get_corp("tosdr.org/copyrighttrainset.txt")

# You need to train the system passing each text one by one to the trainer module.
for corpi in tosSet:
    tosTrainer.train(corpi['text'], corpi['rating'])

# When you have sufficient trained data, you are almost done and can start to use a classifier.
tosClassifier = Classifier(tosTrainer.data, tokenizer)

# Now you have a classifier which can give a try to classify text of policy clauses
# whose rating is unknown, yet. Example here drawn from the test set.
unknownInstance = "You are free to choose your own copyright license for your content in your account settings: Public Domain Creative Commons non commercial or free licenses but also classic copyright if you wish so."
classification = tosClassifier.classify(unknownInstance)

# the classification variable holds the possible categories sorted by their probability value
print classification
class Classifier(object):
    """Custom class to implement naive Bayes classification using
    naiveBayesClassifier.

    Attributes:
        classifier (Classifier object): Object of class `Classifier` for
            classifying transactions based on an existing journal.
    """

    class NotImplemented(Exception):
        pass

    def __init__(self, journal=None):
        """Classifier initialization.

        Parameters:
            journal: Journal to import and train on.
        """
        self._tknizer = tokenizer.Tokenizer(signs_to_remove=['?!%.'])
        self._trainer = Trainer(self._tknizer)
        if journal is not None:
            journal_data = train_journal(journal)
            for group in journal_data:
                # 0: Allocation account.
                # 1: List of transactions.
                # 2: Greatest common multiple of values in transactions.
                for transaction in group[1]:
                    # 0: Transaction payee string.
                    # 1: Allocation account.
                    self._trainer.train(transaction[0], transaction[1])
            self._classifier = BayesClassifier(
                self._trainer.data, self._tknizer
            )
        else:
            self._classifier = None

    def update(self, text, category):
        """Update training data with new examples.

        Adds new data to the trainer then generates a new classifier. Can be
        useful for updating on the fly if performing an interactive data
        import.

        Parameters:
            text (str): New text to classify.
            category (str): Classification of `text`.
        """
        self._trainer.train(text, category)
        self._classifier = BayesClassifier(
            self._trainer.data, self._tknizer
        )

    def classify(self, text, method='bayes'):
        """Give classification for a text string using bayes classification.

        Parameters:
            text (str): Text to classify.
            method (str): Type of classification to use. Defaults to `bayes`.

        Returns:
            list: Available categories and their probabilities.
        """
        if method == 'bayes':
            if self._classifier is not None:
                return self._classifier.classify(text)
            return None
        elif method == 'rules':
            raise NotImplementedError(
                'Classification based on rules file not yet implemented'
            )
        else:
            # Reference the nested exception explicitly; a bare NotImplemented
            # would resolve to the (non-raisable) builtin singleton.
            raise self.NotImplemented('The method `{}` is not valid'.format(method))
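# A hedged usage sketch for the wrapper above. Starting without a journal means
# no classifier exists yet; the payee strings and account names below are
# illustrative only, not from the original project.
clf = Classifier()
print(clf.classify("ACME SUPERMARKET 042"))  # -> None until trained

# Feed a couple of labelled examples on the fly, then classify a new payee string.
clf.update("ACME SUPERMARKET 042", "Expenses:Groceries")
clf.update("CITY TRANSIT MONTHLY PASS", "Expenses:Transport")
print(clf.classify("ACME SUPERMARKET 108")[0][0])  # likely 'Expenses:Groceries'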
    twi_cont = str_pre_process(data[no][10])
    struct = {'text': twi_cont, 'category': cls}
    # print twi_cont, cls
    train_twi.append(struct)

for twi in train_twi:
    trainer.train(twi['text'], twi['category'])
model = Classifier(trainer.data, tokenizer)

print "Testing..."
for no in range(12000, num_twi):
    twi_cont = str_pre_process(data[no][10])
    classification = model.classify(twi_cont)
    # print classification,
    test_twi.append(classification)
    if data[no][1] == 'negative':
        cls = data[no][3]
    else:
        cls = data[no][1]
    true_cls.append(cls)
    if classification[0][0] == cls:
        match += 1

# Renamed from `sum` to avoid shadowing the builtin.
total = len(true_cls)
accuracy = match * 1.0 / total
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

newsTrainer = Trainer(tokenizer)

newsSet = [{
    'text': 'not to eat too much is not enough to lose weight',
    'category': 'health'
}, {
    'text': 'Russia try to invade Ukraine',
    'category': 'politics'
}, {
    'text': 'do not neglect exercise',
    'category': 'health'
}, {
    'text': 'Syria is the main issue, Obama says',
    'category': 'politics'
}, {
    'text': 'eat to lose weight',
    'category': 'health'
}, {
    'text': 'you should not eat much',
    'category': 'health'
}]

for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

newsClassifier = Classifier(newsTrainer.data, tokenizer)
classification = newsClassifier.classify("eat more, you will become fatter")
print(classification)
# category is unknown, yet.
unknownInstance = ""

# print accuracy on each category
k = 0
for j in range(0, len(filelist1)):
    my_test_list = open(filelist1[j], 'r').read().split('\n')
    my_test_list = list(set(my_test_list))
    l = len(my_test_list)
    count = 0
    # counter = [0,0,0,0,0]
    for i in range(0, l):
        truth_list.append(label[k])
        classification = tweetClassifier.classify(my_test_list[i])
        pred_list.append(classification[0][0])
        if classification[0][0] == label[k]:
            count = count + 1
        # if(classification[0][0] == "sports"): counter[0] = counter[0] + 1
        # if(classification[0][0] == "tech"): counter[1] = counter[1] + 1
        # if(classification[0][0] == "fnl"): counter[2] = counter[2] + 1
        # if(classification[0][0] == "business"): counter[3] = counter[3] + 1
        # if(classification[0][0] == "politics"): counter[4] = counter[4] + 1
    # for m in range(0, len(counter)):
    #     if m != k:
    #         incorrect_class_fn += counter[m]
    print count
    print l
    print label[k]
You want to train a system with these pre-categorized/pre-classified texts,
so you had better call this data your training set.
"""
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

newsTrainer = Trainer(tokenizer)

# You need to train the system passing each text one by one to the trainer module.
newsSet = [
    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
    {'text': 'do not neglect exercise', 'category': 'health'},
    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
    {'text': 'eat to lose weight', 'category': 'health'},
    {'text': 'you should not eat much', 'category': 'health'}
]

for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer)

# Now you have a classifier which can give a try to classify text of news whose
# category is unknown, yet.
classification = newsClassifier.classify("Obama is")

# the classification variable holds the detected categories sorted
print(classification)
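# Minimal follow-on sketch, assuming the newsClassifier and classification from
# the example above: each entry is a (category, probability) pair, returned
# sorted by probability, so the top prediction can be read off directly.
top_category, top_score = classification[0]
print(top_category, top_score)

# Defensive variant if you prefer not to rely on the returned ordering.
best = max(classification, key=lambda pair: pair[1])
print(best[0])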
documentTrainer = Trainer(tokenizer)
documentSet = []

def getTextBasedOnDocumentID(documentID):
    ID = int(documentID.split('_')[1])
    line = linecache.getline('../2.document_set/document_set.csv', ID + 2)
    text = line.split(',"')[1]
    return text

for i in range(0, len(traincsv)):
    documentSet.append({
        'text': getTextBasedOnDocumentID(traincsv[i][0]),
        'category': traincsv[i][1]
    })

for documents in documentSet:
    documentTrainer.train(documents['text'], documents['category'])

newsClassifier = Classifier(documentTrainer.data, tokenizer)

for i in range(0, len(testcsv)):
    data = getTextBasedOnDocumentID(testcsv[i][0])
    classification = newsClassifier.classify(data)
    testcsv[i][1] = int(classification[0][0])

df = pd.DataFrame(testcsv)
df.to_csv("../5.evaluation_file/predicted_cat.csv", index=False)
# np.savetxt("./5.evaluation_file/predicted_cat.csv", testcsv, header="document_id,category", delimiter=",")
    {'symptoms': 'unresponsiveness', 'disease': 'dementia'},
    {'symptoms': 'lethargy', 'disease': 'dementia'},
    {'symptoms': 'agitation', 'disease': 'dementia'},
    {'symptoms': 'ecchymosis', 'disease': 'dementia'},
    {'symptoms': 'syncope', 'disease': 'dementia'},
    {'symptoms': 'rale', 'disease': 'dementia'},
    {'symptoms': 'unconscious state', 'disease': 'dementia'},
    {'symptoms': 'cough', 'disease': 'dementia'},
    {'symptoms': 'bedridden', 'disease': 'dementia'},
    {'symptoms': 'unsteady gait', 'disease': 'dementia'},
]

for news in newsSet:
    newsTrainer.train(news['symptoms'], news['disease'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(
    newsTrainer.data,
    tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

# Now you have a classifier which can give a try to classify text of news whose
# category is unknown, yet.
unknownInstance = "pain fever coughing"
classification = newsClassifier.classify(unknownInstance)

# the classification variable holds the possible categories sorted by
# their probability value
print(classification)
class DomainModel:
    training_data = []
    newsTrainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
    newsClassifier = None

    def __init__(self):
        self.train()

    def train(self):
        """Train on base and FB data"""
        with open('res/data/base_data.csv', 'r') as csv_file:
            reader = csv.reader(csv_file)
            i = 0
            for line in reader:
                i += 1
                line_split = line
                read_dict = {}
                if i == 1 or len(line_split) <= 2 or len(line_split[0]) == 0:
                    continue
                read_dict['class'] = line_split[2].strip()
                # Accounting for our inconsistency in Spreadsheet
                if read_dict["class"] == "Real":
                    read_dict['text'] = line_split[6].strip()
                else:
                    read_dict['text'] = line_split[5].strip()
                print(read_dict)
                self.training_data.append(read_dict)

        print('---->>>>>><<<<<<<-------')

        with open('res/data/fb_data.csv', 'r') as csv_file:
            reader = csv.reader(csv_file)
            i = 0
            for line in reader:
                i += 1
                line_split = line
                read_dict = {}
                if i == 1 or len(line_split) <= 2:
                    continue
                read_dict['class'] = line_split[2].strip()
                read_dict['text'] = line_split[5].strip()
                print(read_dict)
                self.training_data.append(read_dict)

        # print training_data
        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])
        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

sentimentTrainer = Trainer(tokenizer)

# Get the training dataset.
with open('training.csv', 'r') as f:
    data = f.read()
trainset = data.splitlines()

for line in trainset:
    pos1 = line.find(',"')
    pos2 = line.find('",', pos1)
    if pos1 == -1:
        pos1 = line.find(',')
        pos2 = line.find(',', pos1 + 1)
        comment = line[pos1 + 1:pos2]
        sentiment = line[pos2 + 1:]
    else:
        comment = line[pos1 + 2:pos2 - 2]
        sentiment = line[pos2 + 2:]
    sentimentTrainer.train(comment, sentiment)

# Use the classifier.
sentimentClassifier = Classifier(sentimentTrainer.data, tokenizer)

# Classify an unknown review.
unknownInstance = "I don't like the app. It crashes everytime."
classification = sentimentClassifier.classify(unknownInstance)
print classification
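# The manual quote handling above is fragile (commas inside unquoted comments,
# or quotes inside the text, break the index arithmetic). A hedged alternative
# sketch using the standard csv module; it assumes each row of training.csv
# looks like "<id>,<comment>,<sentiment>", which is what the parsing above
# appears to expect.
import csv

with open('training.csv', 'r') as f:
    for row in csv.reader(f):
        if len(row) < 3:
            continue  # skip malformed lines
        comment, sentiment = row[1], row[2]
        sentimentTrainer.train(comment, sentiment)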
for applicant in reader:
    # print(applicant['nm'], applicant['kind_code'])
    ApplicantTrainer.train(text=applicant['nm'], className=applicant['kind_code'])

ApplicantClassifier = Classifier(ApplicantTrainer.data, tokenizer)

# classification = ApplicantClassifier.classify("sam univ")
# print(classification)

results = []
# with open('data/nm_test.txt') as testfile:
#     for applicant in testfile:
#         classification = np.array(ApplicantClassifier.classify(applicant))
#         # print(applicant, classification[:, 1], classification[0][0])
#         results.append((applicant, classification[0][0]))
#         # print(applicant)
# # print(results[:3])

results = []
with open(arg_test, newline='', mode='rt') as testfile:
    for line in testfile:
        # print(line.strip())
        classification = np.array(ApplicantClassifier.classify(line.strip()))
        # print(classification[0][0])
        results.append((line.strip(), classification[0][0]))

with open(arg_result, newline='', mode='w', encoding='utf-8') as write_file:
    writer = csv.writer(write_file, delimiter='\t')
    writer.writerows(results)