def __call__(self, text):
    context = self.context
    request = self.request
    response = request.response
    catalog = context.portal_catalog
    bayesFilter = api.portal.get_registry_record(
        'i8d.content.browser.coverSetting.ICoverSetting.bayesFilter')
    # Every line of the registry filter becomes a 'hasKey' training example
    trainingSet = []
    for line in bayesFilter.split('\n'):
        trainingSet.append({
            'category': 'hasKey',
            'text': safe_unicode(line)
        })
    trainer = Trainer(tokenizer)
    for record in trainingSet:
        trainer.train(record['text'], record['category'])
    classifier = Classifier(trainer.data, tokenizer)
    return classifier.classify(safe_unicode(text))
def getKeywords(self, html):
    text = self.getHtml2Text(html)
    text = self.zhsJieba(text)
    # Get the category dictionary from the registry
    reg = api.portal.get_registry_record(
        'mingjing.content.browser.mjnetSetting.IMJNetSetting.catDict')
    trainSet = []
    for item in reg:
        key = item.split('|||')[0]
        for line in reg[item].split('\n'):
            zhsString = self.zhsJieba(line)
            trainSet.append({'category': key, 'text': zhsString})
    # Classify the article with naive Bayes
    newsTrainer = Trainer(tokenizer)
    for news in trainSet:
        newsTrainer.train(news['text'].encode('utf-8'), news['category'])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(text)
    # Fall back to the catch-all category when nothing matched
    if classification[0][1] == 0.0:
        classification.insert(0, (u'n99', 0.0))
    result = []
    for item in classification:
        result.append(item[0])
    return result
def classifyNonClusteredJira(self):
    # First pass: issues already labelled with a known cluster form the training set
    for index, row in self.df.iterrows():
        clusterName = row['Labels']
        keyWords = row['KeyWords']
        if clusterName in constantsObj.INITIAL_CLUSTERS:
            self.issueSet.append({
                "class": row['Labels'],
                "sentence": keyWords
            })
    for issue in self.issueSet:
        self.jiraTrainer.train(issue['sentence'], issue['class'])
    jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)
    # Second pass: classify the remaining issues into one of the initial clusters
    for index, row in self.df.iterrows():
        clusterName = row['Labels']
        keyWords = row['KeyWords']
        if clusterName not in constantsObj.INITIAL_CLUSTERS:
            identifiedCluster = jiraClassifier.classify(row['KeyWords'])[0][0]
            self.issueSet.append({
                "class": identifiedCluster,
                "sentence": keyWords
            })
            self.nonClusteredJirasAfterClusteringFile.write(
                "%s --- %s\n" % (keyWords, identifiedCluster))
    self.nonClusteredJirasAfterClusteringFile.close()
    return self.issueSet
def classifyNewJiraToOneOfTheClusters(self, inputTrainingData, inputJira):
    for item in inputTrainingData:
        self.jiraTrainer.train(item['sentence'], item['class'])
    jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)
    clusterForInputJira = jiraClassifier.classify(inputJira)
    return clusterForInputJira
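# A minimal usage sketch for classifyNewJiraToOneOfTheClusters. Hypothetical:
# `model` stands for whichever object owns jiraTrainer, and the sample issues
# are illustrative, not taken from the original data.
sampleTrainingData = [
    {'sentence': 'login page returns 500 after password reset', 'class': 'Authentication'},
    {'sentence': 'dashboard chart renders blank on refresh', 'class': 'UI'},
]
ranked = model.classifyNewJiraToOneOfTheClusters(
    sampleTrainingData, 'user cannot log in after reset')
# classify() returns (category, probability) pairs sorted by probability
print(ranked[0][0])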
def train(self):
    """Train on base and FB data."""
    with open('res/data/base_data.csv', 'r') as csv_file:
        reader = csv.reader(csv_file)
        for i, line_split in enumerate(reader, start=1):
            read_dict = {}
            # Skip the header row and malformed or empty rows
            if i == 1 or len(line_split) <= 2 or len(line_split[0]) == 0:
                continue
            read_dict['class'] = line_split[2].strip()
            # Accounting for our inconsistency in the spreadsheet:
            # "Real" rows keep their text in a different column
            if read_dict['class'] == 'Real':
                read_dict['text'] = line_split[6].strip()
            else:
                read_dict['text'] = line_split[5].strip()
            self.training_data.append(read_dict)
    with open('res/data/fb_data.csv', 'r') as csv_file:
        reader = csv.reader(csv_file)
        for i, line_split in enumerate(reader, start=1):
            read_dict = {}
            if i == 1 or len(line_split) <= 2:
                continue
            read_dict['class'] = line_split[2].strip()
            read_dict['text'] = line_split[5].strip()
            self.training_data.append(read_dict)
    for data in self.training_data:
        self.newsTrainer.train(data['text'], data['class'])
    self.newsClassifier = Classifier(
        self.newsTrainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
def classify(filename, size):
    trainingSet, testingSet = make_chronological_sets.create_sets(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    # Confusion-matrix counts, named predicted_actual
    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0
    for sample in testingSet:
        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']
        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1
    prop_caught = float(mal_mal) / float(mal_mal + clean_mal)
    prop_missed = float(clean_mal) / float(mal_mal + clean_mal)
    print("Total: " + str(mal_mal + mal_clean + clean_mal + clean_clean))
    print("Malware: " + str(mal_mal + clean_mal))
    print("Clean: " + str(mal_clean + clean_clean))
    print("Caught: " + str(mal_mal) + " (" + "{:.1%}".format(prop_caught) + ")")
    print("Missed: " + str(clean_mal) + " (" + "{:.1%}".format(prop_missed) + ")")
def classify(filename, size, url, result):
    trainingSet = make_training_set.create_set(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    print("Expected: " + result)
    print(classifier.classify(url))
def __init__(self, journal=None):
    """Classifier initialization.

    Parameters:
        journal (str): Journal file to import.
    """
    self._tknizer = tokenizer.Tokenizer(signs_to_remove=['?!%.'])
    self._trainer = Trainer(self._tknizer)
    if journal is not None:
        journal_data = train_journal(journal)
        for group in journal_data:
            # group[0]: allocation account
            # group[1]: list of transactions
            # group[2]: greatest common multiple of values in transactions
            for transaction in group[1]:
                # transaction[0]: payee string
                # transaction[1]: allocation account
                self._trainer.train(transaction[0], transaction[1])
        self._classifier = BayesClassifier(
            self._trainer.data, self._tknizer
        )
    else:
        self._classifier = None
class DomainModel:
    training_data = []
    newsTrainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
    newsClassifier = None

    def __init__(self):
        self.train()

    # TODO: Train on FB data too
    def train(self):
        with open('src/URL.csv', 'r') as csv_file:
            for line in csv_file.readlines():
                read_dict = {}
                line_split = line.split(',')
                # Skip malformed or empty rows
                if len(line_split) < 2 or len(line_split[0]) == 0:
                    continue
                read_dict['text'] = line_split[0].strip()
                read_dict['class'] = line_split[1].strip()
                self.training_data.append(read_dict)
        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])
        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        return self.newsClassifier.classify(unknownInstance)
def train_spam_texts():
    # Read the dataset file
    dataset_lang = "ru"
    with codecs.open(os.path.abspath(os.curdir) + "/data/assets/spam_texts.json",
                     "r", "utf_8_sig") as dataset_file:
        dataset_data = json.load(dataset_file)
    # Prepare the adverts spam dataset
    prepared_dataset = []
    for item in dataset_data[dataset_lang]["adverts"]:
        prepared_dataset.append({
            "text": item["text"],
            "category": "adverts"
        })
    # Training (will be replaced by another library soon)
    advertsTrainer = Trainer(tokenizer)
    for one_dataset_item in prepared_dataset:
        advertsTrainer.train(one_dataset_item["text"],
                             one_dataset_item["category"])
    adverts_classifier = Classifier(advertsTrainer.data, tokenizer)
    # Usage:
    # classification = adverts_classifier.classify("рассылка")
    # category_chance = classification[0][1]
    # print(category_chance)
def classificationNB(index):
    '''
    Train the naive Bayes classifier and classify data.
    naiveBayesClassifier is used: https://github.com/muatik/naive-bayes-classifier
    '''
    # Initial training set from file
    trainset = []
    with open('E:\\databases\\trainset.txt', 'r') as f:
        for line in f:
            if len(line.strip()) == 0:
                continue
            line = line.strip().split()
            assert len(line) == 22
            trainset.append({
                'text': '%08d' % int(line[(index + 1) * 2]),
                'category': line[(index + 1) * 2 + 1]
            })

    # Train the classifier
    trainer = Trainer(tokenizer)
    for case in trainset:
        trainer.train(case['text'], case['category'])
    classifier = Classifier(trainer.data, tokenizer)

    # Classify each of the remaining sets
    for i in range(10):
        if index == i:
            continue
        print('%-2d ~ %-2d' % (index, i))
        # Read cases from the file and classify each one
        results = []
        count = 0
        with open('E:\\databases\\classification%02d.txt' % (i + 1), 'r') as f:
            for line in f:
                count += 1
                line = line.strip()
                if len(line) == 0:
                    continue
                if count == 1:  # the first line is the title
                    header = 'CAT%02d' % (index + 1)
                    assert header not in line
                    results.append('%s\t%s' % (line, header))
                    continue
                case = line.split()
                assert len(case) >= 4
                clf = classifier.classify(case[2])
                results.append('%s\t%s' % (line, clf))

        # Save the results back to the file
        with open('E:\\databases\\classification%02d.txt' % (i + 1), 'w') as f:
            for result_line in results:
                f.write('%s\n' % result_line)
def classify(filename, size):
    trainingSet, testingSet = make_balanced_sets.create_sets(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0
    for sample in testingSet:
        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']
        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1
    # Normalize counts to proportions of the test set
    size = float(size)
    mal_mal = float(mal_mal) / size
    mal_clean = float(mal_clean) / size
    clean_mal = float(clean_mal) / size
    clean_clean = float(clean_clean) / size
    confusionMatrix = [[mal_mal, clean_mal], [mal_clean, clean_clean]]
    pprint(confusionMatrix)
    print("Accuracy: " + str(mal_mal + clean_clean))
    print("False positives (predicted clean when malicious): " + str(clean_mal))
    print("False negatives (predicted malicious when clean): " + str(mal_clean))
def get_classer():
    newsTrainer = Trainer(tokenizer)
    for news in newsSet:
        newsTrainer.train(news['text'], news['category'])
    # With sufficient trained data, build a classifier from the trainer's data
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    return newsClassifier
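# A usage sketch for get_classer, assuming the module-level newsSet holds
# dicts with 'text' and 'category' keys (the two entries are illustrative):
newsSet = [
    {'text': 'eat to lose weight', 'category': 'health'},
    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
]
classer = get_classer()
print(classer.classify('Obama says'))  # (category, probability) pairs, best first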
def neyronka(self, _str):
    newsTrainer = Trainer(tokenizer)
    # Build the training set directly instead of eval()-ing a constructed string
    newsSet = []
    with open('o', 'rt', encoding='utf8') as csvfile:
        for i in csvfile.readlines():
            if i == '\n':
                continue
            # Each line is "<theme>***<text>"
            theme, text = i.split('***')
            newsSet.append({'text': text.strip(), 'category': theme})
    for news in newsSet:
        newsTrainer.train(news['text'], news['category'])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(_str)
    return sorted(classification, key=lambda x: -x[1])
def train_classifier(newsData_train):
    data_process = Data_process()
    for data in newsData_train:
        data_process.final_process(data['text'], data['category'])
    newsClassifier = Classifier(data_process, data_process.tokenizer)
    return newsClassifier
def tweet_classification(unknownInstance):
    newsTrainer = Trainer(tokenizer)
    with open("train.txt") as f:
        for line in f:
            # Each line is "<category> <text>"
            parts = line.split(' ', 1)
            newsTrainer.train(parts[1], parts[0])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    # Classify a tweet whose category is not yet known; `classification`
    # holds the possible categories sorted by probability
    classification = newsClassifier.classify(unknownInstance)
    # Report the top three categories, scored relative to the best one
    ans = dict()
    for i in range(3):
        if classification[0][1] != 0.0:
            ans[classification[i][0]] = classification[i][1] / classification[0][1]
    return ans
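# A usage sketch for tweet_classification, assuming train.txt holds one
# "<category> <text>" pair per line (the sample tweet is illustrative):
scores = tweet_classification('heavy rain flooding downtown streets')
# `scores` maps up to three categories to their probability relative to the top one
for category, relative_score in scores.items():
    print(category, relative_score)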
def init(cls, lang='tr', namesCollection=NamesCollection, classifier=None):
    cls.lang = lang
    cls.namesCollection = namesCollection
    if classifier:
        cls.classifier = classifier
    else:
        cls.classifier = Classifier(CachedModel.get(lang), tokenizer)
    cls.initialized = True
class NaiveBayesClassifier:
    def __init__(self):
        jieba.set_dictionary('dict.big.txt')
        self.articleTrainer = Trainer(tokenizer)

    def train(self):
        # Train on politics articles
        articles = article.create_articles_from_file("data/HatePoliticsdata.json")
        p_train = articles[0:3001]
        p_test = articles[3001:3031]
        for a in p_train:
            # Segment the body with jieba and keep only the extracted keywords
            seg_list = jieba.analyse.extract_tags(a.body)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'politics')
        # Train on gossiping articles
        articles = article.create_articles_from_file("data/Gossipingdata.json")
        g_train = articles[0:3000]
        g_test = articles[3001:3301]
        for a in g_train:
            seg_list = jieba.analyse.extract_tags(a.body)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'gossiping')
        # Persist the trained counts so classification can reload them later
        with open('data/docCountOfClasses.json', 'w', -1, 'utf-8') as f:
            f.write(json.dumps(self.articleTrainer.data.docCountOfClasses))
        with open('data/frequencies.json', 'w', -1, 'utf-8') as f:
            f.write(json.dumps(self.articleTrainer.data.frequencies))

    def classify(self, article):
        # Reload the persisted training data
        self.data = TrainedData()
        with open('data/docCountOfClasses.json', 'r', -1, 'utf-8') as f:
            self.data.docCountOfClasses = json.load(f)
        with open('data/frequencies.json', 'r', -1, 'utf-8') as f:
            self.data.frequencies = json.load(f)
        # Testing
        self.articleClassifier = Classifier(self.data, tokenizer)
        seg_list = jieba.analyse.extract_tags(article.body)
        doc = " ".join(seg_list)
        classification = self.articleClassifier.classify(doc)
        return classification[0][0]
def determine(sentence):
    newsTrainer = Trainer(tokenizer)
    newsSet = []
    with open('data.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            newsSet.append({'fact': row['Fact'], 'decision': row['Decision']})
    for news in newsSet:
        newsTrainer.train(news['fact'], news['decision'])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(sentence)
    # Keep only the integer part of each score
    false = str(classification[0][1]).split('.')[0]  # "False" score
    true = str(classification[1][1]).split('.')[0]   # "True" score
    data = [true, false]
    return data
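# A usage sketch for determine, assuming data.csv has "Fact" and "Decision"
# columns (the sentence is illustrative):
true_score, false_score = determine('the meeting was moved to Friday')
print('true:', true_score, 'false:', false_score)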
def post_logfile():
    if request.method == 'GET':
        log_file = request.args['symptom']
        print(log_file)
        diseaseclassifier = Trainer(tokenizer)  # start the classifier
        with open("Dataset.csv", "r") as dataset:
            # Each CSV line is "<disease>,<symptom>"; iterating the file
            # directly avoids the old next()-in-loop bug that skipped
            # every other line
            for line in dataset:
                fields = line.split(",")
                diseaseclassifier.train(fields[1], fields[0])
        diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
        classification = diseaseclassifier.classify(log_file)  # classify the input
        print(classification)
        return json.dumps(dict(classification))
def update(self, text, category):
    """Update training data with new examples.

    Adds new data to the trainer, then generates a new classifier.
    Useful for updating on the fly during an interactive data import.

    Parameters:
        text (str): New text to classify.
        category (str): Classification of `text`.
    """
    self._trainer.train(text, category)
    self._classifier = BayesClassifier(
        self._trainer.data, self._tknizer
    )
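# A minimal sketch of interactive retraining via update. Hypothetical: `clf`
# is an instance of the class that owns this method; the payee strings and
# account names are illustrative.
clf.update('GROCERY STORE PURCHASE', 'Expenses:Food')
clf.update('MONTHLY RENT PAYMENT', 'Expenses:Rent')
# Each call rebuilds the classifier, so new examples take effect immediately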
def create_naive_bayes_classifier(training_examples, training_annotations):
    print("creating naive bayes classifier")
    # Map raw annotations onto category labels
    annotations = [categories[x] for x in training_annotations]
    news_trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[tokenization_string]))
    for example, annotation in zip(training_examples, annotations):
        news_trainer.train(example, annotation)
    classifier = Classifier(
        news_trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[tokenization_string]))
    print("\t->done")
    return classifier
def post_logfile():
    if request.method == 'POST':
        log_file = request.args['symptom']
        print(log_file)
        diseaseclassifier = Trainer(tokenizer)  # start the classifier
        with open("Dataset.csv", "r") as dataset:
            for line in dataset:  # each CSV line is "<disease>,<symptom>"
                fields = line.split(",")
                diseaseclassifier.train(fields[1], fields[0])
        diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
        classification = diseaseclassifier.classify(log_file)  # classify the input
        print(classification)
        result = []
        for item in classification:
            obj = CustomType(item[0], item[1])
            result.append(json.loads(obj.toJSON()))
        return json.dumps(result, indent=4)
def create_nbc_nb_classifier(training_dataset):
    training_examples, training_annotations = training_dataset
    # Deduplicate tokens within each example before training
    parsed_training_examples = [
        set(tokenize(example)) for example in training_examples
    ]
    tr = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[tokenization_string]))
    for example, annotation in zip(parsed_training_examples, training_annotations):
        tr.train(example, annotation)
    print("number of tokens seen: %s" % len(tr.data.frequencies.keys()))
    return tr, Classifier(
        tr.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[tokenization_string]))
class DomainModel:
    data_interface = []
    newsTrainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
    newsClassifier = None

    def __init__(self, data_interface):
        """Store the data interface on creation.

        Don't train yet; let the parent decide when.
        """
        if not isinstance(data_interface, Data):
            raise ValueError(
                "Data is not properly interfaced through class Data")
        self.data_interface = data_interface

    def train(self):
        """Train on base and FB data."""
        # Feed each training example from the data interface into the model
        for data_point in self.data_interface.arr:
            data_class = data_point[2].strip()  # Class is "Credibility"
            data_text = data_point[4].strip()   # Text is "Content URL"
            self.newsTrainer.train(data_text, data_class)
        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        return self.newsClassifier.classify(unknownInstance)
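# A usage sketch for DomainModel. Hypothetical: `my_data` is a Data instance
# wrapping rows whose index 2 holds the class and index 4 the text.
model = DomainModel(my_data)
model.train()  # the parent decides when training happens
print(model.classify('http://example.com/suspicious-story'))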
documentTrainer = Trainer(tokenizer)
documentSet = []


def getTextBasedOnDocumentID(documentID):
    # The numeric part of the ID indexes the CSV row; the +2 offset accounts
    # for linecache being 1-indexed plus the header row
    ID = int(documentID.split('_')[1])
    line = linecache.getline('../2.document_set/document_set.csv', ID + 2)
    text = line.split(',"')[1]
    return text


for i in range(0, len(traincsv)):
    documentSet.append({
        'text': getTextBasedOnDocumentID(traincsv[i][0]),
        'category': traincsv[i][1]
    })

for documents in documentSet:
    documentTrainer.train(documents['text'], documents['category'])

newsClassifier = Classifier(documentTrainer.data, tokenizer)

# Classify every test document and write the predicted categories out
for i in range(0, len(testcsv)):
    data = getTextBasedOnDocumentID(testcsv[i][0])
    classification = newsClassifier.classify(data)
    testcsv[i][1] = int(classification[0][0])

df = pd.DataFrame(testcsv)
df.to_csv("../5.evaluation_file/predicted_cat.csv", index=False)
import sys

from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

'''
Usage:          python GuessDisease.py "symptomA symptomB symptomC"
Example INPUT:  python GuessDisease.py "agitation exhaustion vomit"
Example OUTPUT: { "disease": "influenza" }
'''

# Set up and train the classifier
diseaseclassifier = Trainer(tokenizer)
with open("Dataset.csv", "r") as dataset:
    for line in dataset:  # each CSV line is "<disease>,<symptom>"
        fields = line.split(",")
        diseaseclassifier.train(fields[1], fields[0])
diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)

# Classify the symptoms given on the command line and print the best match
classification = diseaseclassifier.classify(sys.argv[1])
print(classification[0])
tosTrainer = Trainer(tokenizer)


def get_corp(read_file):
    # Each line of the training file is "<clause text>\t<rating>"
    with open(read_file, "r", encoding="utf-8") as r:
        corpus = []
        for line in r:
            tabsep = line.strip().split('\t')
            corpus.append({'text': tabsep[0], 'rating': tabsep[1]})
        return corpus


# Get the corpus from a training set, using copyright clauses as an example
# (a subset of the CSV generated by the getpointsdata.py script)
tosSet = get_corp("tosdr.org/copyrighttrainset.txt")

# Train the system by passing each text to the trainer module one by one
for corpi in tosSet:
    tosTrainer.train(corpi['text'], corpi['rating'])

# With sufficient trained data, build the classifier
tosClassifier = Classifier(tosTrainer.data, tokenizer)

# Try to classify a policy clause whose rating is not yet known
# (this example is drawn from the test set)
unknownInstance = "You are free to choose your own copyright license for your content in your account settings: Public Domain Creative Commons non commercial or free licenses but also classic copyright if you wish so."
classification = tosClassifier.classify(unknownInstance)

# `classification` holds the possible categories sorted by probability
print(classification)
from naiveBayesClassifier.classifier import Classifier

sentimentTrainer = Trainer(tokenizer)

# Get the training dataset.
with open('training.csv', 'r') as f:
    data = f.read()
trainset = data.splitlines()

for line in trainset:
    # Comments may be quoted ("...",) or bare; locate the comment/sentiment split
    pos1 = line.find(',"')
    pos2 = line.find('",', pos1)
    if pos1 == -1:
        pos1 = line.find(',')
        pos2 = line.find(',', pos1 + 1)
        comment = line[pos1 + 1:pos2]
        sentiment = line[pos2 + 1:]
    else:
        comment = line[pos1 + 2:pos2 - 2]
        sentiment = line[pos2 + 2:]
    sentimentTrainer.train(comment, sentiment)

# Use the classifier.
sentimentClassifier = Classifier(sentimentTrainer.data, tokenizer)

# Classify an unknown review.
unknownInstance = "I don't like the app. It crashes everytime."
classification = sentimentClassifier.classify(unknownInstance)
print(classification)
You want to train a system with these pre-categorized/pre-classified
texts, so you had better call this data your training set.
"""
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

newsTrainer = Trainer(tokenizer)

# Train the system by passing each text to the trainer module one by one.
newsSet = [
    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
    {'text': 'do not neglect exercise', 'category': 'health'},
    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
    {'text': 'eat to lose weight', 'category': 'health'},
    {'text': 'you should not eat much', 'category': 'health'}
]

for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# With sufficient trained data, build a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer)

# The classifier can now try to classify news text whose category is unknown.
classification = newsClassifier.classify("Obama is")

# `classification` holds the detected categories sorted by probability.
print(classification)
doc = " ".join(seg_list) articleTrainer.train(doc, 'politics') articles = article.create_articles_from_file("data/Gossipingdata.json") g_train = articles[0:3000] g_test = articles[3001:3301] for a in g_train: doc = a.body #seg_list = jieba.lcut(doc, cut_all=False) seg_list = jieba.analyse.extract_tags(doc) doc = " ".join(seg_list) articleTrainer.train(doc, 'gossiping') #Testing articleClassifier = Classifier(articleTrainer.data, tokenizer) p_gossiping = 0 p_politics = 0 g_gossiping = 0 g_politics = 0 for a in p_test: doc = a.body #seg_list = jieba.lcut(doc, cut_all=False) seg_list = jieba.analyse.extract_tags(doc) doc = " ".join(seg_list) classification = articleClassifier.classify(doc) if classification[0][0] == 'gossiping': p_gossiping += 1 else: p_politics += 1
def classify(input):
    twitter = Twitter()
    with open("data.txt", "r") as f:
        data = json.loads(f.read())

    gradeTrainer = Trainer(tokenizer)
    loadTrainer = Trainer(tokenizer)
    lectureTrainer = Trainer(tokenizer)

    # Train one classifier per target field, sentence by sentence,
    # skipping subjects whose field is unrated ("?")
    print("Training grade ...")
    for subject in data:
        if subject["grade"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    gradeTrainer.train(li, subject["grade"])

    print("Training load ...")
    for subject in data:
        if subject["load"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    loadTrainer.train(li, subject["load"])

    print("Training lecture ...")
    for subject in data:
        if subject["lecture"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    lectureTrainer.train(li, subject["lecture"])

    gradeClassifier = Classifier(gradeTrainer.data, tokenizer)
    loadClassifier = Classifier(loadTrainer.data, tokenizer)
    lectureClassifier = Classifier(lectureTrainer.data, tokenizer)

    # Keep only the content-bearing parts of speech from the input
    input = u"" + input
    keep_tags = ('Noun', 'Verb', 'Adjective', 'Adverb', 'Exclamation',
                 'Alpha', 'KoreanParticle')
    classify_input = [word for word, tag in twitter.pos(input) if tag in keep_tags]
    text = " ".join(classify_input)
    print(text)

    gradeClassification = gradeClassifier.classify(text)
    loadClassification = loadClassifier.classify(text)
    lectureClassification = lectureClassifier.classify(text)

    print("\n________ GRADE ________\n")
    print(gradeClassification)
    print("\n________ LOAD _________\n")
    print(loadClassification)
    print("\n________ LECTURE ______\n")
    print(lectureClassification)
    return gradeClassification, loadClassification, lectureClassification
def get(self):
    try:
        print("TestClassifier start")
        # Load the stop words into a list from the file
        with open("stop_words.txt", "r") as ins:
            array = []
            for line in ins:
                array.append((line.rstrip('\n')).decode('unicode-escape'))
        newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=array, signs_to_remove=["?!#%&_"]))
        hoy = date.today()
        # Train on today's political news from uy_press
        query = News3.query(News3.date == hoy,
                            News3.news_from.IN(["uy_press"]),
                            News3.category == "Política")
        for i in query:
            print(i.category)
            newsTrainer.train(i.html, 'politica')
        # With sufficient trained data, build the classifier and try it on a
        # headline whose category is unknown
        newsClassifier = Classifier(
            newsTrainer.data,
            tokenizer.Tokenizer(stop_words=array, signs_to_remove=["?!#%&"]))
        classification = newsClassifier.classify(
            "Vidalín: No quiero que me llamen para saber qué tramite hay que "
            "hacer para poner un prostíbulo")
        # `classification` holds the detected categories sorted by probability
        print(classification)
    except:
        print(traceback.format_exc())
def article_keywords(article):
    keys = Keywords.objects.get(article=article)
    print(keys)
    l = [k.keyword for k in keys.keywords.all()]
    print(" ".join(l))
    keyset = {'keyword': " ".join(l)}
    return keyset


if __name__ == '__main__':
    print("Starting testing of Bayes Classifier")
    labeled_articles = [(a, a.relevant) for a in Article.objects.all()]
    print(labeled_articles)
    featuresets = []
    for (article, relevant) in labeled_articles:
        r = article_keywords(article)
        featuresets.append((r, relevant))
    print(featuresets)
    # Train on every feature set; keep the last two around as a small test set
    train_set, test_set = featuresets, featuresets[len(featuresets) - 2:]
    print(train_set)
    newsTrainer = Trainer(tokenizer)
    for f in train_set:
        newsTrainer.train(f[0]['keyword'], f[1])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    url = raw_input("Enter the url: ")
    testurl(url, newsClassifier)