def __init__(self, journal=None):
    """Set up the tokenizer and trainer, optionally training from a journal.

    Parameters:
        journal: Journal to import; passed unchanged to ``train_journal``.
            When ``None`` nothing is trained and ``self._classifier`` is
            left as ``None``.
    """
    self._tknizer = tokenizer.Tokenizer(signs_to_remove=['?!%.'])
    self._trainer = Trainer(self._tknizer)

    if journal is None:
        # Nothing to learn from, so there is no classifier yet.
        self._classifier = None
        return

    # Each group is indexed as: 0 = allocation account, 1 = list of
    # transactions, 2 = greatest common multiple of the transaction values.
    for group in train_journal(journal):
        # Each transaction is indexed as: 0 = payee string,
        # 1 = allocation account.
        for transaction in group[1]:
            payee, account = transaction[0], transaction[1]
            self._trainer.train(payee, account)

    self._classifier = BayesClassifier(self._trainer.data, self._tknizer)
def getKeywords(self, html):
    """Classify the text extracted from ``html`` and return category keys.

    The training set is rebuilt on every call from the ``catDict`` registry
    record: the part of each registry key before ``'|||'`` is the category,
    and every line of the associated value is one training text.

    Returns:
        The category keys in the order produced by the classifier. When the
        first entry has a probability of exactly 0.0, ``u'n99'`` is put in
        front of the list.
    """
    text = self.zhsJieba(self.getHtml2Text(html))

    # Fetch the category dictionary from the registry.
    reg = api.portal.get_registry_record(
        'mingjing.content.browser.mjnetSetting.IMJNetSetting.catDict')

    trainSet = [
        {'category': item.split('|||')[0], 'text': self.zhsJieba(line)}
        for item in reg
        for line in reg[item].split('\n')
    ]

    # Classify the article with a simple naive Bayes model.
    newsTrainer = Trainer(tokenizer)
    for news in trainSet:
        newsTrainer.train(news['text'].encode('utf-8'), news['category'])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)

    classification = newsClassifier.classify(text)
    print(classification)

    # A zero top score is flagged by prepending the 'n99' key.
    if classification[0][1] == 0.0:
        classification.insert(0, (u'n99', 0.0))

    return [entry[0] for entry in classification]
def __call__(self, text):
    """Classify ``text`` against the lines of the ``bayesFilter`` registry record.

    Every line of the registry value is trained under the single category
    ``'hasKey'``; ``text`` is then classified with the resulting model.

    Parameters:
        text: The text to classify; converted with ``safe_unicode`` first.

    Returns:
        Whatever ``Classifier.classify`` returns for ``text``. (The original
        code dropped into a ``pdb`` breakpoint here and returned nothing.)
    """
    bayesFilter = api.portal.get_registry_record(
        'i8d.content.browser.coverSetting.ICoverSetting.bayesFilter')

    trainer = Trainer(tokenizer)
    for line in bayesFilter.split('\n'):
        trainer.train(safe_unicode(line), 'hasKey')

    classifier = Classifier(trainer.data, tokenizer)
    return classifier.classify(safe_unicode(text))
def train_spam_texts():
    """Train a classifier on the Russian "adverts" spam texts and return it.

    Reads ``data/assets/spam_texts.json`` relative to the current working
    directory (decoded as ``utf_8_sig``) and trains every entry of
    ``["ru"]["adverts"]`` under the category ``"adverts"``.

    Returns:
        The trained ``Classifier``. (The original built it and then
        discarded it by returning ``None``.)
    """
    # Reading dataset file; the context manager closes the handle even if
    # JSON parsing fails.
    dataset_lang = "ru"
    dataset_path = os.path.abspath(os.curdir) + "/data/assets/spam_texts.json"
    with codecs.open(dataset_path, "r", "utf_8_sig") as dataset_file:
        dataset_data = json.load(dataset_file)

    # Training
    # (Will be replaced by another library soon)
    advertsTrainer = Trainer(tokenizer)
    for item in dataset_data[dataset_lang]["adverts"]:
        advertsTrainer.train(item["text"], "adverts")

    # Usage:
    #   classification = train_spam_texts().classify("рассылка")
    #   category_chance = classification[0][1]
    return Classifier(advertsTrainer.data, tokenizer)
def classify(filename, size):
    """Train on a chronological split of URLs and print detection statistics.

    Parameters:
        filename: Passed to ``make_chronological_sets.create_sets``.
        size: Passed to ``make_chronological_sets.create_sets``.

    The training samples' ``'url'`` values are trained under their
    ``'result'`` label. Each testing sample is then classified and counted
    in one of four buckets (predicted/actual being ``'malicious'`` or
    ``'clean'``); samples with any other label combination are not counted.
    Totals and the caught/missed proportions of malicious samples are
    printed.
    """
    trainingSet, testingSet = make_chronological_sets.create_sets(
        filename, size)

    # One tokenizer configuration serves both training and classification.
    url_tokenizer = tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""])
    trainer = Trainer(url_tokenizer)
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(trainer.data, url_tokenizer)

    # Naming is <predicted>_<actual>.
    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0
    for sample in testingSet:
        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']
        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    # Guard against a test set without any malicious sample, which used to
    # raise ZeroDivisionError.
    malicious_total = mal_mal + clean_mal
    if malicious_total:
        prop_caught = float(mal_mal) / float(malicious_total)
        prop_missed = float(clean_mal) / float(malicious_total)
    else:
        prop_caught = 0.0
        prop_missed = 0.0

    print("Total: " + str(mal_mal + mal_clean + clean_mal + clean_clean))
    print("Malware: " + str(malicious_total))
    print("Clean: " + str(mal_clean + clean_clean))
    print("Caught: " + str(mal_mal) + " (" + "{:.1%}".format(prop_caught) +
          ")")
    print("Missed: " + str(clean_mal) + " (" + "{:.1%}".format(prop_missed) +
          ")")
def get_classer():
    """Train on the module-level ``newsSet`` and return the classifier.

    Every entry's ``'text'`` is trained under its ``'category'``.
    """
    trainer = Trainer(tokenizer)
    for sample in newsSet:
        trainer.train(sample['text'], sample['category'])

    # With the training data collected, the classifier can be built.
    return Classifier(trainer.data, tokenizer)
class NaiveBayesClassifier:
    """Two-class ('politics' / 'gossiping') article classifier.

    ``train`` writes the trained counts to JSON files under ``data/``;
    ``classify`` reloads those files on every call, so it does not depend on
    ``train`` having run in the same process.
    """

    def __init__(self):
        jieba.set_dictionary('dict.big.txt')
        self.articleTrainer = Trainer(tokenizer)

    def _train_on(self, articles, label):
        """Train every article's extracted tags under ``label``."""
        for a in articles:
            seg_list = jieba.analyse.extract_tags(a.body)
            self.articleTrainer.train(" ".join(seg_list), label)

    def train(self):
        """Train on both article dumps and persist the result as JSON."""
        articles = article.create_articles_from_file("data/HatePoliticsdata.json")
        self._train_on(articles[0:3001], 'politics')

        articles = article.create_articles_from_file("data/Gossipingdata.json")
        # NOTE(review): this slice is one article shorter than the politics
        # slice above; kept as in the original -- confirm it is intended.
        self._train_on(articles[0:3000], 'gossiping')

        # Context managers close the files even when serialization fails.
        with open('data/docCountOfClasses.json', 'w', -1, 'utf-8') as f:
            f.write(json.dumps(self.articleTrainer.data.docCountOfClasses))
        with open('data/frequencies.json', 'w', -1, 'utf-8') as f:
            f.write(json.dumps(self.articleTrainer.data.frequencies))

    def classify(self, article):
        """Return the first category the classifier reports for ``article``.

        Parameters:
            article: Object whose ``body`` attribute holds the text.
        """
        self.data = TrainedData()
        with open('data/docCountOfClasses.json', 'r', -1, 'utf-8') as f:
            self.data.docCountOfClasses = json.load(f)
        with open('data/frequencies.json', 'r', -1, 'utf-8') as f:
            self.data.frequencies = json.load(f)

        self.articleClassifier = Classifier(self.data, tokenizer)
        seg_list = jieba.analyse.extract_tags(article.body)
        classification = self.articleClassifier.classify(" ".join(seg_list))
        return classification[0][0]
def classify(filename, size, url, result):
    """Train on a generated URL set, then print the classification of ``url``.

    Parameters:
        filename: Passed to ``make_training_set.create_set``.
        size: Passed to ``make_training_set.create_set``.
        url: The URL to classify.
        result: The expected label; only printed, and concatenated to a
            string, so it must itself be a string.
    """
    samples = make_training_set.create_set(filename, size)

    trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in samples:
        trainer.train(sample['url'], sample['result'])

    url_classifier = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    print("Expected: " + result)
    print(url_classifier.classify(url))
def generate(mongourl, database, collection, lang):
    """Train on every tweet of a MongoDB collection and pickle the result.

    Parameters:
        mongourl: Connection string given to ``MongoClient``.
        database: Name of the database to read.
        collection: Name of the collection to read; each document's
            ``'tweet'`` is trained under its ``'gender'``.
        lang: Used only to build the output file name ``model_<lang>.txt``.

    The trained data is written with ``cPickle`` (binary, highest protocol)
    into the current directory.
    """
    c = MongoClient(mongourl)
    try:
        trainer = Trainer(tokenizer)
        for tweet in c[database][collection].find():
            trainer.train(tweet['tweet'], tweet['gender'])
    finally:
        # Release the connection even when training fails part-way.
        c.close()

    modelFileName = 'model_{}.txt'.format(lang)
    with open(modelFileName, 'wb') as modelFile:
        cPickle.dump(trainer.data, modelFile, cPickle.HIGHEST_PROTOCOL)
    print('OK : generated trained data has been writen in the file "{}"'.
          format(modelFileName))
def create_naive_bayes_classifier(training_examples, training_annotations):
    """Build a classifier from parallel example and annotation sequences.

    Parameters:
        training_examples: Texts to train on.
        training_annotations: Keys into the module-level ``categories``
            mapping, one per example. Pairing uses ``zip``, so surplus items
            of the longer sequence are ignored.

    Returns:
        The trained ``Classifier``.
    """
    print("creating naive bayes classifier")

    labels = [categories[x] for x in training_annotations]

    trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for text, label in zip(training_examples, labels):
        trainer.train(text, label)

    built = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    print("\t->done")
    return built
class DomainModel:
    """Classifier trained on the ``text,class`` rows of ``src/URL.csv``.

    Training state lives on the instance. It used to be class-level, so
    every new instance re-appended the same rows to one shared list and
    retrained one shared trainer.
    """

    # Misspelled attribute kept so existing readers of it keep working.
    newClassifier = None
    # Set by train().
    newsClassifier = None

    def __init__(self):
        self.training_data = []
        self.newsTrainer = None
        self.train()

    # TODO: Train on FB data too
    def train(self):
        """(Re)load ``src/URL.csv`` and rebuild the trainer and classifier.

        Lines with fewer than two comma-separated fields, or with an empty
        first field, are skipped. State is reset first so that calling this
        again does not count rows twice.
        """
        self.training_data = []
        self.newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

        with open('src/URL.csv', 'r') as csv_file:
            for line in csv_file:
                line_split = line.split(',')
                if len(line_split) < 2 or len(line_split[0]) == 0:
                    continue
                self.training_data.append({
                    'text': line_split[0].strip(),
                    'class': line_split[1].strip(),
                })

        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])

        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        """Return the classifier's result for ``unknownInstance``."""
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
def classificationNB(index):
    '''
    Train the Naive Bayes classifier and classify data

    naiveBayesClassifier is used.
    https://github.com/muatik/naive-bayes-classifier

    Parameters:
        index: 0-based number of the set to train on. Fields
            ``(index + 1) * 2`` and ``(index + 1) * 2 + 1`` of every
            trainset line are the text (zero-padded to 8 digits) and its
            category.

    For every other set ``i`` in 0..9 the file ``classification<i+1>.txt``
    is read, one result column is appended to each non-blank line, and the
    file is rewritten in place (blank lines are dropped).

    Raises:
        AssertionError: when a trainset line does not have exactly 22
            fields, a data line has fewer than 4 fields, or the title line
            already contains this set's ``CATnn`` header.
    '''
    # Initial training set from file; "with" closes the handle even when an
    # assertion fails.
    trainset = []
    with open('E:\\databases\\trainset.txt', 'r') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                continue
            assert len(fields) == 22
            column = (index + 1) * 2
            trainset.append({
                'text': '%08d' % int(fields[column]),
                'category': fields[column + 1]
            })

    # Train the classifier
    trainer = Trainer(tokenizer)
    for case in trainset:
        trainer.train(case['text'], case['category'])
    classifier = Classifier(trainer.data, tokenizer)

    header = 'CAT%02d' % (index + 1)

    # Classification for each of the rest sets
    for i in range(10):
        if index == i:
            continue
        print('%-2d ~ %-2d' % (index, i))
        path = 'E:\\databases\\classification%02d.txt' % (i + 1)

        # Read cases from the file and classify each case
        results = []
        with open(path, 'r') as f:
            # count is the physical line number, blank lines included.
            for count, line in enumerate(f, 1):
                line = line.strip()
                if len(line) == 0:
                    continue
                if count == 1:  # the first line -- title
                    assert header not in line
                    results.append('%s\t%s' % (line, header))
                    continue
                case = line.split()
                assert len(case) >= 4
                clf = classifier.classify(case[2])
                results.append('%s\t%s' % (line, clf))

        # Save the results back to the file
        with open(path, 'w') as f:
            for row in results:
                f.write('%s\n' % row)
def classify(filename, size):
    """Train on a balanced URL split and print a normalised confusion matrix.

    Parameters:
        filename: Passed to ``make_balanced_sets.create_sets``.
        size: Passed to ``make_balanced_sets.create_sets`` and also used as
            the divisor that turns the four counts into fractions.

    Only the four 'malicious'/'clean' predicted/actual combinations are
    counted; any other label pair is ignored.
    """
    trainingSet, testingSet = make_balanced_sets.create_sets(filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    # Keys are (predicted, actual).
    tally = {
        ('malicious', 'malicious'): 0,
        ('malicious', 'clean'): 0,
        ('clean', 'clean'): 0,
        ('clean', 'malicious'): 0,
    }
    for sample in testingSet:
        outcome = (classifier.classify(sample['url'])[0][0], sample['result'])
        if outcome in tally:
            tally[outcome] += 1

    size = float(size)
    mal_mal = float(tally[('malicious', 'malicious')]) / size
    mal_clean = float(tally[('malicious', 'clean')]) / size
    clean_mal = float(tally[('clean', 'malicious')]) / size
    clean_clean = float(tally[('clean', 'clean')]) / size

    confusionMatrix = [[mal_mal, clean_mal], [mal_clean, clean_clean]]
    pprint(confusionMatrix)

    print("Accuracy: " + str(mal_mal + clean_clean))
    print("False positives (predicted clean when malicious): " + str(clean_mal))
    print("False negatives (predicted malicious when clean): " + str(mal_clean))
def neyronka(self, _str):
    """Train on the ``theme***text`` lines of file ``'o'`` and classify ``_str``.

    Parameters:
        _str: The text to classify.

    Returns:
        The classifier's ``(category, score)`` entries sorted by descending
        score.

    Raises:
        ValueError: when a non-empty line does not split into exactly two
            parts on ``'***'``.

    The original assembled Python source text from the file content and ran
    it through ``eval``; any quote or backslash in the data broke it and the
    file could execute arbitrary code. The training pairs are now used
    directly.
    """
    newsTrainer = Trainer(tokenizer)
    with open('o', 'rt', encoding='utf8') as csvfile:
        for line in csvfile:
            if line == '\n':
                continue
            theme, text = line.split('***')
            newsTrainer.train(text.strip(), str(theme))

    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(_str)
    return sorted(classification, key=lambda x: -x[1])
def create_nbc_nb_classifier(training_dataset):
    """Train on de-duplicated token sets and return ``(trainer, classifier)``.

    Parameters:
        training_dataset: A pair ``(examples, annotations)``. Each example
            is passed through ``tokenize`` and reduced to a ``set`` before
            training; pairing uses ``zip``.

    Returns:
        A tuple of the ``Trainer`` and a ``Classifier`` built from its data.
    """
    training_examples, training_annotations = training_dataset

    token_sets = []
    for example in training_examples:
        token_sets.append(set(tokenize(example)))

    trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for token_set, annotation in zip(token_sets, training_annotations):
        trainer.train(token_set, annotation)

    print("number of tokens seen: %s" % len(trainer.data.frequencies.keys()))

    classifier = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    return trainer, classifier
def tweet_classification(unknownInstance):
    """Classify ``unknownInstance`` and return up to three relative scores.

    ``train.txt`` is read line by line; the part before the first space is
    the category and the rest is the training text. Lines without a space
    are skipped (they used to raise IndexError).

    Returns:
        A dict mapping each of the first three categories (fewer when the
        classifier reports fewer) to its score divided by the first entry's
        score. Empty when there is no result or the first score is 0.0.
    """
    newsTrainer = Trainer(tokenizer)
    with open("train.txt") as f:
        for line in f:
            parts = line.split(' ', 1)
            if len(parts) < 2:
                continue
            newsTrainer.train(parts[1], parts[0])

    newsClassifier = Classifier(newsTrainer.data, tokenizer)

    # The classification holds the possible categories sorted by their
    # probability value.
    classification = newsClassifier.classify(unknownInstance)

    ans = dict()
    if not classification or classification[0][1] == 0.0:
        return ans

    top_score = classification[0][1]
    # Slicing never runs past the end, unlike the former range(3) indexing.
    for category, score in classification[:3]:
        ans[category] = score / top_score
    return ans
def determine(sentence):
    """Classify ``sentence`` using the Fact/Decision rows of ``data.csv``.

    Returns:
        A two-item list of strings: the text before the decimal point of the
        second result's score, then of the first result's score.
    """
    newsTrainer = Trainer(tokenizer)

    with open('data.csv') as csvfile:
        newsSet = [{'fact': row['Fact'], 'decision': row['Decision']}
                   for row in csv.DictReader(csvfile)]

    for news in newsSet:
        newsTrainer.train(news['fact'], news['decision'])

    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(sentence)

    # NOTE(review): the original labelled index 0 "False" and index 1 "True".
    # If classify() orders its results by score rather than by label, these
    # positions do not identify the categories -- confirm.
    first_score = str(classification[0][1]).split('.')[0]
    second_score = str(classification[1][1]).split('.')[0]

    return [second_score, first_score]
class NaiveBayesClassifier:
    """Two-class ('politics' / 'gossiping') article classifier kept in memory.

    ``classify`` builds its classifier from ``self.articleTrainer.data``, so
    ``train`` has to be called on the same instance first for the result to
    reflect any training.
    """

    def __init__(self):
        jieba.set_dictionary('dict.big.txt')
        self.articleTrainer = Trainer(tokenizer)

    def _train_on(self, articles, label):
        """Train every article's extracted tags under ``label``."""
        for a in articles:
            seg_list = jieba.analyse.extract_tags(a.body)
            self.articleTrainer.train(" ".join(seg_list), label)

    def train(self):
        """Train on the leading articles of both JSON dumps."""
        articles = article.create_articles_from_file("data/HatePoliticsdata.json")
        self._train_on(articles[0:3001], 'politics')

        articles = article.create_articles_from_file("data/Gossipingdata.json")
        # NOTE(review): this slice is one article shorter than the politics
        # slice above; kept as in the original -- confirm it is intended.
        self._train_on(articles[0:3000], 'gossiping')

    def classify(self, article):
        """Return the first category the classifier reports for ``article``.

        Parameters:
            article: Object whose ``body`` attribute holds the text.

        The original referenced the undefined names ``articleTrainer``,
        ``a`` and ``articleClassifier`` and therefore always raised
        NameError; they now resolve to ``self`` and the ``article`` argument.
        """
        self.articleClassifier = Classifier(self.articleTrainer.data, tokenizer)
        seg_list = jieba.analyse.extract_tags(article.body)
        classification = self.articleClassifier.classify(" ".join(seg_list))
        return classification[0][0]
def post_logfile():
    """Classify the ``symptom`` query argument against ``Dataset.csv``.

    Each dataset line is split on commas; field 0 is trained as the category
    and field 1 as the text. Lines with fewer than two fields are skipped.

    Returns:
        A JSON object string built from ``dict(classification)``.

    Two defects of the original are fixed:
    * it iterated the file while also calling ``file.next()`` inside the
      loop, so every other line was thrown away and an odd line count could
      end in StopIteration;
    * ``log_file`` was only assigned for GET requests, so any other method
      failed with an unbound local.
    """
    # NOTE(review): the argument is now read for every request method --
    # confirm that no method-specific handling was intended.
    log_file = request.args['symptom']
    print(log_file)

    trainer = Trainer(tokenizer)
    with open("Dataset.csv", "r") as dataset:
        for line in dataset:
            fields = line.split(",")
            if len(fields) < 2:
                continue
            # NOTE(review): if the file starts with a header row it is now
            # trained like any other row -- confirm against the data.
            trainer.train(fields[1], fields[0])

    diseaseclassifier = Classifier(trainer.data, tokenizer)
    classification = diseaseclassifier.classify(log_file)
    print(classification)
    return json.dumps(dict(classification))
def main():
    """Run the labelling, training and evaluation pipeline end to end.

    Marks and combines the labelled data, loads the combined and unlabeled
    arrays, splits the combined data 60/40, summarises the training part by
    class, predicts the test part and prints the accuracy.

    The unused ``testTrainer``, ``labels`` and ``features`` locals of the
    original were removed; nothing read them.
    """
    mark_unlabel(unlabeled_csv_link, manual_labeled_link)
    combine_labeled_data(manual_labeled_link, positive_labeled_link,
                         combined_PU_link)

    training_data = np.load(combined_PU_link)
    unlabeled_data = np.load(unlabeled_npy_link)
    print(unlabeled_data)

    training_data, test_data = splitDataset(training_data, 0.6)

    summaries = summarizeByClass(training_data)
    print(summaries)

    prediction = getPredictions(summaries, test_data)
    print(prediction)

    accuracy = getAccuracy(test_data, prediction)
    print('Accuracy: {}'.format(accuracy))
def post_logfile():
    """Classify the ``symptom`` query argument and return the result as JSON.

    Each ``Dataset.csv`` line is split on commas; field 0 is trained as the
    category and field 1 as the text. Lines with fewer than two fields are
    skipped.

    Returns:
        An indented JSON array string with one element per classification
        entry, each obtained from ``CustomType(...).toJSON()``.

    Two defects of the original are fixed:
    * it iterated the file while also calling ``file.next()`` inside the
      loop, so every other line was thrown away and an odd line count could
      end in StopIteration;
    * ``log_file`` was only assigned for POST requests, so any other method
      failed with an unbound local.
    """
    # NOTE(review): the argument is now read for every request method --
    # confirm that no method-specific handling was intended.
    log_file = request.args['symptom']
    print(log_file)

    trainer = Trainer(tokenizer)
    with open("Dataset.csv", "r") as dataset:
        for line in dataset:
            fields = line.split(",")
            if len(fields) < 2:
                continue
            # NOTE(review): if the file starts with a header row it is now
            # trained like any other row -- confirm against the data.
            trainer.train(fields[1], fields[0])

    diseaseclassifier = Classifier(trainer.data, tokenizer)
    classification = diseaseclassifier.classify(log_file)
    print(classification)

    result = []
    for item in classification:
        obj = CustomType(item[0], item[1])
        result.append(json.loads(obj.toJSON()))
    return json.dumps(result, indent=4)
class DomainModel:
    """Classifier trained from the rows of a ``Data`` interface object.

    The trainer is per instance and rebuilt by every ``train()`` call. It
    used to be one class-level object, so separate models and repeated
    ``train()`` calls all accumulated into the same counts.
    """

    # Class-level default; always replaced per instance in __init__.
    data_interface = []
    # Misspelled attribute kept so existing readers of it keep working.
    newClassifier = None
    # Set by train(); None until then.
    newsClassifier = None

    def __init__(self, data_interface):
        """
        Constructor: Store data interface on creation,
        Don't train yet, let parent decide when

        Raises:
            ValueError: when ``data_interface`` is not a ``Data`` instance.
        """
        if not isinstance(data_interface, Data):
            raise ValueError(
                "Data is not properly interfaced through class Data")
        self.data_interface = data_interface
        self.newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def train(self):
        """Train on base and FB data"""
        # Start from an empty trainer so that retraining does not count the
        # same rows twice.
        self.newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

        # Run through each training example in data interface and
        # feed them into model
        for data_point in self.data_interface.arr:
            data_class = data_point[2].strip()  # Class is "Credibility"
            data_text = data_point[4].strip()  # Text is "Content URL"
            self.newsTrainer.train(data_text, data_class)

        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        """Return the classifier's result; requires ``train()`` first."""
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
import linecache import pandas as pd traincsv = np.loadtxt('../3.training_data/Training_Data.csv', skiprows=1, delimiter=",", dtype={ 'names': ('documentID', 'category'), 'formats': ('S30', 'i4') }) testcsv = np.genfromtxt('../4.test_data/Test_Data.csv', skip_header=1, delimiter=",", dtype=[('documentID', 'S30'), ('category', 'i4')]) #datacsv = np.genfromtxt ('./2.document_set/document_set.csv',skip_header =1, delimiter=",") documentTrainer = Trainer(tokenizer) documentSet = [] def getTextBasedOnDocumentID(documentID): ID = int(documentID.split('_')[1]) line = linecache.getline('../2.document_set/document_set.csv', ID + 2) text = line.split(',"')[1] return text for i in range(0, len(traincsv)): documentSet.append({ 'text': getTextBasedOnDocumentID(traincsv[i][0]), 'category': traincsv[i][1]
def __init__(self):
    """Select the 'dict.big.txt' jieba dictionary and create the trainer."""
    dictionary_path = 'dict.big.txt'
    jieba.set_dictionary(dictionary_path)
    self.articleTrainer = Trainer(tokenizer)
['Cabbage Loopers', 'holes on leaves'], [ 'Cutworms', 'fat caterpillars, basically gray, brown, or black with 41 to 51 mm long when fully grown' ], ['Cutworms', 'damaged stem'], [ 'Bacterial Leaf Spot', 'small water-soaked spots on older leaves then quickly turn black' ], ['Bacterial Leaf Spot', 'holes on leaves'], ['Lettuce Drop', 'older leaves wilt'], ['Lettuce Drop', 'older leaves collapse'], ['Lettuce Drop', 'brown crown tissue'], ['Lettuce Drop', 'holes on leaves'], ['Anthracnose', 'water-soaked spots that turn yellow'], [ 'Anthracnose', 'white to pink spore masses of the fungus in the center of the lesions' ], ['Anthracnose', 'damaged leaf becomes papery'], ['Anthracnose', 'holes on leaves'], ['Tipburn', 'browing of leaf margins'], ['Tipburn', 'brown veins'] ] disease_classifier = Trainer(tokenizer) for data in dataset: disease_classifier.train(data[1], data[0]) disease_classifier = Classifier(disease_classifier.data, tokenizer) classifications = disease_classifier.classify(sys.argv[1]) classifications_list = [] for classification in classifications: classifications_list.append(classification[0]) print json.dumps({'classifications': classifications_list})
"""
We have a list of comments and their sentiments.
This trains the system with these pre-classified texts.
"""
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

sentimentTrainer = Trainer(tokenizer)

# Get the training dataset.
with open('training.csv', 'r') as f:
    data = f.read()
trainset = data.splitlines()

for line in trainset:
    pos1 = line.find(',"')
    if pos1 != -1:
        # Quoted comment: it starts after ',"' and the sentiment follows
        # the closing '",'.
        pos2 = line.find('",', pos1)
        # NOTE(review): "pos2 - 2" drops the last two characters before the
        # closing quote; kept exactly as in the original -- confirm intent.
        comment = line[pos1 + 2:pos2 - 2]
        sentiment = line[pos2 + 2:]
    else:
        # Unquoted comment: it sits between the first two commas.
        pos1 = line.find(',')
        pos2 = line.find(',', pos1 + 1)
        comment = line[pos1 + 1:pos2]
        sentiment = line[pos2 + 1:]
    sentimentTrainer.train(comment, sentiment)

# Use the classifier.
sentimentClassifier = Classifier(sentimentTrainer.data, tokenizer)
class DomainModel:
    """Classifier trained on ``res/data/base_data.csv`` and ``fb_data.csv``.

    Training state lives on the instance. It used to be class-level, so
    every new instance re-appended the same rows to one shared list and
    retrained one shared trainer.
    """

    # Misspelled attribute kept so existing readers of it keep working.
    newClassifier = None
    # Set by train().
    newsClassifier = None

    def __init__(self):
        self.training_data = []
        self.newsTrainer = None
        self.train()

    def _load_rows(self, path, skip_blank_first_column, real_text_column):
        """Append one ``{'class', 'text'}`` dict per usable row of ``path``.

        Column 2 is the class. The text comes from ``real_text_column`` when
        the class is ``"Real"`` and from column 5 otherwise.
        """
        with open(path, 'r') as csv_file:
            for row_number, row in enumerate(csv.reader(csv_file), 1):
                # The first row is always skipped (presumably a header).
                if row_number == 1 or len(row) <= 2:
                    continue
                if skip_blank_first_column and len(row[0]) == 0:
                    continue
                label = row[2].strip()
                text_column = real_text_column if label == "Real" else 5
                if len(row) <= text_column:
                    # Too short to hold the text column; this used to raise
                    # IndexError.
                    continue
                read_dict = {'class': label, 'text': row[text_column].strip()}
                print(read_dict)
                self.training_data.append(read_dict)

    def train(self):
        """Train on base and FB data"""
        # Reset first so that calling train() again does not count rows twice.
        self.training_data = []
        self.newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

        # Accounting for our inconsistency in Spreadsheet: "Real" rows of
        # the base data keep their text in column 6 instead of 5.
        self._load_rows('res/data/base_data.csv', True, 6)
        print('---->>>>>><<<<<<<-------')
        self._load_rows('res/data/fb_data.csv', False, 5)

        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])

        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        """Return the classifier's result for ``unknownInstance``."""
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

tosTrainer = Trainer(tokenizer)


def get_corp(read_file):
    """Read a tab-separated file into a list of text/rating dicts.

    Every line is decoded as UTF-8 and stripped; the first tab-separated
    field becomes ``'text'`` and the second ``'rating'``.
    """
    corpus = []
    with open(read_file, "r") as r:
        for line in r:
            tabsep = line.decode('utf-8').strip().split('\t')
            corpus.append({'text': tabsep[0], 'rating': tabsep[1]})
    return corpus


# Get the corpus from a training set - using copyright clauses here as an
# example (a subset of the csv generated by the getpointsdata.py script).
tosSet = get_corp("tosdr.org/copyrighttrainset.txt")

# The trainer has to be fed each text one by one.
for corpi in tosSet:
    tosTrainer.train(corpi['text'], corpi['rating'])

# With the training data collected, the classifier can be built.
tosClassifier = Classifier(tosTrainer.data, tokenizer)

# Try to classify the text of a policy clause whose rating is not known
# yet. The example here is drawn from the test set.
unknownInstance = "You are free to choose your own copyright license for your content in your account settings: Public Domain Creative Commons non commercial or free licenses but also classic copyright if you wish so."
classification = tosClassifier.classify(unknownInstance)
def get(self):
    """Train on today's "uy_press" politics news and classify a test headline.

    Stop words are read from ``stop_words.txt`` (one per line, decoded with
    ``unicode-escape``). Every news item matched by the query is trained
    under the category ``'politica'``; a fixed headline is then classified
    and the result printed.

    Any ``Exception`` is caught and its traceback printed instead of being
    raised. The former bare ``except:`` also swallowed KeyboardInterrupt
    and SystemExit.
    """
    try:
        print(" ")
        print("TestClassifier start")
        print(" ")

        # Load the stop words from the file into a list.
        with open("stop_words.txt", "r") as ins:
            array = [
                line.rstrip('\n').decode('unicode-escape') for line in ins
            ]

        # One tokenizer for both training and classification. The original
        # stripped "?!#%&_" while training but only "?!#%&" while
        # classifying, so the two sides tokenized text differently.
        news_tokenizer = tokenizer.Tokenizer(
            stop_words=array, signs_to_remove=["?!#%&_"])
        newsTrainer = Trainer(news_tokenizer)

        hoy = date.today()
        query = News3.query(News3.date == hoy, News3.news_from.IN([
            "uy_press",
        ]), News3.category == "Política")

        # The trainer has to be fed each text one by one.
        for i in query:
            print(" ")
            print(i.category)
            newsTrainer.train(i.html, 'politica')

        # With the training data collected, classify a text whose category
        # is not known yet.
        newsClassifier = Classifier(newsTrainer.data, news_tokenizer)
        classification = newsClassifier.classify(
            "Vidalín: No quiero que me llamen para saber qué tramite hay que hacer para poner un prostíbulo"
        )

        # The classification variable holds the detected categories, sorted.
        print(" classification ")
        print(classification)
    except Exception:
        print(traceback.format_exc())
# Part-of-speech tags whose words are kept for classification.
_CLASSIFIED_POS_TAGS = frozenset([
    'Noun', 'Verb', 'Adjective', 'Adverb', 'Exclamation', 'Alpha',
    'KoreanParticle'
])


def _train_aspect(data, aspect):
    """Return a Trainer fed with every rated comment sentence for ``aspect``.

    Subjects whose ``aspect`` value is ``"?"`` are skipped. Comments are
    split on periods and newlines, and blank pieces are ignored.
    """
    trainer = Trainer(tokenizer)
    for subject in data:
        if subject[aspect] != "?":
            for li in subject["comment"].replace('.', '\n').split("\n"):
                if len(li.strip()) != 0:
                    trainer.train(li, subject[aspect])
    return trainer


def classify(input):
    """Classify a review text for grade, load and lecture.

    Parameters:
        input: The review text. The name shadows the builtin but is kept
            because it is part of the public signature.

    Returns:
        A tuple ``(gradeClassification, loadClassification,
        lectureClassification)`` of the three classifiers' results.

    ``data.txt`` is parsed as JSON and three independent models are trained
    from it. Only the words of ``input`` whose part-of-speech tag is in
    ``_CLASSIFIED_POS_TAGS`` are joined with spaces and classified.
    """
    twitter = Twitter()

    # The context manager closes the file; the original never closed it.
    with open("data.txt", "r") as f:
        data = json.load(f)

    print("Training grade ...")
    gradeTrainer = _train_aspect(data, "grade")
    print("Training load ...")
    loadTrainer = _train_aspect(data, "load")
    print("Training lecture ...")
    lectureTrainer = _train_aspect(data, "lecture")

    gradeClassifier = Classifier(gradeTrainer.data, tokenizer)
    loadClassifier = Classifier(loadTrainer.data, tokenizer)
    lectureClassifier = Classifier(lectureTrainer.data, tokenizer)

    input = u"" + input
    classify_input = [
        element[0] for element in twitter.pos(input)
        if element[1] in _CLASSIFIED_POS_TAGS
    ]
    text = " ".join(classify_input)
    print(text)

    gradeClassification = gradeClassifier.classify(text)
    loadClassification = loadClassifier.classify(text)
    lectureClassification = lectureClassifier.classify(text)

    print(
        "\n________________________________________GRADE________________________________________\n"
    )
    print(gradeClassification)
    print(
        "\n________________________________________LOAD_________________________________________\n"
    )
    print(loadClassification)
    print(
        "\n________________________________________LECTURE______________________________________\n"
    )
    print(lectureClassification)

    return gradeClassification, loadClassification, lectureClassification
def article_keywords(article):
    """Return ``{'keyword': <space-joined keywords>}`` for ``article``.

    Looks up the ``Keywords`` row for the article, prints it and the joined
    keyword string, and wraps that string in a one-entry dict.
    """
    keys = Keywords.objects.get(article=article)
    print(keys)
    joined = " ".join([k.keyword for k in keys.keywords.all()])
    print(joined)
    return {'keyword': joined}


if __name__ == '__main__':
    print("Starting testing of Bayes Classifer")

    # Pair every article with its relevance label.
    labeled_articles = [
        (a, a.relevant)
        for a in Article.objects.all()[:(len(Article.objects.all()))]
    ]
    print(labeled_articles)

    featuresets = [(article_keywords(article), relevant)
                   for (article, relevant) in labeled_articles]
    print(featuresets)

    # The training set is the whole feature list; the test set is its last
    # two entries.
    train_set = featuresets[:(len(featuresets))]
    test_set = featuresets[(len(featuresets) - 2):]
    print(train_set)

    newsTrainer = Trainer(tokenizer)
    for f in train_set:
        newsTrainer.train(f[0]['keyword'], f[1])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)

    url = raw_input("Enter the url: ")
    testurl(url, newsClassifier)
def article_keywords(article):
    """Return ``{'keyword': <space-joined keywords>}`` for ``article``.

    Looks up the ``Keywords`` row for the article, prints it and the joined
    keyword string, and wraps that string in a one-entry dict.
    """
    keys = Keywords.objects.get(article=article)
    print(keys)
    joined = " ".join([k.keyword for k in keys.keywords.all()])
    print(joined)
    return {'keyword': joined}


if __name__ == '__main__':
    print("Starting testing of Bayes Classifer")

    # Pair every article with its relevance label.
    labeled_articles = [
        (a, a.relevant)
        for a in Article.objects.all()[:(len(Article.objects.all()))]
    ]
    print(labeled_articles)

    featuresets = [(article_keywords(article), relevant)
                   for (article, relevant) in labeled_articles]
    print(featuresets)

    # The training set is the whole feature list; the test set is its last
    # two entries.
    train_set = featuresets[:(len(featuresets))]
    test_set = featuresets[(len(featuresets) - 2):]
    print(train_set)

    newsTrainer = Trainer(tokenizer)
    for f in train_set:
        newsTrainer.train(f[0]['keyword'], f[1])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)

    url = raw_input("Enter the url: ")
    testurl(url, newsClassifier)
# import json,os,sys,re from naiveBayesClassifier import tokenizer from naiveBayesClassifier.trainer import Trainer from naiveBayesClassifier.classifier import Classifier newsTrainer = Trainer(tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"])) # You need to train the system passing each text one by one to the trainer module. newsSet =[ {'symptoms': 'pain chest', 'disease': 'hypertensive disease'}, {'symptoms': 'shortness of breath', 'disease': 'hypertensive disease'}, {'symptoms': 'dizziness', 'disease': 'hypertensive disease'}, {'symptoms': 'asthenia', 'disease': 'hypertensive disease'}, {'symptoms': 'fall', 'disease': 'hypertensive disease'}, {'symptoms': 'syncope', 'disease': 'hypertensive disease'}, {'symptoms': 'vertigo', 'disease': 'hypertensive disease'}, {'symptoms': 'sweat sweating increased', 'disease': 'hypertensive disease'}, {'symptoms': 'palpitation', 'disease': 'hypertensive disease'}, {'symptoms': 'nausea', 'disease': 'hypertensive disease'}, {'symptoms': 'angina pectoris', 'disease': 'hypertensive disease'}, {'symptoms': 'pressure chest', 'disease': 'hypertensive disease'}, {'symptoms': 'polyuria', 'disease': 'diabetes'}, {'symptoms': 'polydypsia', 'disease': 'diabetes'}, {'symptoms': 'shortness of breath', 'disease': 'diabetes'}, {'symptoms': 'asthenia', 'disease': 'diabetes'}, {'symptoms': 'nausea', 'disease': 'diabetes'}, {'symptoms': 'orthopnea', 'disease': 'diabetes'}, {'symptoms': 'sweat sweating increased', 'disease': 'diabetes'}, {'symptoms': 'unresponsiveness', 'disease': 'diabetes'},
from naiveBayesClassifier import tokenizer from naiveBayesClassifier.trainer import Trainer from naiveBayesClassifier.classifier import Classifier newsTrainer = Trainer(tokenizer) # You need to train the system passing each text one by one to the trainer module. newsSet = [{ 'text': 'sorry sir mobile you are calling is watstaph means call back in some time aur sanderson at by dynasty followed by the number', 'category': 'switchoff' }, { 'text': 'the number you are calling is others which off or not reachable at the moment please try later aapke dwara dial kiya gaya number ya abhi switched off hai ya network kshetra se bahar hai kripya', 'category': 'switchedoff' }, { 'text': 'jis grahak ko aap call kar rahe hain woh is samay available nahi hai ab aap apne karan tariff ke hisaab se voice message chod sakte hain voice message ke liye star ek dabaye the customer you are calling is on', 'category': 'unavailable' }, { 'text': 'her lines to the calls destination appised call again like call ki ek he number ke liye abhi bhi nine hi guest hai kripya dobara call karenge sir lines to the calls destination up', 'category': 'Busy' }, { 'text': 'the number you are calling is either switched off or not reachable at the moment please try later aapke dwara dial kiya gaya number ya abhi switched off hai ya network kshetra se bahar hai kripya', 'category': 'switchedoff' }, { 'text': 'the airtel subscriber you have called is speaking to someone else you can wait', 'category': 'Busy'
Suppose you have some texts of news and know their categories. You want to train a system with this pre-categorized/pre-classified texts. So, you have better call this data your training set. """ # import nltk from dataset import newsSet # from bank.bank import newsSet from naiveBayesClassifier.classifier import Classifier from naiveBayesClassifier.tokenizer import Tokenizer from naiveBayesClassifier.trainer import Trainer token = Tokenizer() newsTrainer = Trainer() # You need to train the system passing each text one by one to the trainer module. # newsSet = [ # {'question': 'Is there a 24 hour Customer Contact Centre?', # 'answer': 'Yes, we have a 24 Hour Customer Contact Centre where you can get support related to your banking enquiries. You can call the numbers: +263 772 244 788, +263...'}, # {'question': 'Is there a 24 hour Customer Contact Centre?', # 'answer': 'Yes, we have a 24 Hour Customer Contact Centre where you can get support related to your banking enquiries. You can call the numbers: +263 772 244 788, +263...'}, # {'question': 'Is there a way I can check my account balance other than contacting the branch?', # 'answer': 'Yes, you can check your balance through our ATM network, NMBMobile App or Internet Banking.'}, # {'question': 'What is an e-Statement?', # 'answer': 'An e-Statement is an electronic version of your paper bank statement which is emailed directly to your registered email address in a password protected PDF ...'}, # {'question': 'How can I transfer money to a bank account abroad?', # 'answer': 'This service is currently available for Corporate clients only, subject to availability of funds and the RBZ priority payments list'}, # {'question': 'How do I get internal funds transfer forms?',
import json
import os
import re
import sys

from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier
##IMPORTS

'''
Usage: python GuessDisease.py "symptomA symptomB symptomC"

Example INPUT:
python GuessDisease.py "agitation exhaustion vomit"

Example OUTPUT:
{ "disease": "influenza" }
'''

##SETTING UP
# Train on every row of Dataset.csv; each row is parsed as
# <DISEASE>,<SYMPTOM>.
#
# The previous loop iterated the file AND called file.next() inside the body,
# which consumed two lines per iteration: every other row was silently dropped
# and a file with an odd number of lines ended in StopIteration.
# NOTE(review): because every row is now read, a header row (if the CSV has
# one) is trained on as well - confirm against the dataset.
diseasetrainer = Trainer(tokenizer)
with open("Dataset.csv", "r") as dataset:
    for row in dataset:
        fields = row.split(",")
        if len(fields) < 2:
            # Blank or malformed row: nothing to train on.
            continue
        diseasetrainer.train(fields[1], fields[0])

# Classify the symptoms given as the first command-line argument and print
# the first entry of the classification result.
diseaseclassifier = Classifier(diseasetrainer.data, tokenizer)
classification = diseaseclassifier.classify(sys.argv[1])
print(classification[0])
class JiraClassifier:
    """Assign JIRA issues from ``JiraMetaData.xlsx`` to clusters.

    Rows whose ``Labels`` value is in ``constantsObj.INITIAL_CLUSTERS`` are
    treated as already clustered and used as training data; the remaining
    rows are classified from their ``KeyWords`` value.

    All state below is created once, in the class body: the spreadsheet is
    read and the three report files are opened for writing when the class is
    defined. The lists and the trainer are class attributes and are therefore
    shared by every instance.

    Each reporting method closes its output file before returning, so
    ``getClusteredJiraList``, ``getNonClusteredJiraList`` and
    ``classifyNonClusteredJira`` can each write only once per process.
    """

    # Source spreadsheet, loaded both as a DataFrame and as an openpyxl book.
    file_loc = "JiraMetaData.xlsx"
    df = pandas.read_excel(file_loc)
    pandas.set_option('display.max_colwidth', -1)
    workBook = openpyxl.load_workbook("JiraMetaData.xlsx")
    activeWorkSheet = workBook.get_sheet_by_name('JIRASelectedRawData')

    # Accumulators filled by the methods below.
    nonClusteredJiraList = []
    clusteredJiraList = []
    nonClusteredJirasAfterClusteringList = []
    issueSet = []

    # Report files, one per method that writes output.
    clusteredJiraFile = open("clusteredJiras.txt", "w")
    nonClusteredJiraFile = open("nonClusteredJiras.txt", "w")
    nonClusteredJirasAfterClusteringFile = open(
        "nonClusteredJirasAfterClustering.txt", "w")

    jiraTrainer = Trainer(tokenizer)

    def getClusteredJiraList(self):
        """Collect the keywords of rows that already belong to a cluster.

        Writes ``<keywords> --- <label>`` per matching row to
        ``clusteredJiras.txt`` and closes that file.

        Returns:
            list: ``clusteredJiraList`` with the matching keywords appended.
        """
        for _, row in self.df.iterrows():
            clusterName = row['Labels']
            keyWords = row['KeyWords']
            if clusterName in constantsObj.INITIAL_CLUSTERS:
                self.clusteredJiraFile.write(
                    "%s --- %s\n" % (keyWords, clusterName))
                self.clusteredJiraList.append(keyWords)
        self.clusteredJiraFile.close()
        return self.clusteredJiraList

    def getNonClusteredJiraList(self):
        """Collect the keywords of rows that belong to no initial cluster.

        Writes one keywords line per matching row to
        ``nonClusteredJiras.txt`` and closes that file.

        Returns:
            list: ``nonClusteredJiraList`` with the matching keywords
            appended.
        """
        for _, row in self.df.iterrows():
            keyWords = row['KeyWords']
            if row['Labels'] not in constantsObj.INITIAL_CLUSTERS:
                self.nonClusteredJiraFile.write("%s\n" % (keyWords,))
                self.nonClusteredJiraList.append(keyWords)
        self.nonClusteredJiraFile.close()
        return self.nonClusteredJiraList

    def classifyNonClusteredJira(self):
        """Train on the clustered rows, then classify every other row.

        Each classified row is appended to ``issueSet`` and written as
        ``<keywords> --- <cluster>`` to
        ``nonClusteredJirasAfterClustering.txt``, which is closed afterwards.
        The result is not written back to the workbook.

        Returns:
            list: ``issueSet``, holding ``{"class", "sentence"}`` dicts for
            the clustered rows followed by the newly classified ones.
        """
        # Pass 1: rows with a known cluster become training samples.
        for _, row in self.df.iterrows():
            if row['Labels'] in constantsObj.INITIAL_CLUSTERS:
                self.issueSet.append({
                    "class": row['Labels'],
                    "sentence": row['KeyWords']
                })

        for issue in self.issueSet:
            self.jiraTrainer.train(issue['sentence'], issue['class'])
        jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)

        # Pass 2: every remaining row gets the classifier's first result.
        for _, row in self.df.iterrows():
            keyWords = row['KeyWords']
            if row['Labels'] not in constantsObj.INITIAL_CLUSTERS:
                # classify() yields an indexable result whose first entry
                # starts with the cluster name - presumably (label, score)
                # pairs; TODO confirm against the library.
                identifiedCluster = jiraClassifier.classify(keyWords)[0][0]
                self.issueSet.append({
                    "class": identifiedCluster,
                    "sentence": keyWords
                })
                self.nonClusteredJirasAfterClusteringFile.write(
                    "%s --- %s\n" % (keyWords, identifiedCluster))

        self.nonClusteredJirasAfterClusteringFile.close()
        return self.issueSet

    def classifyNewJiraToOneOfTheClusters(self, inputTrainingData, inputJira):
        """Classify one new issue after training on the supplied samples.

        Parameters:
            inputTrainingData: Iterable of dicts with ``'sentence'`` and
                ``'class'`` keys; each is fed to the shared trainer.
            inputJira: Text of the issue to classify.

        Returns:
            The unmodified result of ``Classifier.classify(inputJira)``.
        """
        for item in inputTrainingData:
            self.jiraTrainer.train(item['sentence'], item['class'])
        jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)
        return jiraClassifier.classify(inputJira)
# -*-coding:utf-8-*- import article import jieba import jieba.analyse jieba.set_dictionary('dict.big.txt') from naiveBayesClassifier import tokenizer from naiveBayesClassifier.trainer import Trainer from naiveBayesClassifier.classifier import Classifier # Training articleTrainer = Trainer(tokenizer) articles = article.create_articles_from_file("data/HatePoliticsdata.json") p_train = articles[0:3001] p_test = articles[3001:3031] for a in p_train: doc = a.body #seg_list = jieba.lcut(doc, cut_all=False) seg_list = jieba.analyse.extract_tags(doc) doc = " ".join(seg_list) articleTrainer.train(doc, 'politics') articles = article.create_articles_from_file("data/Gossipingdata.json") g_train = articles[0:3000] g_test = articles[3001:3301] for a in g_train: doc = a.body
""" Suppose you have some texts of news and know their categories. You want to train a system with this pre-categorized/pre-classified texts. So, you have better call this data your training set. """ from naiveBayesClassifier import tokenizer from naiveBayesClassifier.trainer import Trainer from naiveBayesClassifier.classifier import Classifier newsTrainer = Trainer(tokenizer) # You need to train the system passing each text one by one to the trainer module. newsSet =[ {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'}, {'text': 'Russia try to invade Ukraine', 'category': 'politics'}, {'text': 'do not neglect exercise', 'category': 'health'}, {'text': 'Syria is the main issue, Obama says', 'category': 'politics'}, {'text': 'eat to lose weight', 'category': 'health'}, {'text': 'you should not eat much', 'category': 'health'} ] for news in newsSet: newsTrainer.train(news['text'], news['category']) # When you have sufficient trained data, you are almost done and can start to use # a classifier. newsClassifier = Classifier(newsTrainer.data, tokenizer) # Now you have a classifier which can give a try to classifiy text of news whose # category is unknown, yet. classification = newsClassifier.classify("Obama is")