def classify(filename, size):
    """Train on a chronological train/test split and report how many
    malicious URLs were caught and missed."""
    trainingSet, testingSet = make_chronological_sets.create_sets(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    for sample in testingSet:
        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']
        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    prop_caught = float(mal_mal) / float(mal_mal + clean_mal)
    prop_missed = float(clean_mal) / float(mal_mal + clean_mal)

    ## Stuff to get proportions:
    # size = float(size)
    # mal_mal = float(mal_mal)/size
    # mal_clean = float(mal_clean)/size
    # clean_mal = float(clean_mal)/size
    # clean_clean = float(clean_clean)/size

    ## Confusion matrix stuff:
    # confusionMatrix = [['Actually malicious', mal_mal, clean_mal],
    #                    ['Actually clean', mal_clean, clean_clean]]
    # print tabulate(confusionMatrix, headers=['', 'Predicted malicious', 'Predicted clean'])

    print "Total: " + str(mal_mal + mal_clean + clean_mal + clean_clean)
    print "Malware: " + str(mal_mal + clean_mal)
    print "Clean: " + str(mal_clean + clean_clean)
    print "Caught: " + str(mal_mal) + " (" + "{:.1%}".format(prop_caught) + ")"
    print "Missed: " + str(clean_mal) + " (" + "{:.1%}".format(prop_missed) + ")"
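
# A minimal sketch of the setup the evaluation above assumes: the imports follow
# the naiveBayesClassifier package layout, make_chronological_sets is the local
# helper the function expects, and the CSV path and set size are placeholders.
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier
import make_chronological_sets  # local helper module, not part of the package

if __name__ == '__main__':
    classify('urls.csv', 5000)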
def classify(filename, size, url, result):
    """Train on a sample set from `filename` and classify a single URL
    against its expected label."""
    trainingSet = make_training_set.create_set(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    print "Expected: " + result
    print classifier.classify(url)
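
# Hypothetical invocation of the single-URL check above; the filename, training
# size, URL, and expected label are all placeholder values.
classify('urls.csv', 1000, 'http://example.com/free-prizes', 'malicious')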
def create_naive_bayes_classifier(training_examples, training_annotations):
    print("creating naive bayes classifier")
    annotations = [categories[x] for x in training_annotations]
    news_trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[tokenization_string]))
    for example, annotation in zip(training_examples, annotations):
        news_trainer.train(example, annotation)
    classifier = Classifier(
        news_trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[tokenization_string]))
    print("\t->done")
    return classifier
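
# Toy usage of the factory above, assuming `categories` maps raw annotations to
# label strings and `tokenization_string` holds the characters to strip; both
# values and the sample texts here are illustrative only. classify() on the
# result returns the candidate categories sorted by probability.
categories = {0: 'health', 1: 'politics'}
tokenization_string = "?!#%&"
clf = create_naive_bayes_classifier(
    ['eat to lose weight', 'Syria is the main issue, Obama says'],
    [0, 1])
print(clf.classify('do not neglect exercise'))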
class DomainModel:
    training_data = []
    newsTrainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
    newsClassifier = None

    def __init__(self):
        self.train()

    # TODO: Train on FB data too
    def train(self):
        with open('src/URL.csv', 'r') as csv_file:
            reader = csv_file.readlines()
            for line in reader:
                read_dict = {}
                line_split = line.split(',')
                if len(line_split) < 2 or len(line_split[0]) == 0:
                    continue
                read_dict['text'] = line_split[0].strip()
                read_dict['class'] = line_split[1].strip()
                self.training_data.append(read_dict)
        #print training_data
        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])
        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
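
# Example use of the class above, as a sketch: constructing DomainModel trains
# it immediately from src/URL.csv, and classify() returns (class, probability)
# pairs sorted by likelihood. The URL here is a placeholder.
model = DomainModel()
print(model.classify('http://example.com/login-verify'))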
def __init__(self, journal=None):
    """Classifier initialization.

    Parameters:
        journal (str): Journal file string to import.
    """
    self._tknizer = tokenizer.Tokenizer(signs_to_remove=['?!%.'])
    self._trainer = Trainer(self._tknizer)
    if journal is not None:
        journal_data = train_journal(journal)
        for group in journal_data:
            # 0: Allocation account.
            # 1: List of transactions.
            # 2: Greatest common multiple of values in transactions.
            for transaction in group[1]:
                # 0: Transaction payee string.
                # 1: Allocation account.
                self._trainer.train(transaction[0], transaction[1])
        self._classifier = BayesClassifier(
            self._trainer.data,
            self._tknizer
        )
    else:
        self._classifier = None
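
# Hypothetical usage, assuming the __init__ above belongs to a class named
# PayeeClassifier (the real class name is not shown here) and that train_journal
# yields (account, transactions, multiple) groups as the comments describe.
# clf = PayeeClassifier(journal='ledger.journal')
# prediction = clf._classifier.classify('GROCERY STORE 123')  # or via whatever
# public wrapper the class exposes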
def train(self): """Train on base and FB data""" with open('res/data/base_data.csv', 'r') as csv_file: reader = csv.reader(csv_file) i = 0 for line in reader: i += 1 line_split = line read_dict = {} if i == 1 or len(line_split) <= 2 or len(line_split[0]) == 0: continue read_dict['class'] = line_split[2].strip() # Accounting for our inconsistency in Spreadsheet if read_dict["class"] == "Real": read_dict['text'] = line_split[6].strip() else: read_dict['text'] = line_split[5].strip() print(read_dict) self.training_data.append(read_dict) print('---->>>>>><<<<<<<-------') with open('res/data/fb_data.csv', 'r') as csv_file: reader = csv.reader(csv_file) i = 0 for line in reader: i += 1 line_split = line read_dict = {} if i == 1 or len(line_split) <= 2: continue read_dict['class'] = line_split[2].strip() read_dict['text'] = line_split[5].strip() print(read_dict) self.training_data.append(read_dict) #print training_data for data in self.training_data: self.newsTrainer.train(data['text'], data['class']) self.newsClassifier = Classifier( self.newsTrainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
def classify(filename, size):
    trainingSet, testingSet = make_balanced_sets.create_sets(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))
    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])
    classifier = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    for sample in testingSet:
        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']
        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    # Convert raw counts to proportions of the test set.
    size = float(size)
    mal_mal = float(mal_mal) / size
    mal_clean = float(mal_clean) / size
    clean_mal = float(clean_mal) / size
    clean_clean = float(clean_clean) / size

    confusionMatrix = [[mal_mal, clean_mal], [mal_clean, clean_clean]]
    pprint(confusionMatrix)
    print "Accuracy: " + str(mal_mal + clean_clean)
    print "False negatives (predicted clean when malicious): " + str(clean_mal)
    print "False positives (predicted malicious when clean): " + str(mal_clean)
def create_nbc_nb_classifier(training_dataset):
    training_examples, training_annotations = training_dataset
    # training_annotations = [int(not bool(annotation)) for annotation in training_annotations]
    parsed_training_examples = [
        set(tokenize(example)) for example in training_examples
    ]
    tr = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[tokenization_string]))
    for example, annotation in zip(parsed_training_examples, training_annotations):
        tr.train(example, annotation)
    print("number of tokens seen: %s" % len(tr.data.frequencies.keys()))
    return tr, Classifier(
        tr.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[tokenization_string]))
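
# A sketch of calling the factory above; tokenize and tokenization_string are
# assumed to be module-level helpers in the original project, and the toy
# dataset is illustrative. Note that examples are pre-tokenized into sets, so
# each distinct token counts once per example.
# examples = ['cheap pills online now', 'meeting agenda for Monday']
# annotations = ['spam', 'ham']
# trainer_obj, clf = create_nbc_nb_classifier((examples, annotations))
# print(clf.classify(set(tokenize('cheap pills'))))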
def train(self): """Train on base and FB data""" # Run through each training example in data interface and # feed them into model for data_point in self.data_interface.arr: data_class = data_point[2].strip() # Class is "Credibility" data_text = data_point[4].strip() # Text is "Content URL" self.newsTrainer.train(data_text, data_class) self.newsClassifier = Classifier(self.newsTrainer.data, \ tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))
def main():
    testTrainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
    mark_unlabel(unlabeled_csv_link, manual_labeled_link)
    combine_labeled_data(manual_labeled_link, positive_labeled_link,
                         combined_PU_link)
    training_data = np.load(combined_PU_link)
    unlabeled_data = np.load(unlabeled_npy_link)
    print(unlabeled_data)
    labels = training_data[:, 0]
    features = training_data[:, 1:]
    training_data, test_data = splitDataset(training_data, 0.6)
    summaries = summarizeByClass(training_data)
    print(summaries)
    prediction = getPredictions(summaries, test_data)
    print(prediction)
    accuracy = getAccuracy(test_data, prediction)
    print('Accuracy: {}'.format(accuracy))
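
# Standard entry-point guard for the script above, assuming the *_link paths
# and the helper functions (mark_unlabel, combine_labeled_data, splitDataset,
# summarizeByClass, getPredictions, getAccuracy) are defined at module level.
if __name__ == '__main__':
    main()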
class DomainModel:
    data_interface = []
    newsTrainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
    newsClassifier = None

    def __init__(self, data_interface):
        """
        Constructor: Store data interface on creation,
        Don't train yet, let parent decide when
        """
        if not isinstance(data_interface, Data):
            raise ValueError(
                "Data is not properly interfaced through class Data")
        self.data_interface = data_interface

    def train(self):
        """Train on base and FB data"""
        # Run through each training example in data interface and
        # feed them into model
        for data_point in self.data_interface.arr:
            data_class = data_point[2].strip()  # Class is "Credibility"
            data_text = data_point[4].strip()   # Text is "Content URL"
            self.newsTrainer.train(data_text, data_class)
        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

    def classify(self, unknownInstance):
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
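
# Hypothetical wiring of the class above: Data is the project's interface
# class, and the row layout (index 2 = credibility class, index 4 = content
# URL) follows the comments in train(). Values here are illustrative.
# data = Data('res/data/base_data.csv')   # however Data is constructed
# model = DomainModel(data)
# model.train()
# print(model.classify('http://example.com/fake-news-story'))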
def get(self):
    try:
        print " "
        print "TestClassifier start"
        print " "
        # Load the stop words from the file into a list.
        with open("stop_words.txt", "r") as ins:
            array = []
            for line in ins:
                array.append((line.rstrip('\n')).decode('unicode-escape'))
        #print array

        newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=array, signs_to_remove=["?!#%&_"]))

        hoy = date.today()
        query = News3.query(News3.date == hoy,
                            News3.news_from.IN(["uy_press", ]),
                            News3.category == "Política")

        # You need to train the system passing each text one by one to the trainer module.
        #newsSet = [
        #    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
        #    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
        #    {'text': 'do not neglect exercise', 'category': 'health'},
        #    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
        #    {'text': 'eat to lose weight', 'category': 'health'},
        #    {'text': 'you should not eat much', 'category': 'health'}
        #]

        query2 = News3.query(News3.date == hoy,
                             News3.news_from == "uy_press",
                             News3.category == "deportes")
        query4 = News3.query(News3.date == hoy,
                             News3.news_from == "uy_press",
                             News3.category == "salud")

        #for news in newsSet:
        #    newsTrainer.train(news['text'], news['category'])

        c = 0
        #print query
        for i in query:
            print " "
            print i.category
            newsTrainer.train(i.html, 'politica')
            #if c == 10: break
            c += 1

        #for i in query2:
        #    newsTrainer.train(i.html, 'deportes')
        #raise Exception('I know Python!')
        #for i in query4:
        #    newsTrainer.train(i.html, 'salud')

        # When you have sufficient trained data, you are almost done and can
        # start to use a classifier. Now you have a classifier which can give
        # a try to classify text of news whose category is unknown, yet.
        query3 = News3.query(
            News3.date == hoy,
            News3.news_from.IN(["el_pais", ]),
            News3.id.IN([0]),
        )

        newsClassifier = Classifier(
            newsTrainer.data,
            tokenizer.Tokenizer(stop_words=array, signs_to_remove=["?!#%&"]))

        #print unknownInstance
        classification = newsClassifier.classify(
            "Vidalín: No quiero que me llamen para saber qué tramite hay que hacer para poner un prostíbulo"
        )

        # the classification variable holds the detected categories sorted
        print " classification "
        print(classification)
    except:
        print traceback.format_exc()
# import json,os,sys,re
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

newsTrainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))

# You need to train the system passing each text one by one to the trainer module.
newsSet = [
    {'symptoms': 'pain chest', 'disease': 'hypertensive disease'},
    {'symptoms': 'shortness of breath', 'disease': 'hypertensive disease'},
    {'symptoms': 'dizziness', 'disease': 'hypertensive disease'},
    {'symptoms': 'asthenia', 'disease': 'hypertensive disease'},
    {'symptoms': 'fall', 'disease': 'hypertensive disease'},
    {'symptoms': 'syncope', 'disease': 'hypertensive disease'},
    {'symptoms': 'vertigo', 'disease': 'hypertensive disease'},
    {'symptoms': 'sweat sweating increased', 'disease': 'hypertensive disease'},
    {'symptoms': 'palpitation', 'disease': 'hypertensive disease'},
    {'symptoms': 'nausea', 'disease': 'hypertensive disease'},
    {'symptoms': 'angina pectoris', 'disease': 'hypertensive disease'},
    {'symptoms': 'pressure chest', 'disease': 'hypertensive disease'},
    {'symptoms': 'polyuria', 'disease': 'diabetes'},
    {'symptoms': 'polydypsia', 'disease': 'diabetes'},
    {'symptoms': 'shortness of breath', 'disease': 'diabetes'},
    {'symptoms': 'asthenia', 'disease': 'diabetes'},
    {'symptoms': 'nausea', 'disease': 'diabetes'},
    {'symptoms': 'orthopnea', 'disease': 'diabetes'},
    {'symptoms': 'sweat sweating increased', 'disease': 'diabetes'},
    {'symptoms': 'unresponsiveness', 'disease': 'diabetes'},
def classify(filename, size):
    trainingSet = make_training_set.create_set(filename, size)
    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    # Seed the classifier with the first sample, then classify each remaining
    # sample before training on it (online evaluation).
    trainer.train(trainingSet[0]['url'], trainingSet[0]['result'])
    classifier = Classifier(
        trainer.data,
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    out = open("mislabeled.txt", "w")
    for sample in trainingSet[1:]:
        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']
        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            out.write(sample['url'] + '\n')
            clean_mal += 1
        trainer.train(sample['url'], sample['result'])
        classifier = Classifier(
            trainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    total = float(mal_mal + mal_clean + clean_mal + clean_clean)
    prop_caught = float(mal_mal + clean_clean) / total
    prop_missed = float(clean_mal + mal_clean) / total
    false_negative = float(clean_mal) / float(mal_mal + clean_mal)

    ## Stuff to get proportions:
    # size = float(size)
    # mal_mal = float(mal_mal)/size
    # mal_clean = float(mal_clean)/size
    # clean_mal = float(clean_mal)/size
    # clean_clean = float(clean_clean)/size

    ## Confusion matrix stuff:
    # confusionMatrix = [['Actually malicious', mal_mal, clean_mal],
    #                    ['Actually clean', mal_clean, clean_clean]]
    # print tabulate(confusionMatrix, headers=['', 'Predicted malicious', 'Predicted clean'])

    print "Total: " + str(int(total))
    print "Malware: " + str(mal_mal + clean_mal)
    print "Clean: " + str(mal_clean + clean_clean)
    print "Caught: " + str(mal_mal + clean_clean) + " (" + "{:.1%}".format(
        prop_caught) + " of all samples)"
    print "Missed: " + str(clean_mal + mal_clean) + " (" + "{:.1%}".format(
        prop_missed) + " of all samples)"
    print "Malicious missed: " + str(clean_mal) + " (" + "{:.1%}".format(
        false_negative) + " of all malicious samples)"