def setUp(self):
    # create a dummy ClassifierModel object
    self.classifier_model = ClassifierModel.objects.create(
        version=1,
        data=b"This is just dummy data. PLEASE, DON'T UNPICKLE THIS !!",
        name="Dummy model"
    )
    self.classifier_class = SKNaiveBayesClassifier
    self.data = get_processed_data(csv_path)
import pickle


def create_classifier_model(
        version, csv_path,
        classifier_class=SKNaiveBayesClassifier,
        confusion_matrix=True):
    """
    Create a new classifier object to save to the database

    Parameters
    ----------
    @version : version of the classifier model
    @csv_path : path to a CSV of labeled data [(text, classification), ...]
    @classifier_class : classifier class to use to create the model
    @confusion_matrix : whether to compute a confusion matrix on the test set
    """
    # check if version already exists
    try:
        ClassifierModel.objects.get(version=version)
        raise Exception("Classifier version {} already exists".format(version))
    except ClassifierModel.DoesNotExist:
        pass
    from helpers.deep import get_processed_data
    data = get_processed_data(csv_path)
    # get train, test data
    train, test = create_train_test_data(data)
    classifier = classifier_class.new(train)
    accuracy = classifier.get_accuracy(test)
    if confusion_matrix:
        classifier.calculate_confusion_matrix(test)
    pickle_data = pickle.dumps(classifier)
    modelobj = ClassifierModel(
        data=pickle_data,
        accuracy=accuracy,
        version=version,
        name=classifier_class.__name__
    )
    # persist the pickled test set alongside the model so it can be
    # re-evaluated later
    testfilename = 'test_data_v-{}.pkl'.format(version)
    filepath = 'model_test_datas/{}'.format(testfilename)
    with open(filepath, 'wb') as f:
        f.write(pickle.dumps(test))
    modelobj.test_file_path = filepath
    return modelobj
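# Usage sketch for create_classifier_model. Illustrative only: it assumes a
# configured Django environment and that the 'model_test_datas/' directory
# already exists (the function writes the pickled test set there without
# creating the directory). The helper name below is hypothetical.
def _example_create_and_save(version, csv_path):
    modelobj = create_classifier_model(version, csv_path)
    # the function returns an unsaved instance; the caller persists it
    modelobj.save()
    return modelobj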
def main(*args, **kwargs):
    if not kwargs.get('model_version'):
        print("Version not provided. Provide it as --model_version <version>")
        return
    csv_path = kwargs.get('path', '_playground/sample_data/processed_new_data.csv')
    # TODO: check for model name
    version = kwargs['model_version']
    from helpers.deep import get_processed_data
    # get data
    data = get_processed_data(csv_path)
    classifier_model = create_and_save_classifier_model(version, data)
    print('Classifier {} created successfully with test data'.format(
        classifier_model
    ))
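# Invocation sketch, assuming a management-command style runner that maps a
# --model_version flag onto the kwarg read above (values are illustrative):
#
#     main(model_version=2, path='_playground/sample_data/processed_new_data.csv')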
def _get_model(self, version):
    # first create a classifier from the test fixture; create_classifier_model
    # takes the CSV path and loads the processed data itself
    csv_path = 'fixtures/processed_data_for_testing.csv'
    return create_classifier_model(version, csv_path)
def setUp(self):
    self.test_data = get_processed_data(
        'fixtures/processed_data_for_testing.csv')
    self.train, self.test = create_train_test_data(self.test_data)
    self.classifier = SKNaiveBayesClassifier.new(self.train)
    self.classifier.calculate_confusion_matrix(self.test)
import logging

logger = logging.getLogger('myapp')
hdlr = logging.FileHandler(logfilepath)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
# INFO level, so the logger.info() progress messages used throughout main()
# below are actually written to the log file
logger.setLevel(logging.INFO)
# logfile = open(logfilepath, 'w')

# accumulates (dataset_size, accuracy) pairs across the training loop
num_accuracy = []
def test_create_train_test_data():
    data = get_processed_data(csv_path)
    train, test = create_train_test_data(data)
    # one quarter of the data goes to test, the remainder to train
    assert len(test) == int(len(data) / 4)
    assert len(train) == len(data) - len(test)
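# A minimal sketch of the split create_train_test_data is assumed to perform,
# inferred from the quarter/three-quarter assertions above and the identical
# manual split in main() further below; the real helper may also shuffle.
def _create_train_test_data_sketch(data):
    one_fourth = int(len(data) / 4)
    test = data[:one_fourth]
    train = data[one_fourth:]
    return train, test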
def test_get_processed_data():
    data = get_processed_data(csv_path)
    assert type(data) == list, "The resulting data should be a list"
    assert type(data[0]) == tuple, "Each item should be a tuple"
    assert len(data[0]) == 2, "Tuple size should be 2"
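# A plausible sketch of get_processed_data, inferred from the assertions
# above: read a CSV and return a list of (text, classification) tuples.
# The column names 'text' and 'label' are assumptions; the real helper
# lives in helpers.deep and may differ.
import csv


def _get_processed_data_sketch(csv_path):
    with open(csv_path) as f:
        reader = csv.DictReader(f)
        return [(row['text'], row['label']) for row in reader]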
def main(*args, **kwargs):
    # assumes module-level imports: os, random, subprocess, datetime,
    # matplotlib.pyplot as plt, plus CLASSIFIER, get_processed_data and the
    # logger / num_accuracy defined above
    try:
        logger.info('.. GETTING DEEP DATA\n')
        print('.. GETTING DEEP DATA\n')
        deepdata = get_processed_data(
            '_playground/sample_data/processed_sectors_subsectors.csv')
        logger.info('.. SHUFFLING DEEP DATA\n')
        print('.. SHUFFLING DEEP DATA\n')
        random.shuffle(deepdata)
        total = len(deepdata)
        logger.info('.. INITIALIZING DATASETSIZE TO 500\n')
        print('.. INITIALIZING DATASETSIZE TO 500\n')
        dataset_num = 500
        logger.info('.. SETTING SIZE INCREMENT TO 150\n')
        print('.. SETTING SIZE INCREMENT TO 150\n')
        increment = 150
        # first create dir to store accuracy vs size data
        logger.info('.. CREATING DIRECTORY `DEEP_DATA` FOR STORING DATA\n')
        print('.. CREATING DIRECTORY `DEEP_DATA` FOR STORING DATA\n')
        dirpath = os.path.join(os.path.expanduser('~'), 'data_DEEPL')
        subprocess.call(['mkdir', '-p', dirpath])
        # prepared for dumping the accuracy data, though never written below
        filepath = os.path.join(dirpath, 'accuracy_vs_size.txt')
        logger.info('.. RUNNING LOOP')
        print('.. RUNNING LOOP')
        sectors_accuracies = {}
        while dataset_num <= total:
            random.shuffle(deepdata)
            # train on three quarters of the current subset, test on one quarter
            one_fourth = int(dataset_num / 4.0)
            train = deepdata[:dataset_num][one_fourth:]
            test = deepdata[:dataset_num][:one_fourth]
            logger.info('.. dataset_num:{}\n'.format(dataset_num))
            classifier = CLASSIFIER.new(train)
            classifier.calculate_confusion_matrix(test)
            # calculate per-sector accuracy from the confusion matrix
            indices = classifier.confusion_matrix._indices
            matrix = classifier.confusion_matrix._confusion
            if not sectors_accuracies:
                sectors_accuracies = {k: [] for k in indices}
            for k, v in indices.items():
                # use a local name so the dataset total driving the
                # while-loop condition above is not overwritten
                row_total = sum(matrix[v])
                correct = matrix[v][v]
                sectors_accuracies[k].append(
                    [dataset_num, correct / float(row_total)])
            accuracy = classifier.get_accuracy(test)
            num_accuracy.append((dataset_num, accuracy))
            logger.info('.. accuracy: {}\n'.format(accuracy))
            print('.. accuracy: {}\n'.format(accuracy))
            dataset_num += increment
        # now plot accuracy against training-set size
        data = num_accuracy
        x = [point[0] for point in data]
        y = [point[1] for point in data]
        print("$$$$$$$$$$$$$$$$$$$")
        print(data)
        print("$$$$$$$$$$$$$$$$$$$")
        print(sectors_accuracies)
        print("$$$$$$$$$$$$$$$$$$$")
        fig = plt.figure(figsize=(15, 8))
        plt.xticks([tick for tick in range(500, 28000, 1500)])
        plt.xlabel('# of TRAINING SETS')
        plt.ylabel('ACCURACY')
        plt.grid(True)
        plt.plot(x, y, 'k')
        plt.savefig(str(datetime.datetime.now()) + ".png")
        logger.info('.. DONE!!!')
    except Exception:
        import traceback
        logger.info(traceback.format_exc())
        print(traceback.format_exc())
        logger.info('\n')
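# The loop in main() reads the confusion matrix's private _indices and
# _confusion attributes. Assuming an nltk-style ConfusionMatrix, the same
# per-sector figures (per-class recall) can be derived through its public
# (gold, predicted) tuple indexing; a hedged sketch:
def _per_class_recall_sketch(cm, labels):
    recalls = {}
    for gold in labels:
        row_total = sum(cm[gold, pred] for pred in labels)
        if row_total:
            # diagonal entry over row total: fraction of `gold` items
            # that were classified correctly
            recalls[gold] = cm[gold, gold] / float(row_total)
    return recalls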