def main(self):
    preprocess = PreProcess()
    self.wordsList, self.wordVectors = preprocess.load_word2vec()
    data, list_id, labels, types, max_length = preprocess.load_mutations()
    self.maxSeqLength = max_length + 1

    # create dictionary mapping each type to its respective int value
    count = 0
    for i in types:
        dic = {i: count}
        self.types.update(dic)
        count = count + 1
    self.numClasses = len(self.types)

    train_df, test_df, labels_train, labels_test = self.split_data(data, labels)
    ids_train, ids_test = self.create_matrix(train_df, test_df)

    # Print out details about the data
    print("\n=================================\nData details:")
    print("- Training-set:\t{}".format(len(train_df)))
    print("- Test-set:\t\t{}".format(len(test_df)))
    print("- Classes:\t\t{}".format(self.types))
    print("=================================\n\n")

    self.train_model(ids_train, labels_train, ids_test, labels_test)
def __init__(self, bus_stop_distance, traffic_light_distance, array_stops, array_trfl):
    super().__init__()
    self.bst_dist = bus_stop_distance
    self.trfl_dist = traffic_light_distance
    self.array_stops = array_stops
    self.array_trfl = array_trfl
    self.prepro = PreProcess()
def execute_ALPR(event):
    """
    runs the full license plate recognition process.
    function is called when user clicks on the execute button on the gui
    """
    # time the function execution
    start_time = time.time()

    root_folder = os.path.dirname(os.path.realpath(__file__))
    models_folder = os.path.join(root_folder, 'ml_models')

    pre_process = PreProcess(imagepath)
    plate_like_objects = pre_process.get_plate_like_objects()
    # plotting.plot_cca(pre_process.full_car_image,
    #                   pre_process.plate_objects_cordinates)
    license_plate = license_plate_extract(plate_like_objects, pre_process)
    if len(license_plate) == 0:
        return False

    ocr_instance = OCROnObjects(license_plate)
    if ocr_instance.candidates == {}:
        wx.MessageBox("No character was segmented",
                      "Character Segmentation", wx.OK | wx.ICON_ERROR)
        return False
    # plotting.plot_cca(license_plate, ocr_instance.candidates['coordinates'])

    deep_learn = DeepMachineLearning()
    text_result = deep_learn.learn(ocr_instance.candidates['fullscale'],
                                   os.path.join(models_folder, 'SVC_model', 'SVC_model.pkl'),
                                   (20, 20))

    text_phase = TextClassification()
    scattered_plate_text = text_phase.get_text(text_result)
    plate_text = text_phase.text_reconstruction(scattered_plate_text,
                                                ocr_instance.candidates['columnsVal'])
    print 'ALPR process took ' + str(time.time() - start_time) + ' seconds'

    listResult.InsertStringItem(listRow, plate_text)

    plate_num = Mvrd(plate_text)
    details = plate_num.get_data()
    if details == False or details == {}:
        wx.MessageBox("Vehicle Information could not be retrieved",
                      "Information Retrieval", wx.OK | wx.ICON_ERROR)
        return False
    listResult.SetStringItem(listRow, 1, details['Owner Name'])
    listResult.SetStringItem(listRow, 2, details['Isssue Date'])
    listResult.SetStringItem(listRow, 3, details['Expiry Date'])
    listResult.SetStringItem(listRow, 4, details['Chasis Number'])
    listResult.SetStringItem(listRow, 5, details['Model'])
    # db_aspect.save_alpr(plate_text, str(datetime.today()))
def train(config, device, RS='Supervised'):
    # Init tokenizer.
    tokenizer = Tokenizer(config.temp_dir, config.jieba_dict_file,
                          config.remove_stopwords, config.stopwords_file,
                          config.ivr)
    # Init feature index.
    feature_index = FeatureIndex(config, tokenizer=tokenizer)
    file_list = [config.labeled_file]
    if config.extra_train_file is not None:
        file_list.append(config.extra_train_file)
    if config.valid_file is not None:
        file_list.append(config.valid_file)
    feature_index.build_index(file_list)

    # Preprocess data.
    pre_process = PreProcess(config)
    train_data_dir, valid_data_dir, final_train_file, final_valid_file = pre_process.train_preprocess()

    # Get PyTorch dataset.
    train_dataset = MixnetDataset(config, train_data_dir, feature_index, tokenizer)
    valid_dataset = MixnetDataset(config, valid_data_dir, feature_index, tokenizer, True)

    # Get NER model if necessary and compatible.
    need_ner = False
    for (feature, feature_config) in config.feature_config_dict.items():
        need_ner = need_ner or ("text" in feature_config.get("type", "")
                                and feature_config.get("seg_type", "word") == "char"
                                and feature_config.get("ner", False))
    if need_ner:
        logger.info("Enable NER, loading NER model...")
        # Use predict mode since we cannot train it without tag information.
        ner_model = NERModel(device, "predict")
    else:
        logger.info("Disable NER.")
        ner_model = None

    # Get PyTorch data loader.
    train_data_loader = DataLoader(train_dataset, batch_size=1, shuffle=False,
                                   num_workers=config.read_workers)
    valid_data_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False,
                                   num_workers=config.read_workers)

    # Init model.
    model = MixNet(config.model_config_dict, config.output_config_dict,
                   feature_index.feature_info_dict, feature_index.label_info_dict,
                   ner_model=ner_model)

    # Train model.
    solver = Solver(config, train_data_loader, valid_data_loader,
                    feature_index, model, device, RS)
    solver.build()
    solver.train()
def submit():
    if request.form['text_input'] == "" or len(request.form['text_input']) < 10:
        return "Please provide input large enough, Classifier can understand :)"

    # todo: change column name to be dynamically taken from training file
    test_data = pd.DataFrame([request.form['text_input']], columns=['Document'])
    session_id = request.cookies['session']
    path = os.path.join(app.config['UPLOAD_FOLDER'], session_id)
    trained_classifier = [i for i in os.listdir(path) if '.pkl' in i]
    vectorizer = os.path.join(path, 'tfidf_vectorizer.pk')
    tfidf_transformer = joblib.load(vectorizer)

    pre_processor = PreProcess(test_data, column_name='Document')
    test_data = pre_processor.clean_html()
    test_data = pre_processor.remove_non_ascii()
    test_data = pre_processor.remove_spaces()
    test_data = pre_processor.remove_punctuation()
    test_data = pre_processor.stemming()
    test_data = pre_processor.lemmatization()
    test_data = pre_processor.stop_words()

    test_data1 = tfidf_transformer.transform(test_data.Document)
    result = {}
    for clf in trained_classifier:
        model = joblib.load(os.path.join(path, clf))
        print(clf, model.predict(test_data1)[0])
        classifier_name = clf.split('/')[-1].split('.')[0]
        result[classifier_name] = model.predict(test_data1)[0]
    print(result)
    return render_template('results.html', result=result)
def execute_ALPR():
    """
    runs the full license plate recognition process.
    function is called when user clicks on the execute button on the gui
    """
    # time the function execution
    start_time = time.time()

    root_folder = os.path.dirname(os.path.realpath(__file__))
    models_folder = os.path.join(root_folder, 'ml_models')

    # import requests
    # r = requests.get(url, auth=('admin', 'admin'))
    # file = open(imagepath, "w")
    # file.write(r.content)
    # file.close()

    imagepath = url
    pre_process = PreProcess(imagepath)
    plate_like_objects = pre_process.get_plate_like_objects()
    # plotting.plot_cca(pre_process.full_car_image,
    #                   pre_process.plate_objects_cordinates)
    license_plate = license_plate_extract(plate_like_objects, pre_process)
    if len(license_plate) == 0:
        return False

    ocr_instance = OCROnObjects(license_plate)
    if ocr_instance.candidates == {}:
        # print("No Characters Was Segmented")
        # wx.MessageBox("No character was segmented",
        #               "Character Segmentation", wx.OK | wx.ICON_ERROR)
        return False
    # plotting.plot_cca(license_plate, ocr_instance.candidates['coordinates'])

    deep_learn = DeepMachineLearning()
    text_result = deep_learn.learn(ocr_instance.candidates['fullscale'],
                                   os.path.join(models_folder, 'SVC_model', 'SVC_model.pkl'),
                                   (20, 20))

    text_phase = TextClassification()
    scattered_plate_text = text_phase.get_text(text_result)
    plate_text = text_phase.text_reconstruction(scattered_plate_text,
                                                ocr_instance.candidates['columnsVal'])
    # print('ALPR process took ' + str(time.time() - start_time) + ' seconds')
    print(plate_text)
def __init__(self):
    self.preprocess = PreProcess()
    self.CDBG_map = None
    self.precincts_wards_map = None
    self.directory_path = 'result'
    if not os.path.exists(self.directory_path):
        os.mkdir(self.directory_path)
    self.refuse_routes_directory_path = os.path.join(self.directory_path, 'refuse_routes')
    if not os.path.exists(self.refuse_routes_directory_path):
        os.mkdir(self.refuse_routes_directory_path)
def run(self, hand, rgb=False, box=None, hand_crop=None, depth=None):
    if rgb and box and hand_crop.any() and depth.any():
        phand = self._rgb_hand_seg(hand, box, hand_crop, depth)
    else:
        phand = PreProcess().median_smooth(hand, self.MEDIAN_DIM)
    tmp = phand.copy()
    cont = self._get_largest_contour(phand)
    box = cv2.boundingRect(cont)
    crop = self._crop_box(tmp, box)
    self._contour = cont
    self._box = box
    return (cont, box, crop)
def main():
    data = get_data('/Users/aditya1/Documents/Document_Classification/bbc-dataset')

    ###############################################################################
    # Data Pre-processing steps
    ###############################################################################
    column_name = data.columns[0]
    # print(column_name)
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    pre_processor_operations = ['clean_html']
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()

    ###############################################################################
    # Feature extraction
    ###############################################################################
    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    # print(train_x.shape, train_y.shape)
    # print(test_x.shape, test_y.shape)
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    joblib.dump(tfidf_transformer, 'vectorizer.pkl')
    test_vectors = tfidf_transformer.transform(test_x)
    print(data.head())

    ###############################################################################
    # Perform classification with SVM, kernel=linear
    model1 = svm.SVC(kernel='linear')
    model1.fit(train_vectors, train_y)
    joblib.dump(model1, 'SVM.pkl')
    y_pred_class = model1.predict(test_vectors)
    print(metrics.accuracy_score(test_y, y_pred_class))
    print("Prediction score for classifier %s:\n%s\n" % (model1, metrics.accuracy_score(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n" % (model1, metrics.classification_report(test_y, y_pred_class)))

    model2 = MultinomialNB()
    model2.fit(train_vectors, train_y)
    joblib.dump(model2, 'MultinomialNB.pkl')
    y_pred_class = model2.predict(test_vectors)
    print("Accuracy score:", metrics.accuracy_score(test_y, y_pred_class))
    print("Confusion Matrix for classifier %s:\n%s\n" % (model2, metrics.confusion_matrix(test_y, y_pred_class)))
    print("Classification report for classifier %s:\n%s\n" % (model2, metrics.classification_report(test_y, y_pred_class)))
def validate_test():
    from preprocess import PreProcess
    p = PreProcess()
    global model, train_indices, val_indices, train_loader, test_loader, optimizer
    model = MonocularVelocityNN(initial_depth=config["depth"])
    dataset = VideoDataLoader(directory=str(Path.cwd() / "data/testprocdir"),
                              delta=config["delta"],
                              Y=p.labels,
                              depth=config["depth"])
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    val_indices = list(range(1, dataset_size - config["delta"] - config["depth"]))
    # valid_sampler = Sampler(val_indices)
    test_loader = DataLoader(dataset,
                             batch_size=config["test_batch_size"],
                             sampler=val_indices)
    load_weights(Path.cwd() / "data/weights.pt")
    if not device:
        raise RuntimeError("Only use model with Cuda")
    model.to(device)
    validate(is_labels=False)
def main(self):
    preprocess = PreProcess()
    self.wordsList, self.wordVectors = preprocess.load_word2vec()
    data, list_id, labels, types, max_length = preprocess.load_mutations()
    self.maxSeqLength = max_length + 1

    # create dictionary mapping each type to its respective int value
    count = 0
    for i in types:
        dic = {i: count}
        self.types.update(dic)
        count = count + 1
    self.numClasses = len(self.types)

    train_df, test_df, labels_train, labels_test = self.split_data(data, labels)

    # remove the last element to make it an even number
    train_df = train_df[:-1]
    labels_train = labels_train[:-1]
    div = int(len(train_df) / self.k)
    # get K = 5 batches
    train_1, train_2, train_3, train_4, train_5 = [
        train_df[i:i + div] for i in range(0, len(train_df), div)
    ]
    labels_1, labels_2, labels_3, labels_4, labels_5 = [
        labels_train[i:i + div] for i in range(0, len(labels_train), div)
    ]

    ids_test = self.create_matrix_teste(test_df)
    ids_train1 = self.create_matrix_train(train_1)
    ids_train2 = self.create_matrix_train(train_2)
    ids_train3 = self.create_matrix_train(train_3)
    ids_train4 = self.create_matrix_train(train_4)
    ids_train5 = self.create_matrix_train(train_5)

    train_bins = [train_1, train_2, train_3, train_4, train_5]
    label_bins = [labels_1, labels_2, labels_3, labels_4, labels_5]
    ids_train = [ids_train1, ids_train2, ids_train3, ids_train4, ids_train5]

    self.train_model(train_bins, label_bins, ids_train)
def __init__(self, pic_name, flags=0):
    PreProcess.__init__(self, pic_name, flags)
    cv2.namedWindow(winname=PreProcessBoard.winname, flags=1)
    h, w = self.img.shape
    print(self.img.shape)
    # self.resize_board(h/2, w/2)
    cv2.createTrackbar('kernel size', PreProcessBoard.winname, 0, 5, nothing)
    cv2.createTrackbar('bin thresh', PreProcessBoard.winname, 127, 255, nothing)
    cv2.createTrackbar('lambda', PreProcessBoard.winname, 0, 100, nothing)
    while 1:
        s = cv2.getTrackbarPos('kernel size', PreProcessBoard.winname)
        thresh = cv2.getTrackbarPos('bin thresh', PreProcessBoard.winname)
        kernel = np.ones((s, s), np.uint8)
        lamda = cv2.getTrackbarPos('lambda', PreProcessBoard.winname)
        # img_mor = cv2.morphologyEx(self.img, op=cv2.MORPH_CLOSE, kernel=kernel)
        # ret, img_bin = cv2.threshold(self.img, thresh, 255, cv2.THRESH_BINARY)
        self.restore()
        self.img = 255 - self.img
        self.lambda_binary(30 / 100.0)
        self.binarize(thresh, 255, type=cv2.THRESH_BINARY)
        self.img = 255 - self.img
        img1 = self.img.copy()

        self.restore()
        self.img = 255 - self.img
        self.lambda_binary(lamda / 100.0)
        self.binarize(thresh, 255, type=cv2.THRESH_BINARY)
        # self.gauss_blur(2)
        # self.morph(True)
        self.img = 255 - self.img

        self.img = (self.img + img1) / 2
        self.binarize(thresh, 255, type=cv2.THRESH_BINARY)
        # self.binarize(thresh, 255, cv2.THRESH_BINARY_INV)
        # cv2.imshow(PreProcessBoard.winname, img_mor)
        key = cv2.waitKey(1)
        if key == 27:
            break
        else:
            cv2.imshow(PreProcessBoard.winname, self.img)
    cv2.imwrite('./data/r1.jpg', self.img)
def train():
    from preprocess import PreProcess
    p = PreProcess()
    global model, train_indices, val_indices, train_loader, test_loader, optimizer
    model = MonocularVelocityNN(initial_depth=config["depth"])
    dataset = VideoDataLoader(directory=str(Path.cwd() / "data/processed"),
                              delta=config["delta"],
                              Y=p.labels,
                              depth=config["depth"])
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(1, dataset_size - config["delta"] - config["depth"]))
    split = int(np.floor(config["split"] * dataset_size))
    if config["randomize"]:
        np.random.seed(0)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = DataLoader(dataset,
                              batch_size=config["batch_size"],
                              sampler=train_sampler)
    test_loader = DataLoader(dataset,
                             batch_size=config["test_batch_size"],
                             sampler=valid_sampler)
    if config["TRAIN"]:
        if not device:
            raise RuntimeError("Only use model with Cuda")
        model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
        for epoch in range(1, config["epochs"] + 1):
            try:
                train(epoch)
                validate()
            except:
                raise
    else:
        load_weights(Path.cwd() / "data/weights.pt")
        if not device:
            raise RuntimeError("Only use model with Cuda")
        model.to(device)
        validate()
def execute_ALPR(event):
    """
    runs the full license plate recognition process.
    function is called when user clicks on the execute button on the gui
    """
    # time the function execution
    start_time = time.time()

    root_folder = os.path.dirname(os.path.realpath(__file__))
    models_folder = os.path.join(root_folder, 'ml_models')

    pre_process = PreProcess(imagepath)
    plate_like_objects = pre_process.get_plate_like_objects()
    # plotting.plot_cca(pre_process.full_car_image,
    #                   pre_process.plate_objects_cordinates)
    license_plate = license_plate_extract(plate_like_objects, pre_process)
    if len(license_plate) == 0:
        return False

    ocr_instance = OCROnObjects(license_plate)
    # plotting.plot_cca(license_plate, ocr_instance.candidates['coordinates'])
    deep_learn = DeepMachineLearning()
    text_result = deep_learn.learn(ocr_instance.candidates['fullscale'],
                                   os.path.join(models_folder, 'SVC_model', 'SVC_model.pkl'),
                                   (20, 20))

    text_phase = TextClassification()
    scattered_plate_text = text_phase.get_text(text_result)
    plate_text = text_phase.text_reconstruction(scattered_plate_text,
                                                ocr_instance.candidates['columnsVal'])
    print 'ALPR process took ' + str(time.time() - start_time) + ' seconds'

    listResult.InsertStringItem(listRow, plate_text)
    listResult.SetStringItem(listRow, 1, str(datetime.today()))
    db_aspect.save_alpr(plate_text, str(datetime.today()))
class TestPreProcess():

    @classmethod
    def setup_class(self):
        image_path = path.join(path.dirname(path.realpath(__file__)))
        image_path = path.join(path.split(image_path)[0], 'test_images', 'car6.jpg')
        self.pre_process = PreProcess(image_path)

    def test_threshold(self):
        print 'Testing the threshold function'
        bin_image = self.pre_process.threshold(self.pre_process.full_car_image)
        assert bin_image.shape == (548, 700)
def interence_main():
    CHECKPOINT_PATH = './model/seq2seq_ckpt-7800'
    preprocess = PreProcess()
    # with tf.variable_scope('nmt_model', reuse=None):
    model = NMTModel()
    test_sentence = 'this is a test.'
    test_sentence = preprocess.english2id(test_sentence)
    print(preprocess.id2english(test_sentence))
    output_op = model.inference(test_sentence)
    sess = tf.Session()
    saver = tf.train.Saver()
    saver.restore(sess, CHECKPOINT_PATH)

    with tf.variable_scope('nmt_model', reuse=None):
        model = NMTModel()
    test_sentence = 'this is a test.'
    test_sentence = preprocess.english2id(test_sentence)
    output_op = model.inference(test_sentence)
    sess = tf.Session()
    saver = tf.train.Saver()
    saver.restore(sess, CHECKPOINT_PATH)

    output = sess.run(output_op)
    print(output)
    print(preprocess.id2chinese(output))
    sess.close()
class TestPreProcess():

    @classmethod
    def setup_class(self):
        image_path = path.join(path.dirname(path.realpath(__file__)))
        image_path = path.join(path.split(image_path)[0], 'test_images', 'car6.jpg')
        self.image_array = imread(image_path, as_grey=True)
        self.pre_process = PreProcess(image_path)

    def test_resize_if_necessary(self):
        print 'Testing the resize function'
        resized_image = self.pre_process.resize_if_necessary(self.image_array)
        assert resized_image.shape == (470, 600)
def read_process_data(path, files_path):
    data = pd.read_csv(path)
    column_name = data.columns[0]
    # print(column_name)
    pre_processor = PreProcess(data, column_name)
    # todo: change code to provide all functions in class definition.
    data = pre_processor.clean_html()
    data = pre_processor.remove_non_ascii()
    data = pre_processor.remove_spaces()
    data = pre_processor.remove_punctuation()
    data = pre_processor.stemming()
    data = pre_processor.lemmatization()
    data = pre_processor.stop_words()

    train_x, test_x, train_y, test_y = train_test_split(data.Document,
                                                        data.Category,
                                                        test_size=0.20)
    tfidf_transformer = TfidfVectorizer(min_df=1)
    train_vectors = tfidf_transformer.fit_transform(train_x)
    vectorizer_path = os.path.join(files_path, 'tfidf_vectorizer.pk')
    joblib.dump(tfidf_transformer, vectorizer_path)
    return train_vectors, train_y
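A minimal usage sketch for read_process_data above, not from the original project: the CSV path, output directory, and the linear-SVM step are assumptions that mirror the main() example earlier in this collection.

# Hypothetical usage of read_process_data; paths and the SVM step are assumptions.
import os
import joblib
from sklearn import svm

files_path = 'models'  # assumed output directory for the vectorizer and model
os.makedirs(files_path, exist_ok=True)
train_vectors, train_y = read_process_data('bbc-dataset.csv', files_path)  # assumed CSV path

# Train and persist a linear SVM, mirroring the main() example above.
clf = svm.SVC(kernel='linear')
clf.fit(train_vectors, train_y)
joblib.dump(clf, os.path.join(files_path, 'SVM.pkl'))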
def train(start=1, end=31):
    # preprocess training data
    preprocess = PreProcess()
    preprocess.set_date_range(start, end)
    preprocess.process()

    la = LineAnalysis()
    la.set_date_range(start, end)
    la.process()

    # train model
    train = Train()
    train.fit()
    return
def main():
    dowload_dataset()
    preprocess = PreProcess()
    df = preprocess.load_tsv()
    features = preprocess.clean_data(df)
    features = preprocess.balance_data(features, 15000)
    x_train, y_train, x_test, y_test = preprocess.split_data(features)
    y_train_round, y_test_round = preprocess.round_labels(y_train, y_test)
    y_train_one, y_test_one = preprocess.labels_to_one_hot(y_train_round, y_test_round)

    lr = Logistic_Regression()
    lr.fit_and_evaluate(x_train, y_train_round, x_test, y_test_round)

    nn1 = NeuralNetwork1()
    nn1.fit_and_evaluate(x_train, y_train_one, x_test, y_test_one)

    linr = Linear_Regression()
    linr.fit_and_evaluate(x_train, y_train, x_test, y_test)

    nn2 = NeuralNetwork2()
    nn2.fit_and_evaluate(x_train, y_train, x_test, y_test)

    print("\n___________________End of the output___________________")
def __init__(self):
    super().__init__()
    self.folderPath = "textData"

    # without transforming string
    # List structure: [[data1][data1_label][data2][data2_label][data3][data3_label]]
    #self.lemmatizedList = self.preProcess.load_data1(self.folderPath)

    # Extract data
    #self.dataList = self.getDataList()
    #print(self.dataList.__len__())
    #self.allWords = self.extractSentence()
    #print(self.allWords.__len__())

    # Extract label
    #self.Labels = self.extractLabels()
    #print(self.Labels.__len__())

    # transform lemmatized_list (3 generators) into a long string separated by spaces
    #self.sparseWords = self.preProcess.cleanByFrequency(self.allWords)

    # filter by sequence
    #self.cleanedSentences = self.preProcess.getCleanedSent(self.sparseWords, self.dataList)

    '''Pre Process'''
    self.preProcess = PreProcess(self.folderPath)
    '''i = 0
    for x in self.preProcess.cleanedSentences:
        i += 1
        print(str(i) + ": ")
        print(x)'''

    # vectorise remnant of sentences
    #self.X_train = self.preProcess.vector_Data(self.cleanedSentences, self.Labels)

    '''Process Sentence Level'''
    self.sentenceLevel = SentenceLevel(self.preProcess.cleanedSentences)

    '''Process Document Level'''
    # Start TensorFlow Session
    self.documentLevel = DocumentLevel(self.sentenceLevel.docInput, self.preProcess.Labels)

    # display results
    #self.displayData()

    '''Summary All Process'''
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
# spark, elephas
from keras.optimizers import SGD, Adam
# classes
from preprocess import PreProcess
from file_io import FileIO
import datetime

## MODEL ##
p = PreProcess('./datasets/ag_dataset.txt')
x_train, x_test, y_train, y_test, num_classes = p.run()

# Convert class vectors to binary class matrices
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)

# Reshape
dimension = x_train.shape[1]
x_train = x_train.reshape(x_train.shape[0], 1, dimension, 1)
x_test = x_test.reshape(x_test.shape[0], 1, dimension, 1)

print('# Training Data', x_train.shape, y_train.shape)
print('# Testing Data', x_test.shape, y_test.shape)

# model config
epoch_step = 10
pool_size = (1, 2)
from corpus_reader import CorpusReader
from preprocess import PreProcess
from tf_idf import TfIdf
from knn import KNN
from metrics import MetricsGenerator
from pprint import pprint as pp

if __name__ == '__main__':
    print('reading...')
    reader = CorpusReader()
    reader.run()

    parser = PreProcess()
    parsed_trainning_documents = {}
    print('processing...')
    for k, v in reader.train.items():
        parsed_trainning_documents[k] = parser.process(v)

    # Input for tf-idf: we must annotate the documents with their classes.
    # It receives as input an array of tuples: ([tokens], class)
    parsed_trainning_documents_with_classes = []
    for k in parsed_trainning_documents.keys():
        parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]

    # Run tf-idf
    print('generating tf.idf...')
    tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
    tf_idf_calculator.run()

    # test the knn parameters: distance metric and value of K
    for metric in ['cosine', 'euclid']:
from preprocess import PreProcess
import numpy as np
import logging
import csv
from sklearn.svm import SVC

if __name__ == "__main__":
    logging.basicConfig(
        format="--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s",
        datefmt="%Y/%m/%d %H:%M:%S",
        level=logging.INFO
    )
    logging.info("train_test.py Start")

    SAMPLE = 50000
    p = PreProcess()
    train_filepath = "data/train_1M.csv.out"
    # train_filepath = 'data/train_1000.csv.out'
    # test_filepath = 'data/test.csv.out'
    test_filepattern = "data/test_%d_M.out"

    # Load train data
    logging.info("Loading train set...")
    X_train, y_train = p.load_train_data(train_filepath)

    # Sampling
    if y_train.shape[0] > SAMPLE:
        X_train = X_train[:SAMPLE]
        y_train = y_train[:SAMPLE]
    else:
        SAMPLE = y_train.shape[0]
    logging.info("Sampling %d" % SAMPLE)
    logging.info("Shape X_train = %r, y_train =%r" % (X_train.shape, y_train.shape))
from preprocess import PreProcess
import numpy as np
import logging
import csv
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

if __name__ == "__main__":
    logging.basicConfig(format='--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s',
                        datefmt='%Y/%m/%d %H:%M:%S',
                        level=logging.INFO)
    logging.info("train_knn_cv.py Start")

    p = PreProcess()
    #out_filepath = 'data/train_1M.csv.out'
    #out_filepath = 'data/train_s404_10K.out'
    out_filepath = 'data/train_1000.csv.out'

    #Load data
    #X, y = p.load_train_data(out_filepath)
    #Load data with category
    X, y, enc, map_dict = p.load_train_data(out_filepath, category=True)
    logging.info("Shape X = %r, y =%r" % (X.shape, y.shape))
    logging.info("example X = %s\ny =%r" % (X[0], y[0]))
    logging.info("classes: %r" % list(np.unique(y)))

    #Sampling
    #At least 3
    POWER = 6
    CONST = 1
    #At least 2
    n_subsamples = CONST * 10 ** POWER
    n_size = y.shape[0]
from preprocess import PreProcess
import numpy as np
import logging
import csv
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV

if __name__ == "__main__":
    logging.basicConfig(format='--%(asctime)s:[%(levelname)s]:%(lineno)d:%(message)s',
                        datefmt='%Y/%m/%d %H:%M:%S',
                        level=logging.INFO)
    logging.info("train_sgdc_cv.py Start")

    p = PreProcess()
    out_filepath = 'data/train_1M.csv.out'
    #out_filepath = 'data/train_s404_100K.out.1vs1'
    #out_filepath = 'data/train_1000.csv.out'

    #Load data
    X, y = p.load_train_data(out_filepath)
    logging.info("Shape X = %r, y =%r" % (X.shape, y.shape))
    logging.info("example X = %s\ny =%r" % (X[0], y[0]))
    logging.info("classes: %r" % list(np.unique(y)))

    #Sampling
    #At least 3
    POWER = 6
    CONST = 1
    #At least 2
    #CV = 5
    n_subsamples = CONST * 10 ** POWER
    n_size = y.shape[0]
    if n_subsamples < n_size:
def run_pipeline(self, feat_extract_name, X_train, y_train, X_test=None, y_test=None):
    if self.DEBUG >= 1:
        print "training..."
    antes1 = datetime.now()

    pproc = PreProcess(
        feat_extract_name=feat_extract_name,
        n_processes=self.n_processes_pproc,
        size_percentage=self.size_percentage,
        roi=self.roi,
        high_pass=self.high_pass,
        low_pass=self.low_pass,
        gauss_noise=self.gauss_noise,
        feature_extractor__method=self.lbp__method,
        feature_extractor__n_tiles=self.lbp__n_tiles,
        feature_extractor__n_filters=self.n_filters,
        feature_extractor__shape_norm=self.shape_norm,
        feature_extractor__shape_conv=self.shape_conv,
        feature_extractor__shape_pool=self.shape_pool,
        feature_extractor__stride_pool=self.stride_pool,
        feature_extractor__stoc_pool=self.stoc_pool,
        feature_extractor__div_norm=self.div_norm,
        feature_extractor__region_shape=self.region_shape,
        feature_extractor__region_stride=self.region_stride,
        feature_extractor__top_regions=self.top_regions,
        feature_extractor__stride_pool_recurrent=self.stride_pool_recurrent,
        feature_extractor__analysis_shape=self.analysis_shape,
        multi_column=self.multi_column,
        augmentation=self.augmentation,
        aug_rotate=self.aug_rotate,
    )

    norm = preprocessing.StandardScaler(copy=True)

    piplist = []
    if self.cross_validation:
        piplist.append(("pproc", pproc))
    piplist.append(("norm", norm))

    if self.use_pca:
        pca = RandomizedPCA2(
            whiten=True, random_state=0, n_components=self.pca__n_components, copy=True
        )  # Must use fit_transform instead of fit() and then transform() when copy=false
        # from sklearn.decomposition import PCA
        # pca = PCA(whiten=True, n_components=self.pca__n_components, copy=True)
        # pca = FastICA(whiten=True, random_state=0, n_components=self.pca__n_components, max_iter=400)
        # pca = SparsePCA(random_state=0, n_components=self.pca__n_components)  # Must use fit_transform instead of fit() and then transform() when copy=false
        piplist.append(("pca", pca))

    if self.use_lda:
        lda = LDA(n_components=self.lda__n_components)
        piplist.append(("lda", lda))

    if self.predict.lower() == "svm":
        if self.svm__kernel.lower() == "rbf":
            pred = SVC2(
                kernel="rbf",
                class_weight="auto",
                random_state=0,
                C=self.svm__C,
                gamma=self.svm__gamma,
                multi_column=self.multi_column,
                augmentation=self.augmentation,
                aug_rotate=self.aug_rotate,
            )
        else:
            pred = LinearSVC2(
                random_state=0,
                fit_intercept=False,
                class_weight="auto",
                C=self.svm__C,
                augmentation=self.augmentation,
            )
    elif self.predict.lower() == "sgd":
        pred = SGD2(
            loss="hinge",
            penalty="l2",
            l1_ratio=0.05,
            random_state=0,
            n_iter=5,
            shuffle=True,
            augmentation=self.augmentation,
            alpha=self.sgd__alpha,
        )
    elif self.predict.lower() == "knn":
        pred = KNeighborsClassifier(n_neighbors=self.knn__n_neighbors, weights=self.knn__weights)

    piplist.append(("pred", pred))
    pipeline = Pipeline(piplist)

    if self.cross_validation:
        params_grid = self.params_auto.copy()
        params_grid.update(self.dicPredict[self.predict])
        params_grid.update(self.params_pproc)
        if feat_extract_name.lower() != "none":
            params_grid.update(self.dicfeat_extract[feat_extract_name])

        pipelineGridSearch = GridSearchCV2(
            pipeline,
            params_grid,
            cv=self.n_folds,
            verbose=0,
            n_jobs=self.n_processes_cv,
            n_jobs_last_estimator=self.n_processes_cv_last_estimator,
            augmentation=self.augmentation,
            auto_adjust_params=None,
            testing=self,
        )
        pipelineGridSearch.fit(X_train, y_train)

        # gridsearchRef = GridSearchCV(pipeline, params_grid, cv=self.n_folds, iid=True, scoring='roc_auc', verbose=0, n_jobs=1)
        # gridsearchRef.fit(X_train, y_train)
        # print 'ReF=== score=', gridsearchRef.best_score_, 'params=', gridsearchRef.best_params_

        return pipelineGridSearch.best_score_, pipelineGridSearch.best_params_
    else:
        antes = datetime.now()
        X_train = pproc.transform(X_train)
        antes2 = datetime.now()
        X_test = pproc.transform(X_test)
        time_pproc = datetime.now() - antes2

        if self.multi_column:
            y_pred_train = []
            y_pred_test = []
            if self.aug_rotate:
                multiply = 30
            else:
                multiply = 10
            for i in range(multiply):
                pipeline = pipeline.fit(X_train[i::multiply, :], y_train)
                y_pred_train.append(pipeline.predict(X_train[i::multiply, :]))
                y_pred_test.append(pipeline.predict(X_test[i::multiply, :]))
            y_pred_train = np.mean(np.asarray(y_pred_train), axis=0)
            y_pred_train[y_pred_train >= 0] = 1
            y_pred_train[y_pred_train < 0] = 0
            y_pred_test = np.mean(np.asarray(y_pred_test), axis=0)
            y_pred_test[y_pred_test >= 0] = 1
            y_pred_test[y_pred_test < 0] = 0
        else:
            pipeline = pipeline.fit(X_train, y_train)
            # save the classifier
            with open(
                self.temp_dir + "clf_" + self.datasettrain.lower() + "_"
                + self.sensortrain.lower() + "_" + self.feat_extract_name.lower() + ".pkl",
                "wb",
            ) as output:
                pickle.dump(pipeline, output, pickle.HIGHEST_PROTOCOL)
            y_pred_train = pipeline.predict(X_train)
            antes2 = datetime.now()
            y_pred_test = []
            for i in range(0, len(X_test), self.mini_batch_size_test):
                y_pred_test.extend(list(pipeline.predict(X_test[i:i + self.mini_batch_size_test])))
            test_time = (datetime.now() - antes2) + time_pproc
            print "Predict time = ", test_time  # DEBUG
            print "Number of samples", str(len(X_test))

        score_training = 100.0 - (100.0 * roc_auc_score(y_train, y_pred_train))
        print "score_training=", score_training
        score = 100.0 - (100.0 * roc_auc_score(y_test, np.asarray(y_pred_test)))
        total_time = datetime.now() - antes  # DEBUG

        pca = pipeline.steps[-2][1]
        pca_total_variance = None
        if hasattr(pca, "explained_variance_ratio_"):
            pca_total_variance = np.sum(pca.explained_variance_ratio_)
        pred = pipeline.steps[-1][1]
        n_support_ = None
        if hasattr(pred, "n_support_"):
            n_support_ = pred.n_support_

        self.append_results(
            params=None,
            score_mean=score,
            score_std=None,
            total_time=total_time,
            test_time=test_time,
            n_test_samples=str(len(X_test)),
            score_training=score_training,
            n_svm_vectors=n_support_,
            pca_total_variance=pca_total_variance,
        )
        return score, None

    if self.DEBUG >= 1:
        print "Fit Pipeline time = ", (datetime.now() - antes1)  # DEBUG
def post(self):
    popu.drop()  # drop the existing collection before re-inserting
    abc = PreProcess()
    return {'inserted': 'everything'}, 301
class Labels(PreprocessData):
    def __init__(self, bus_stop_distance, traffic_light_distance, array_stops, array_trfl):
        super().__init__()
        self.bst_dist = bus_stop_distance
        self.trfl_dist = traffic_light_distance
        self.array_stops = array_stops
        self.array_trfl = array_trfl
        self.prepro = PreProcess()

    def add_bus_stop_label(self, data):
        '''This method is used with multiprocessing; item[4] is the velocity.'''
        chunck = []
        for items in tqdm(data):
            final_item = []
            for item in items:
                for stop in self.array_stops:
                    # dist = self.prepro.distance_in_meters([item[0], item[1]], [stop[4], stop[5]])
                    dist = self.prepro.distance_in_meters([item[0], item[1]],
                                                          [stop[1], stop[2]])
                    if item[4] < 5 and dist < self.bst_dist:
                        print('bustop')
                        item.append('bus_stop')
                        break
                final_item.append(item)
            chunck.append(final_item)
        return chunck

    def add_traffic_light_label(self, data):
        chunck = []
        for items in tqdm(data):
            final_item = []
            for item in items:
                for stop in self.array_trfl:
                    # dist = self.prepro.distance_in_meters([item[0], item[1]], [stop[7], stop[8]])
                    dist = self.prepro.distance_in_meters([item[0], item[1]],
                                                          [stop[1], stop[2]])
                    if item[4] < 5 and dist < self.trfl_dist and item[10] != 'bus_stop':
                        item[10] = 'traffic_light'
                        break
                final_item.append(item)
            chunck.append(final_item)
        return chunck

    def add_other_stop_label(self, data):
        for items in tqdm(data):
            for item in items:
                if item[4] < 5 and item[10] == 'in_route':
                    item[10] = 'other_stop'

    def get_false_labels(self, data, label, min_dist):
        '''Remove other_stop labels that fall between a bus_stop and a traffic_light.'''
        count_b, count_a = [], []
        for items in tqdm(data):
            for idx in range(len(items) - 1):
                if idx > 0 and idx < (len(items) - 1):
                    lat_lng_b = [items[idx - 1][0], items[idx - 1][1]]
                    lat_lng_a = [items[idx + 1][0], items[idx + 1][1]]
                    lat_lng_c = [items[idx][0], items[idx][1]]
                    if (items[idx][16] == label
                            and ((items[idx - 1][16] == 0.0 or items[idx - 1][16] == 3.0)
                                 and (items[idx + 1][16] == 0.0 or items[idx + 1][16] == 3.0))
                            and (self.prepro.distance_in_meters(lat_lng_c, lat_lng_b) < min_dist
                                 or self.prepro.distance_in_meters(lat_lng_c, lat_lng_a) < min_dist)):
                        print(f'before:{items[idx-1][16]}----current:{items[idx][16]}----after:{items[idx+1][16]}')
                        print(f'before:{self.prepro.distance_in_meters(lat_lng_c, lat_lng_b)}----after:{self.prepro.distance_in_meters(lat_lng_c, lat_lng_a)}')
                        count_b.append(self.prepro.distance_in_meters(lat_lng_c, lat_lng_b))
                        count_a.append(self.prepro.distance_in_meters(lat_lng_c, lat_lng_a))
                        items[idx][16] = -1
def setup_class(self):
    image_path = path.join(path.dirname(path.realpath(__file__)))
    image_path = path.join(path.split(image_path)[0], 'test_images', 'car6.jpg')
    self.image_array = imread(image_path, as_grey=True)
    self.pre_process = PreProcess(image_path)
# -*- coding: utf-8 -*-
"""
@author: Samip
"""

from preprocess import PreProcess
from classifier import Classifier
from modelEvaluation import ModelEvaluation

# Create the objects
pp = PreProcess()
classifier = Classifier()
me = ModelEvaluation()

# Preprocess the data
x_train, x_test, y_train, y_test = pp.scale_data()

choice = int(input("Enter 1 for Logistic Regression, 2 for Decision Tree Classifier, 3 for KNN, 4 for Naive Bayes, 5 for Random Forest, 6 for SVM, 7 for XG Boost, 8 for Adaptive Boosting, 9 for LDA: "))
clf = {
    1: classifier.logistic_regression,
    2: classifier.decision_tree_classifer,
    3: classifier.knn,
    4: classifier.naive_bayes,
    5: classifier.random_forest,
    6: classifier.svm,
    7: classifier.xg_boost,
    8: classifier.ada_boost,
import os
import time

from preprocess import PreProcess

start_time = time.time()
thisTime = start_time
files = []
dataFolder = os.path.dirname(os.path.abspath(__file__)) + "/data"
resultFolder = os.path.dirname(os.path.abspath(__file__)) + "/result"
count = 0
commonWordList = {}

for i in os.listdir(dataFolder):
    if i.endswith('.txt'):
        thisFile = os.path.join(dataFolder, i)
        reflection = open(thisFile, "r", encoding="utf8")
        # read the reflection text once and reuse it (a second read() would return an empty string)
        reflectionText = reflection.read()
        processData = PreProcess(reflectionText)
        wordList = processData.getWordList(reflectionText, True)
        wordFrequency = processData.wordFrequency(wordList)
        for wordTuple in wordFrequency:
            commonWordList[wordTuple[0]] = commonWordList[wordTuple[0]] + wordTuple[1] if wordTuple[0] in commonWordList else wordTuple[1]
        print("--- %s seconds ---" % (time.time() - thisTime))
        thisTime = time.time()
        reflection.close()

result = open(resultFolder + "/wordfrequency.csv", "a+")
result.write("Word,WordCount\n")
pca = RandomizedPCA(n_components=2)
X = pca.fit_transform(X)
fig = pyplot.figure()
pyplot.plot(X[y==False, 0], X[y==False, 1], 'ro')
pyplot.plot(X[y==True, 0], X[y==True, 1], 'bo')
pyplot.title('2D Visualization, Crossmatch LivDet 2013 Testing, ConvNet 5 Layers+PCA')
pyplot.show()
"""

testing = Testing()
testing.divide_by = 5
testing.n_processes_pproc = 3
lstFilesX, y = testing.load_dataset('Training', 'LivDet2011', 'digital')

# PCA only
pproc = PreProcess('', 1, False, False, False, False, 1.0, None,
                   None, None, None, None, None, None, None, None, None)
X = pproc.transform(lstFilesX)
pca = RandomizedPCA(n_components=2)
X = pca.fit_transform(X)
fig = pyplot.figure()
pyplot.plot(X[y==False, 0], X[y==False, 1], 'ro')
pyplot.plot(X[y==True, 0], X[y==True, 1], 'bo')
pyplot.title('2D Visualization, Digital LivDet 2011 Training, PCA only')
pyplot.show()

# LBP+PCA
pproc = PreProcess('LBP', 1, False, False, False, False, 1.0, None,
                   None, None, None, None, None, None, 'uniform', [7, 7], False)
X = pproc.transform(lstFilesX)
pca = RandomizedPCA(n_components=2)
X = pca.fit_transform(X)
""" Created on Wed May 27 07:11:18 2020 @author: rusha """ import models import shap import pathlib from preprocess import PreProcess import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score preproc = PreProcess() preproc.splitStandardize() X_train, _, y_train, y_test = models.Model('tree').getData() estimator = models.Model('tree').getXGBModel() estimator.fit(X_train, y_train) importances = estimator.feature_importances_ cwd = pathlib.Path().absolute() indices = np.argsort(importances)[::-1] cols = X_train.columns.tolist() std = np.std([tree.feature_importances_ for tree in estimator.estimators_], axis=0) tempcols = [] for f in range(X_train.shape[1]): print("%d. feature %s (%f)" % (f + 1, cols[indices[f]], importances[indices[f]]))
class NB():
    classification = {}
    number_of_docs = None
    vocab_unique = None
    classes = None
    inputData = None
    words = {}
    process = PreProcess()
    testing = {}
    train_file = None
    test_file = None

    def __init__(self, train=None, test=None, train_dir=None, test_dir=None):
        """
        :aim: Classify for either a file or a given directory
        :param train: Train file if a file is given
        :param test: Test file if a file is given
        :param train_dir: Train directory if a directory is given
        :param test_dir: Test directory if a directory is given
        """
        if train is not None or test is not None:
            self.train_file = train
            self.test_file = test
            self.number_of_docs = self.process.getNumberOfDocs(train)
            self.inputData = self.process.processFile(train)
            self.vocab_unique = self.process.createVocab(self.inputData)
            self.classes = self.process.getClasses(self.inputData)
        else:
            for i in os.listdir(train_dir + "neg"):
                self.number_of_docs += self.process.getNumberOfDocs(train)
                self.inputData += self.process.processFile(train)
                self.vocab_unique += self.process.createVocab(self.inputData)
                self.classes += self.process.getClasses(self.inputData)
            for i in os.listdir(train_dir + "pos"):
                self.number_of_docs += self.process.getNumberOfDocs(train)
                self.inputData += self.process.processFile(train)
                self.vocab_unique += self.process.createVocab(self.inputData)
                self.classes += self.process.getClasses(self.inputData)
            """
            for i in os.listdir(test_dir + "neg"):
            for i in os.listdir(test_dir + "pos"):
            """

    def classify(self):
        """
        :aim: Classify training
        :return: None
        """
        for c in self.classes:
            temp_dict = {}
            for word in self.vocab_unique:
                temp_dict[word] = 0
            self.classification[c] = temp_dict

        # keys = classes, values = count of words in document given class
        for key in self.classification:
            for line in self.inputData:
                if line[len(line) - 1] != key:
                    continue
                else:
                    for word in line:
                        if word not in self.classification[key]:
                            continue
                        (self.classification[key])[word] += 1

    def count_words(self):
        """
        :aim: Count words per class
        :return: None
        """
        for cl in self.classes:
            count = 0
            for k in self.classification[cl]:
                count += self.classification[cl][k]
            self.words[cl] = count

    def classify_smoothed(self):
        """
        :aim: Add-one smoothing for the word likelihoods
        :return: None
        """
        self.dict_likelihood = self.classification.copy()
        for cl in self.classes:
            for k in self.dict_likelihood[cl]:
                self.dict_likelihood[cl][k] = (self.dict_likelihood[cl][k] + 1) / float(self.words[cl] + len(self.vocab_unique))

    def test_probability(self):
        """
        :aim: Test data on training
        :return: None
        """
        # Number of documents, given a class
        for cl in self.classes:
            d = self.process.numberOfDocsGivenClass(cl, self.inputData)
            self.process.docs[cl] = d

        # prior probabilities
        self.priors = {}
        for cl in self.classes:
            if cl not in self.priors:
                self.priors[cl] = self.process.docs[cl] / float(self.number_of_docs)

        with open(self.test_file) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        test_data = content

        # add classes as keys to testing dict, values = prior probabilities
        for cl in self.classes:
            if cl not in self.testing:
                self.testing[cl] = self.priors[cl]

        for cl in self.classes:
            for w in test_data:
                for key in self.dict_likelihood[cl]:
                    if key != w:
                        continue
                    elif key == w:
                        self.testing[cl] *= self.dict_likelihood[cl][w]

        # Compute the most likely class
        self.result = max(self.testing, key=self.testing.get)

    def output(self):
        """
        :aim: Write final classification output to file
        :return: None
        """
        string = \
            "Total number of documents/lines in input: " + str(self.number_of_docs) + "\n" + \
            "Vocabulary of unique words: " + str(self.vocab_unique) + "\n" + \
            "Classes: " + str(self.classes) + "\n" + \
            "Count of words, given class: " + str(self.words) + "\n" + \
            "Word likelihoods with add-1 smoothing with respect to class: " + str(self.dict_likelihood) + "\n" + \
            "Prior probabilities: " + str(self.priors) + "\n" + \
            "Probabilities of test data: " + str(self.testing) + "\n" + \
            "The most likely class for the test document: " + self.result

        if self.train_file is not None:
            f = open("movies-small.txt", "w")
            f.write(string)
            f.close()
        else:
            # for multiple directory implementation
            pass
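A minimal usage sketch for the NB class above, not part of the original code: the file names are hypothetical placeholders, and the call order simply follows the methods defined in the class.

# Hypothetical usage; 'movies-train.txt' and 'movies-test.txt' are placeholder file names.
nb = NB(train="movies-train.txt", test="movies-test.txt")
nb.classify()           # per-class word counts
nb.count_words()        # total word count per class
nb.classify_smoothed()  # add-one smoothed likelihoods
nb.test_probability()   # priors, posteriors, and the most likely class
nb.output()             # write the report to movies-small.txt
print(nb.result)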