def convert(src, tgt, txt, nativize, preoptions, postoptions):
    """Transliterate ``txt`` from script ``src`` into script ``tgt``.

    Args:
        src: Name of the source script.
        tgt: Name of the target script.
        txt: Text to transliterate.
        nativize: When truthy, apply the target script defaults and strip
            diacritics from the converted text.
        preoptions: Iterable of ``PreProcess`` attribute names; each one is
            looked up and called on the text before conversion.
        postoptions: Iterable of ``PostProcess`` attribute names; each one is
            looked up and called on the converted text.

    Returns:
        The transliterated text after all post-processing steps.
    """
    txt = PreProcess.PreProcess(txt, src, tgt)

    # (option, script, variant): when the option is present and the script
    # matches, the script name is swapped for its variant.
    target_variants = (
        ('siddhamUnicode', 'Siddham', 'SiddhamUnicode'),
        ('LaoNative', 'Lao', 'Lao2'),
        ('egrantamil', 'Grantha', 'GranthaGrantamil'),
    )
    source_variants = (
        ('siddhamUnicode', 'Siddham', 'SiddhamUnicode'),
        ('egrantamil', 'Grantha', 'GranthaGrantamil'),
    )
    for option, script, variant in target_variants:
        if option in postoptions and tgt == script:
            tgt = variant
    for option, script, variant in source_variants:
        if option in preoptions and src == script:
            src = variant

    for step_name in preoptions:
        txt = getattr(PreProcess, step_name)(txt)

    result = Convert.convertScript(txt, src, tgt)

    if nativize:
        result = PostOptions.ApplyScriptDefaults(result, src, tgt)
        # Tamil has its own diacritic-removal routine.
        if tgt == 'Tamil':
            result = PostProcess.RemoveDiacriticsTamil(result)
        else:
            result = PostProcess.RemoveDiacritics(result)

    for step_name in postoptions:
        result = getattr(PostProcess, step_name)(result)

    return result
def preProc(self):
    """Replace ``self.df`` with the cleaned frame and notify the user.

    Builds a ``PreProcess`` object from the current ``self.df``, stores that
    object's ``df`` attribute back on ``self``, then shows an info dialog.
    """
    self.df = PreProcess(self.df).df

    # Alert the user that the step has finished.
    tkMessageBox.showinfo("K Means Clustering",
                          "Preprocessing completed successfully!")
def convert(src, tgt, txt, nativize, preoptions, postoptions):
    """Transliterate ``txt`` from script ``src`` into script ``tgt``.

    Some options select a variant script table (and, for the Ranjana
    variants, also switch nativization off) before the conversion runs.
    For Tamil -> IPA the locally computed result is replaced by the text
    returned from the remote anunaadam service.

    Args:
        src: Name of the source script.
        tgt: Name of the target script.
        txt: Text to transliterate.
        nativize: When truthy, apply the target script defaults and strip
            diacritics from the converted text.
        preoptions: Iterable of ``PreProcess`` attribute names; each one is
            looked up and called on the text before conversion.
        postoptions: Iterable of ``PostProcess`` attribute names; each one is
            looked up and called on the converted text.

    Returns:
        The transliterated text.

    Raises:
        requests.RequestException: If the Tamil -> IPA web request fails.
    """
    txt = PreProcess.PreProcess(txt, src, tgt)

    # Option-driven script variants.
    if 'siddhammukta' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamDevanagari'
    if 'siddhamap' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamDevanagari'
    if 'siddhammukta' in preoptions and src == 'Siddham':
        src = 'SiddhamDevanagari'
    if 'LaoNative' in postoptions and tgt == 'Lao':
        tgt = 'Lao2'
    if 'egrantamil' in preoptions and src == 'Grantha':
        src = 'GranthaGrantamil'
    if 'egrantamil' in postoptions and tgt == 'Grantha':
        tgt = 'GranthaGrantamil'
    if 'nepaldevafont' in postoptions and tgt == 'Newa':
        tgt = 'Devanagari'
    # Both Ranjana variants convert via Tibetan and disable nativization.
    if 'ranjanalantsa' in postoptions and tgt == 'Ranjana':
        tgt = 'Tibetan'
        nativize = False
    if 'ranjanawartu' in postoptions and tgt == 'Ranjana':
        tgt = 'Tibetan'
        nativize = False

    for options in preoptions:
        txt = getattr(PreProcess, options)(txt)

    transliteration = Convert.convertScript(txt, src, tgt)

    if nativize:
        transliteration = PostOptions.ApplyScriptDefaults(
            transliteration, src, tgt)
        # Tamil has its own diacritic-removal routine.
        if tgt != 'Tamil':
            transliteration = PostProcess.RemoveDiacritics(transliteration)
        else:
            transliteration = PostProcess.RemoveDiacriticsTamil(
                transliteration)

    for options in postoptions:
        transliteration = getattr(PostProcess, options)(transliteration)

    if src == "Tamil" and tgt == "IPA":
        # Let requests build the query string so that characters such as
        # '&', '#', '+' or '%' inside the text are percent-encoded instead
        # of truncating or corrupting the request.
        # NOTE(review): no timeout is set, so a stalled service blocks the
        # caller indefinitely - consider adding one.
        r = requests.get("http://anunaadam.appspot.com/api",
                         params={"text": txt, "method": "2"})
        r.encoding = r.apparent_encoding
        transliteration = r.text

    return transliteration
def chooseFile(self, item):
    """Load the file named by the clicked list item and reset the session.

    Records the row of ``item`` in ``self.itemIndex``, reads the article
    from ``repertory`` and splits it into ``self.sents`` (plain sentences
    when ``self.method == 1``, otherwise whatever ``getXMLsents`` returns),
    then resets the progress label, the output buffer and the save button.

    Args:
        item: The list-widget item that was selected; its ``text()`` is
            used as the file name.
    """
    chosen = item.text()

    # No early exit: if several rows carry the same text, the last one wins.
    for row in range(self.listWidget.count()):
        if self.listWidget.item(row).text() == chosen:
            self.itemIndex = row

    reader = PreProcess.PreProcess()
    content = reader.getArticleContent(repertory + "/" + chosen)

    if self.method == 1:
        self.sents = reader.getSents(content)
        total = len(self.sents)
    else:
        total, self.sents = reader.getXMLsents(content)

    self.labelRest.setText('0/' + str(total))
    self.file = chosen
    self.newSent = []
    self.pushButton_save.setDisabled(True)
def RunPreprocess():
    """Run every preprocessing stage in order, printing a banner before each."""
    # Each entry pairs the banner with a thunk, so a stage is only looked up
    # and invoked once all previous stages have finished.
    stages = [
        ("---PreProcess", lambda: PreProcess.PreProcess()),
        ("---PreProcess1", lambda: PreProcess1.PreProcess1()),
        ("---PreProcess2", lambda: PreProcess2.PreProcess2()),
        ("---PreProcess3", lambda: PreProcess3.PreProcess3()),
        ("---PreProcess4,40", lambda: PreProcess4.PreProcess4(40)),
        ("---PreProcess4,30", lambda: PreProcess4.PreProcess4(30)),
        ("---PreProcess4Base,40", lambda: PreProcess4Base.PreProcess4Base(40)),
        ("---PreProcess4Base,30", lambda: PreProcess4Base.PreProcess4Base(30)),
    ]
    for banner, run_stage in stages:
        # Parenthesised single-argument form prints the same text under
        # both the print statement and the print function.
        print(banner)
        run_stage()
float(line.rstrip('\n')) for line in open('csvData/train_label.data') ] rawData = open('csvData/test.data', 'rb') temp = np.loadtxt(rawData, delimiter=',') testset = np.c_[np.ones(len(temp)), temp] test_labels = [ float(line.rstrip('\n')) for line in open('csvData/test_label.data') ] ### # pre-process PP = PreProcess.PreProcess(data, n_buckets=10, func='boolean') #,swap_labels=True) data = PP.fit(data) testset = PP.fit(testset) data_labels = PP.processLabels(data_labels) test_labels = PP.processLabels(test_labels) # cross-validation best_C = 2 best_ro = 0.01 best_accuracy = 0 best_epoch = 10 best_g0 = 1.001 ''' for C in [4,2,0.5,0.25,0.125]:#,0.0625,0.03125]:
] f = open('csvData/test.data') temp = [] for line in f: temp.append(line.rstrip().split(',')) testset = temp test_labels = [ float(line.rstrip('\n')) for line in open('csvData/test_label.data') ] ### # pre-process PP = PreProcess.PreProcess(data, n_buckets=5) data = PP.fit(data) testset = PP.fit(testset) # cross-validation best_depth = 3 best_f1 = 0 for d in [10, 20, 30, 40, 50, float('inf')]: tmp = [] dt = DecisionTree.DecisionTree(max_depth=d) kfold = KFold.KFold(n_splits=5) for kf in kfold.split(data): train = [data[i] for i in kf[0]]
def processing(self, event, source_object=None):
    """Handle a click on the choose / keep / remove / skip buttons.

    The current position in ``self.sents`` lives in the module-level
    ``index``. "choose" resets it to 0 and shows the first sentence. Any
    other button re-reads the current sentence, records the decision in
    ``self.newSent``, advances ``index`` and shows the next sentence, or
    enables saving once every sentence has been handled.

    Args:
        event: Not referenced in this method.
        source_object: The widget that triggered the call; its
            ``objectName()`` selects the action. ``objectName()`` is called
            unconditionally, so the ``None`` default cannot actually be used.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source - confirm against the original file.
    """
    # Lock the file picker while a file is being processed.
    self.pushButton_choose.setDisabled(True)
    self.pushButton_keep.setVisible(True)
    self.pushButton_remove.setVisible(True)
    self.listWidget.setDisabled(True)
    global index
    preProcess = PreProcess.PreProcess()
    if source_object.objectName() == "pushButton_choose":
        # Start of a new file: rewind and show the first sentence.
        index = 0
        self.textEdit.clear()
        if self.method == 1:
            sent = self.sents[index]
        else:
            # In this mode each entry is indexed as [0] (passed to
            # getXMLtext) and [1] (a flag compared with 0).
            sent = preProcess.getXMLtext(self.sents[index][0])
            if self.sents[index][1] == 0:
                # Flag 0: only "skip" is offered.
                self.pushButton_skip.setVisible(True)
                self.pushButton_keep.setVisible(False)
                self.pushButton_remove.setVisible(False)
            else:
                self.pushButton_skip.setVisible(False)
                self.pushButton_keep.setVisible(True)
                self.pushButton_remove.setVisible(True)
        self.textEdit.setText(sent)
    else:
        # Re-read the sentence the user has just decided on.
        sent = ""
        if self.method == 1:
            sent = self.sents[index]
        else:
            sent = preProcess.getXMLtext(self.sents[index][0])
            if self.sents[index][1] == 0:
                self.pushButton_skip.setVisible(True)
                self.pushButton_keep.setVisible(False)
                self.pushButton_remove.setVisible(False)
            else:
                self.pushButton_skip.setVisible(False)
                self.pushButton_keep.setVisible(True)
                self.pushButton_remove.setVisible(True)
        self.textEdit.setText(sent)
        # Progress counter: number of sentences handled so far / total.
        self.labelRest.setText(str(index + 1) + '/' + str(len(self.sents)))
        # Record the decision: operation 'S' for keep, 'R' for remove;
        # skipped entries are stored as the stringified raw element.
        if source_object.objectName() == "pushButton_keep":
            if self.method == 1:
                self.newSent.append("<source id='" + str(index + 1) +
                                    "' operation='S'>\n" + sent +
                                    "\n</source>")
            else:
                self.newSent.append("<source id='" + str(index + 1) +
                                    "' operation='S'>" + sent + "</source>")
        elif source_object.objectName() == "pushButton_remove":
            if self.method == 1:
                self.newSent.append("<source id='" + str(index + 1) +
                                    "' operation='R'>\n" + sent +
                                    "\n</source>")
            else:
                self.newSent.append("<source id='" + str(index + 1) +
                                    "' operation='R'>" + sent + "</source>")
        elif source_object.objectName() == "pushButton_skip":
            self.newSent.append(str(self.sents[index][0]))
        index += 1
        if index < len(self.sents):
            # Show the next sentence and pick the matching button set.
            if self.method == 1:
                sent = self.sents[index]
            else:
                sent = preProcess.getXMLtext(self.sents[index][0])
                if self.sents[index][1] == 0:
                    self.pushButton_skip.setVisible(True)
                    self.pushButton_keep.setVisible(False)
                    self.pushButton_remove.setVisible(False)
                else:
                    self.pushButton_skip.setVisible(False)
                    self.pushButton_keep.setVisible(True)
                    self.pushButton_remove.setVisible(True)
            self.textEdit.setText(sent)
        if index >= len(self.sents):
            # Every sentence handled: allow saving, hide the decision buttons.
            self.pushButton_save.setDisabled(False)
            self.pushButton_keep.setVisible(False)
            self.pushButton_remove.setVisible(False)
            self.pushButton_skip.setVisible(False)
predict_tmp = win.predict(test) tmp.append(Stat.F1_Score(predict_tmp,test_label)) if np.mean(tmp) > best_f1: best_f1 = np.mean(tmp) best_bucket = b best_param = p print("Best result so far >>",best_f1,best_bucket,best_param) print("best bucket:", best_bucket) print("best param:" , best_param) ''' ### PP = PreProcess.PreProcess(data, n_buckets=best_bucket, func='boolean') data = PP.fit(data) testset = PP.fit(testset) #data_labels = PP.processLabels(data_labels) #test_labels = PP.processLabels(test_labels) unbalanced = Winnow.Winnow(param=best_param) unbalanced.fit(data, data_labels) predictTrain = unbalanced.predict(data) predictTest = unbalanced.predict(testset) print("unbalanced:")
# coding: utf-8

# In[6]:

import pickle
import sys

import numpy as np
import pandas as pd

from PreProcess import *

# Script: load the saved model, predict a winner for every row of the test
# CSV and write the predictions to predictions.csv.

SavedModelFile = "model.pkl"
filePath = "test_potus_by_county.csv"

X = pd.read_csv(filePath)
X = PreProcess(X)

try:
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load model files produced by a trusted build_model.py run.
    with open(SavedModelFile, 'rb') as f:
        model = pickle.load(f)
except (IOError, OSError):
    # Only a missing/unreadable file is reported as "not found"; a corrupt
    # pickle or Ctrl-C now surfaces instead of being masked by this message.
    print("Did not find a saved model, please run build_model.py")
    # sys.exit does not depend on the site module, and the non-zero status
    # tells the caller that no predictions were written.
    sys.exit(1)

predict = list(model.predict(X))

# The file is only written, so plain 'w' is sufficient.
with open('predictions.csv', 'w') as f:
    f.write("Winner\n")
    for pred in predict:
        f.write(pred + "\n")
# Capture times are compared as exact strings against
# "<hour>:<minute>:<second>" built from datetime.now() without zero
# padding, so values must be given in that form (e.g. 17:47:0).
parser = argparse.ArgumentParser()
parser.add_argument('--morning',
                    help='Morning capture time as H:M:S without zero padding',
                    default='17:47:0')
parser.add_argument('--afternoon',
                    help='Afternoon capture time as H:M:S without zero padding',
                    default='11:38:0')
parser.add_argument('--evening',
                    help='Evening capture time as H:M:S without zero padding',
                    default='11:39:0')
args = parser.parse_args()

set_time = [args.morning, args.afternoon, args.evening]
name = ''

# NOTE(review): the loop polls without pausing when 'input' is empty, so a
# capture can fire several times within the matching second; when files are
# pending, the 3 s sleep per file can step over a scheduled second entirely.
while True:
    files = os.listdir('input')
    preProcess = PreProcess()
    index_new = preProcess.nameImage()
    d = datetime.now()
    cur_time = str(d.hour) + ':' + str(d.minute) + ':' + str(d.second)

    # capture: one capture per schedule entry that equals the current time
    for scheduled in set_time:
        if cur_time == scheduled:
            preProcess.captureImage()

    # resize & name: process every pending file under consecutive indices
    for pending in files:
        sleep(3)
        test = PreProcess(file=pending, name=index_new)
        test.preImages()
        index_new = index_new + 1
] f = open('csvData/test.data') temp = [] for line in f: temp.append(line.rstrip().split(',')) testset = temp test_labels = [ float(line.rstrip('\n')) for line in open('csvData/test_label.data') ] ### # pre-process PP = PreProcess.PreProcess(data, func='boolean', n_buckets=5) data = PP.fit(data) testset = PP.fit(testset) ''' data = [[0,0,1,1],[0,0,1,0],[0,0,0,0],[1,0,1,0],[1,0,1,1],[1,1,1,1]] data_labels = [0,1,2,3,4,5] testset = [[1,0,1,1]] ''' # cross-validation best_norm = 2 best_K = 3 best_f1 = 0 ''' for k in [1,3,5,7]: for p in [1,2,3]:
(x_train, x_test) = SplitSet(x, fold_idx) if validation == True: y_predictions = np.zeros((y_test.shape[0], 1), dtype=np.int32) for sample_proportion_idx in range(int(SAMPLE_PROPORTION_SIZE)): sample_proportion = SAMPLE_PROPORTION_LOWER + sample_proportion_idx * SAMPLE_PROPORTION_INTERVAL for attribute_size_idx in range(VALIDATION_ATTRIBUTE_SIZE): attribute_size = VALIDATION_ATTRIBUTE_SIZE_LOWER + attribute_size_idx * VALIDATION_ATTRIBUTE_SIZE_INTERVAL forestList = [] for class_idx in range(NUM_CLASSES): class_num = class_idx + 1 forestList.append([]) for tree_idx in range(NUM_TREES_IN_FOREST): x_sample, y_sample = sample( x_train, y_train, sample_proportion) train_targets = PreProcess(y_sample, class_num) attribute_set = sample_attributes( range(x.shape[1]), attribute_size) tree = Decision_Tree_Learning( x_sample, attribute_set, train_targets) forestList[class_idx].append(tree) vote_block = np.zeros((x_test.shape[0], NUM_CLASSES)) for forest_idx in range(NUM_CLASSES): choices = decision_forest_vote(forestList[forest_idx], x_test) vote_block[:, forest_idx] = choices y_test_predictions = np.zeros((x_test.shape[0], 1))
with io.open(filename, 'w', encoding="utf-8") as csvfile: # creating a csv writer object csvwriter = csv.writer(csvfile) # writing the fields csvwriter.writerow(fields) # writing the data rows csvwriter.writerows(rows) return None if __name__ == '__main__': # arr = {1:2,3:4,5:1} # print(max(arr.values())) preProcess = PreProcess.PreProcess() eval = Evaluation.Evaluation() # for n in range(40, 60, 2): kmeans = Kmaens.Kmeans(n, preProcess.vectorize_tf_idf()) print(eval.purity(n, kmeans.y, preProcess.labels)) # data_vectors_tf_idf = preProcess.vectorize_tf_idf() # data_vectors_wv = preProcess.word2wec() # optimal_n = len(set(preProcess.labels)) # # Gaussian Mixture Model # print("Gaussian Mixture Model(tf-idf):") # gmm = GMMCluster.GMMCluster(data_vectors_tf_idf[:100], 5) # cluster = gmm.cluster("tf-idf") # print("ARI= ", eval.adjusted_rand_index(preProcess.labels[:100], cluster))
stacked_sample_error_occ = 0 stacked_sample_error_depth = 0 stacked_sample_error_depth_min = 0 stacked_sample_error_occ = 0 for fold_idx in range(0, NUM_FOLDS): # fold fold_num = fold_idx + 1 (y_train_validate, y_test) = SplitSet(y, fold_idx) y_predictions = np.zeros((y_test.shape[0], 1), dtype=np.int32) (x_train_validate, x_test) = SplitSet(x, fold_idx) treeList = [] occurrences = CountOccurrence(y_train_validate, NUM_CLASSES) for tree_class_idx in range(0, NUM_CLASSES): class_num = tree_class_idx + 1 train_targets = PreProcess(y_train_validate, class_num) tree = Decision_Tree_Learning(x_train_validate, range(x_train_validate.shape[1]), train_targets, None) # if class_num != 1: # tree = Decision_Tree_Learning(x_train_validate,range(x_train_validate.shape[1]),train_targets,max_depth) # else: # tree = Decision_Tree_Learning(x_train_validate,range(x_train_validate.shape[1]),train_targets,10) treeList.append(tree) predictions_by_depth = TestTreesByDepth(treeList, x_test, ambiguityHandlingStyle) predictions_by_min_depth = TestTreesByMinDepth(treeList, x_test, ambiguityHandlingStyle) predictions_by_occ = TestTreesByOccurrence(treeList, occurrences, x_test, ambiguityHandlingStyle)
def analyzing(self):
    """Run TFLite object detection over the configured image(s).

    For each image: runs the interpreter, draws a box and label for every
    detection whose score is above ``self.min_conf_threshold``, counts the
    detections as leaf / flower / melon, uploads the counts through
    ``DbFirebase``, shows the annotated image and writes it to
    ``<cwd>/analyze/<index>.jpg``.

    Reads ``self.IM_NAME``, ``self.IM_DIR``, ``self.MODEL_NAME``,
    ``self.GRAPH_NAME``, ``self.LABELMAP_NAME`` and
    ``self.min_conf_threshold``; sets ``self.IM_DIR`` to ``'new'`` when
    neither an image nor a folder is configured. Calls ``sys.exit()`` when
    both are configured.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source - confirm against the original file.
    """
    CWD_PATH = os.getcwd()
    output_path = os.path.join(CWD_PATH, 'analyze')
    preProcess = PreProcess()
    # Starting number for the output file names.
    index = preProcess.nameImage('analyze')

    # If both an image AND a folder are specified, report it and exit
    if (self.IM_NAME and self.IM_DIR):
        print('you can only use IM_NAME OR IM_DIR')
        sys.exit()

    # If neither an image nor a folder is specified, default to the 'new' folder
    if (not self.IM_NAME and not self.IM_DIR):
        self.IM_DIR = 'new'

    # Import TensorFlow libraries
    # If tensorflow is not installed, import interpreter from tflite_runtime, else import from regular tensorflow
    pkg = importlib.util.find_spec('tensorflow')
    if pkg is None:
        from tflite_runtime.interpreter import Interpreter
    else:
        from tensorflow.lite.python.interpreter import Interpreter

    # Define path to images and grab all image filenames
    if self.IM_DIR:
        PATH_TO_IMAGES = os.path.join(CWD_PATH, self.IM_DIR)
        images = glob.glob(PATH_TO_IMAGES + '/*')
    elif self.IM_NAME:
        PATH_TO_IMAGES = os.path.join(CWD_PATH, self.IM_NAME)
        images = glob.glob(PATH_TO_IMAGES)

    # Path to .tflite file, which contains the model that is used for object detection
    PATH_TO_CKPT = os.path.join(CWD_PATH, self.MODEL_NAME, self.GRAPH_NAME)

    # Path to label map file
    PATH_TO_LABELS = os.path.join(CWD_PATH, self.MODEL_NAME,
                                  self.LABELMAP_NAME)

    # Load the label map
    with open(PATH_TO_LABELS, 'r') as f:
        labels = [line.strip() for line in f.readlines()]

    # Have to do a weird fix for label map if using the COCO "starter model" from
    # https://www.tensorflow.org/lite/models/object_detection/overview
    # First label is '???', which has to be removed.
    if labels[0] == '???':
        del (labels[0])

    # Load the Tensorflow Lite model.
    interpreter = Interpreter(model_path=PATH_TO_CKPT)
    interpreter.allocate_tensors()

    # Get model details
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    height = input_details[0]['shape'][1]
    width = input_details[0]['shape'][2]
    print(width, height)

    floating_model = (input_details[0]['dtype'] == np.float32)

    # Constants used to normalise pixel values for a floating-point model.
    input_mean = 127.5
    input_std = 127.5

    # Loop over every image and perform detection
    for image_path in images:
        # Per-image detection counters.
        leaf = flower = melon = 0

        # Load image and resize to expected shape [1xHxWx3]
        # NOTE(review): cv2.imread returns None for unreadable files, which
        # would make the cvtColor call below fail - confirm inputs are images.
        image = cv2.imread(image_path)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        imH, imW, _ = image.shape
        image_resized = cv2.resize(image_rgb, (width, height),
                                   interpolation=cv2.INTER_AREA)
        input_data = np.expand_dims(image_resized, axis=0)

        # Normalize pixel values if using a floating model (i.e. if model is non-quantized)
        if floating_model:
            input_data = (np.float32(input_data) - input_mean) / input_std

        # Perform the actual detection by running the model with the image as input
        interpreter.set_tensor(input_details[0]['index'], input_data)
        interpreter.invoke()

        # Retrieve detection results
        boxes = interpreter.get_tensor(output_details[0]['index'])[
            0]  # Bounding box coordinates of detected objects
        classes = interpreter.get_tensor(output_details[1]['index'])[
            0]  # Class index of detected objects
        scores = interpreter.get_tensor(output_details[2]['index'])[
            0]  # Confidence of detected objects
        #num = interpreter.get_tensor(output_details[3]['index'])[0]  # Total number of detected objects (inaccurate and not needed)

        # Loop over all detections and draw detection box if confidence is above minimum threshold
        for i in range(len(scores)):
            if ((scores[i] > self.min_conf_threshold) and (scores[i] <= 1.0)):

                # Get bounding box coordinates and draw box
                # Interpreter can return coordinates that are outside of image dimensions, need to force them to be within image using max() and min()
                ymin = int(max(1, (boxes[i][0] * imH)))
                xmin = int(max(1, (boxes[i][1] * imW)))
                ymax = int(min(imH, (boxes[i][2] * imH)))
                xmax = int(min(imW, (boxes[i][3] * imW)))

                cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (10, 255, 0),
                              2)

                # Draw label
                object_name = labels[int(
                    classes[i]
                )]  # Look up object name from "labels" array using class index
                label = '%s: %d%%' % (object_name, int(scores[i] * 100)
                                      )  # Example: 'person: 72%'
                labelSize, baseLine = cv2.getTextSize(
                    label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)  # Get font size
                label_ymin = max(
                    ymin, labelSize[1] + 10
                )  # Make sure not to draw label too close to top of window
                cv2.rectangle(
                    image, (xmin, label_ymin - labelSize[1] - 10),
                    (xmin + labelSize[0], label_ymin + baseLine - 10),
                    (255, 255, 255),
                    cv2.FILLED)  # Draw white box to put label text in
                cv2.putText(image, label, (xmin, label_ymin - 7),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0),
                            2)  # Draw label text

                # Tally the detection; any label other than 'leaf' or
                # 'flower' is counted as a melon.
                if (object_name == 'leaf'):
                    leaf = leaf + 1
                elif (object_name == 'flower'):
                    flower = flower + 1
                else:
                    melon = melon + 1

        # All the results have been drawn on the image: report the counts,
        # upload them, then display and save the annotated image
        print('image', index, ':')
        print('leaf:', leaf)
        print('flower:', flower)
        print('melon:', melon)
        uploadToFirebase = DbFirebase(leaves=leaf,
                                      flowers=flower,
                                      melons=melon)
        uploadToFirebase.add()
        cv2.imshow('Object detector', image)
        out = os.path.join(output_path, str(index) + ".jpg")
        cv2.imwrite(out, image)
        index = index + 1

        # waitKey(1) waits at most 1 ms and its return value is ignored, so
        # the loop moves on to the next image without needing a key press
        cv2.waitKey(1)

    # NOTE(review): assumed to run once after all images are processed -
    # confirm it is not meant to run per image.
    preProcess.moveImage()

    # Clean up
    cv2.destroyAllWindows()