import gzip
import os
import pickle

import numpy as np
from sklearn import neighbors as skl  # the code below refers to KDTree as skl.KDTree
from sklearn.ensemble import RandomForestClassifier

# project-local modules used throughout this file
import feature_generation
import inkml_parser
import preprocessing
from inkml_parser import parse_ink_dir  # assumption: parse_ink_dir lives in inkml_parser


def parse(dir, out_fname):
    '''
    Parse all the inkml files in "dir", and create a csv file (out_fname)
    containing one symbol per line
    :param dir: directory containing inkml files
    :param out_fname: the .csv file to which the program writes all samples' features
    :return:
    '''
    # get parsed samples dictionary: {"label_id": {"strokes": strokes, "label": label}}
    samples = parse_ink_dir(dir)
    with open(out_fname, 'w') as records:
        for label_id in samples:
            sample = samples[label_id]
            # preprocessing:
            # remove duplicates, smoothing, size normalization, resampling
            preprocessing.process_sample(sample)
            # generate features for every point in the sample:
            # cosine of vicinity, normalized y-coordinate, sine of curvature
            features = feature_generation.gen_feature_vector(sample)
            for x in features:
                records.write(str(x) + ',')
            records.write(label_id + ',')
            records.write(sample['label'])
            records.write('\n')
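# Example (a sketch with hypothetical paths): build a training CSV from a
# directory of .inkml files. Each output row is
# feature_1,...,feature_n,label_id,label.
#
#   parse('./trainingSymbols', 'train_features.csv')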
def get_symbol_label(clf, strokes):
    '''
    Gets the label as predicted by the symbol classifier
    :param clf: classifier model
    :param strokes: a list of strokes representing the symbol
    :return: predicted label
    '''
    strokes_lst = list()
    for s in strokes:
        strokes_lst.append({"x": s[:, 0], "y": s[:, 1]})
    # get symbol features
    preprocessing.resample(strokes_lst)
    feature_vec = feature_generation.gen_feature_vector(strokes_lst)
    label = clf.predict([feature_vec])[0]
    # a predicted ',' would break the comma-separated output format,
    # so map it to the literal string "COMMA"
    return "COMMA" if label == "," else label
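# Example (a sketch): each stroke is assumed to be an Nx2 numpy array of
# (x, y) points, matching the s[:, 0] / s[:, 1] slicing above; 'clf' is a
# trained classifier such as the random forest produced by train_classifier.
#
#   stroke = np.array([[0.0, 0.0], [0.5, 0.4], [1.0, 1.0]])
#   label = get_symbol_label(clf, [stroke])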
def rf_test(model_pickle, abs_dir_path, symbols_to_test):
    '''
    Test the Random Forest model using the .inkml symbols listed in "symbols_to_test"
    :param model_pickle: the pickled Random Forest model
    :param abs_dir_path: absolute path to a parent directory where all .inkml files
                         (valid, junk, other) can be found
    :param symbols_to_test: a .csv file containing filenames (.inkml) of symbols to test
    :return:
    '''
    # train_classifier stores the model gzip-compressed, so load it with gzip
    with gzip.open(model_pickle, 'rb') as rf_pickle:
        rf = pickle.load(rf_pickle)
    # walk the directory and its subdirectories recursively and build a
    # {filename: absolute path} dictionary
    file_dictionary = dict()
    for (dirpath, dirnames, filenames) in os.walk(abs_dir_path):
        for file in filenames:
            file_dictionary[file] = os.path.join(dirpath, file)
    with open(symbols_to_test, 'r') as symbols_list:
        features = []
        label_ids = []
        # read each symbol file, parse, process, generate features
        for line in symbols_list:
            fname = line.strip()
            abs_fpath = file_dictionary[fname]
            data = inkml_parser.parse_file(abs_fpath)
            if data is not None:
                strokes = data["stroke_list"]
                ui = data["label_id"]
                preprocessing.process_sample(strokes)
                feature_vec = feature_generation.gen_feature_vector(strokes)
                features.append(feature_vec)
                label_ids.append(ui)
    test_features = np.array(features)
    class_lst = rf.classes_
    # predict, for each sample, the probability of it being in each class
    cls_probabilities = rf.predict_proba(test_features)
    # sort probabilities in descending order and get the class indices
    indices = np.argsort(-cls_probabilities)
    # retain the top 30 predictions for every sample
    indices = indices[:, :30]
    n_r, n_c = indices.shape
    inp_fname = os.path.split(symbols_to_test)[-1]
    with open('rf_output_' + inp_fname, 'w') as results_file:
        for r in range(n_r):
            # write the label id of this sample, then its predictions
            results_file.write(label_ids[r] + ',')
            unique_labels = set()
            # write the top ten unique predicted labels
            for c in range(n_c):
                idx = indices[r, c]
                # the label of the class at this index in the classifier's class list
                label = class_lst[idx]
                if label not in unique_labels:
                    unique_labels.add(label)
                    results_file.write(label + (',' if c < n_c - 1 else ''))
                if len(unique_labels) == 10:
                    break
            results_file.write('\n')
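# Example (a sketch with hypothetical filenames): evaluate a pickled random
# forest on a held-out symbol list; the top-10 predictions per symbol are
# written to 'rf_output_<input filename>'.
#
#   rf_test('rf_v.pklz', '/data/CROHME', 'test_symbols.csv')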
def train_classifier(model, ds_type, gt_fname, abs_dir_path, symbols_to_train):
    '''
    Trains the specified model (kdtree or random forest) using the symbols
    listed in the "symbols_to_train" file
    :param model: string - 'kdtree' or 'rf' (random forest)
    :param ds_type: string - 'v' for valid symbols or 'v_j' for valid+junk symbols
    :param gt_fname: name of the file containing "Annotation,label" records
                     (the ground truth of the training set "symbols_to_train")
    :param abs_dir_path: absolute path to the directory where the .inkml files
                         are stored (it can be a root directory)
    :param symbols_to_train: .csv file containing the .inkml filenames of symbol
                             files on which the model is trained
    :return:
    '''
    # read the ground truth file
    with open(gt_fname, 'r') as gt_file:
        label_map = {}
        for line in gt_file:
            line = line.strip().split(",")
            # handle the case where ',' itself is the label
            if len(line) == 3:
                label_id, label = line[0], ","
            else:
                label_id, label = line
            label_map[label_id] = label
    # walk the directory and its subdirectories recursively and build a
    # {filename: absolute path} dictionary
    file_dictionary = dict()
    for (dirpath, dirnames, filenames) in os.walk(abs_dir_path, followlinks=True):
        for file in filenames:
            file_dictionary[file] = os.path.join(dirpath, file)
    with open(symbols_to_train, 'r') as symbols_list:
        features = []
        label_ids = []
        labels = []
        for line in symbols_list:
            fname = line.strip()
            if not fname.endswith('.inkml'):
                continue
            # find the file's absolute path in the dictionary of all files
            abs_fpath = file_dictionary[fname]
            data = inkml_parser.parse_file(abs_fpath)
            strokes = data["stroke_list"]
            ui = data["label_id"]
            preprocessing.process_sample(strokes)
            feature_vec = feature_generation.gen_feature_vector(strokes)
            features.append(feature_vec)
            label_ids.append(ui)
            labels.append(label_map[ui])
    print('Data cleaning and prep complete')
    X = np.array(features)
    classifier_model = None
    if model == 'kdtree':
        # leaf_size is the minimum number of samples in a given node
        tree = skl.KDTree(X, leaf_size=60)  # building the KDTree is the training step
        classifier_model = tree
    elif model == 'rf':
        print('Training RandomForest classifier')
        rf = RandomForestClassifier(n_estimators=100, max_depth=20,
                                    min_samples_split=10)
        rf.fit(X, labels)
        classifier_model = rf
        print('Training complete')
    # store the trained model as a gzip-compressed pickle
    output_pickle_fname = model + '_' + ds_type + '.pklz'
    with gzip.open(output_pickle_fname, 'wb') as model_file:
        pickle.dump(classifier_model, model_file)
    print('Trained', model, 'classifier model stored in file:', output_pickle_fname)
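# Example (a sketch with hypothetical filenames): train a random forest on the
# valid-symbols split; the model is written to 'rf_v.pklz'.
#
#   train_classifier('rf', 'v', 'train_GT.csv', '/data/CROHME', 'train_symbols.csv')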
def kdtree_test(model_pickle, tr_gt, abs_dir_path, symbols_to_test):
    '''
    Test the KDTree model using the .inkml symbols listed in "symbols_to_test"
    :param model_pickle: the pickled KDTree model
    :param tr_gt: ground truth file of the training symbols which were used to
                  train this model
    :param abs_dir_path: absolute path to a parent directory where all .inkml
                         files (valid, junk, other) can be found
    :param symbols_to_test: a .csv file containing filenames (.inkml) of symbols to test
    :return:
    '''
    # train_classifier stores the model gzip-compressed, so load it with gzip
    with gzip.open(model_pickle, 'rb') as kdtree_model:
        tree = pickle.load(kdtree_model)
    # training labels, in the same row order as the matrix the tree was built on
    training_labels = []
    with open(tr_gt, 'r') as training_GT:
        for line in training_GT:
            fields = line.strip().split(',')
            # handle the case where ',' itself is the label
            label = ',' if len(fields) == 3 else fields[1]
            training_labels.append(label)
    num_neighbours = 30
    # walk the directory and its subdirectories recursively and build a
    # {filename: absolute path} dictionary
    file_dictionary = dict()
    for (dirpath, dirnames, filenames) in os.walk(abs_dir_path):
        for file in filenames:
            file_dictionary[file] = os.path.join(dirpath, file)
    with open(symbols_to_test, 'r') as symbols_list:
        features = []
        label_ids = []
        # read each symbol file, parse, process, generate features
        for line in symbols_list:
            fname = line.strip()
            abs_fpath = file_dictionary[fname]
            data = inkml_parser.parse_file(abs_fpath)
            if data is not None:
                strokes = data["stroke_list"]
                ui = data["label_id"]
                preprocessing.process_sample(strokes)
                feature_vec = feature_generation.gen_feature_vector(strokes)
                features.append(feature_vec)
                label_ids.append(ui)
    test_features = np.array(features)
    # query the tree for the k nearest neighbours
    dist, indices = tree.query(test_features, k=num_neighbours)
    n_r, n_c = indices.shape
    inp_fname = os.path.split(symbols_to_test)[-1]
    with open('kdtree_output_' + inp_fname, 'w') as results_file:
        for r in range(n_r):
            # write the label id of this sample, then its predictions
            results_file.write(label_ids[r] + ',')
            unique_labels = set()
            # write the top ten unique labels among the nearest neighbours
            for c in range(n_c):
                idx = indices[r, c]
                # the label of the training sample at this index
                label = training_labels[idx]
                if label not in unique_labels:
                    unique_labels.add(label)
                    results_file.write(label + (',' if c < n_c - 1 else ''))
                if len(unique_labels) == 10:
                    break
            results_file.write('\n')
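# A minimal end-to-end driver (a sketch: every file and directory name below is
# a hypothetical placeholder, not something shipped with this module).
if __name__ == '__main__':
    # train a KDTree model on the valid-symbols split, then query it with a
    # held-out symbol list; predictions go to 'kdtree_output_test_symbols.csv'
    train_classifier('kdtree', 'v', 'train_GT.csv', '/data/CROHME', 'train_symbols.csv')
    kdtree_test('kdtree_v.pklz', 'train_GT.csv', '/data/CROHME', 'test_symbols.csv')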