def read_files(directory):
    """
    Read every inkml file in *directory*, extract per-symbol features,
    and save the resulting feature matrix to "<top-dir>.csv".

    :param directory: path to a directory of .inkml files
    :return: None (side effect: writes a CSV named after the directory)
    """
    files = os.listdir(directory)
    print(len(files))
    pre = Preprocessing()
    feature_matrix = []
    total = len(files)
    completed = 0
    gt_c = 0  # running count of ground-truth annotations seen
    for file in files:
        print("Processing file : ", file, " Remaining files : ", total - completed, " Completed files : ", completed)
        # FIX: use a context manager so each file handle is closed;
        # the original opened one handle per file and never closed any.
        with open(os.path.join(directory, file)) as f:
            soup = bs.BeautifulSoup(f, 'html.parser')
        trace_groups = soup.find_all('tracegroup')
        # trace_groups[0] is the whole-expression group; skip it and
        # iterate the per-symbol groups.
        for tracegroup in trace_groups[1:]:
            traceview = tracegroup.find_all('traceview')
            trace_id = []
            for t in traceview:
                trace_id.append(t['tracedataref'])
            gt = tracegroup.annotation.text  # ground-truth symbol label
            gt_c += 1
            X = []
            Y = []
            # Collect the coordinates of every stroke in this symbol.
            for id in trace_id:
                traces = soup.findAll("trace", {'id': id})
                for trace in traces:
                    coords = trace.text.strip().split(",")
                    x = []
                    y = []
                    for coord in coords:
                        trace_parts = coord.strip().split(' ')
                        x.append(float(trace_parts[0]))
                        y.append(float(trace_parts[1]))
                    X.extend(x)
                    Y.extend(y)
            X, Y = pre.dopreprocess(x=X, y=Y)
            ar = pre.get_aspect(X, Y)
            pen = len(trace_id)  # number of pen strokes in the symbol
            feature_matrix.append(extract_features(X, Y, pen, ar, key=gt))
        completed += 1
    df = pd.DataFrame(feature_matrix)
    print("Shape of Matrix ", df.shape, " Total Ground truths in file", gt_c)
    # CSV is named after the first path component of the directory.
    name = directory.strip().split("/")[0]
    df.to_csv(name + ".csv", index=False)
def perfectly_segmented_parser(ink_dir, bonus=False):
    """
    Parse perfectly segmented symbols from inkml files, classify the
    spatial relations between symbols, and write one .lg file per input.

    :param ink_dir: inkml directory
    :param bonus: if True, load the bonus relationship classifier
    :return: None (side effect: writes .lg files into "<top-dir>_output_lg")
    """
    start = time.time()
    # BUG FIX: the original read `dir.strip()`, calling .strip() on the
    # builtin `dir` function (AttributeError). The parameter is `ink_dir`.
    lg_dir = ink_dir.strip().split("/")[0] + "_output_lg"
    if not os.path.exists(lg_dir):
        os.mkdir(lg_dir)
    ink_files = os.listdir(ink_dir)
    if bonus:
        print("Loaded Bonus classifier")
        clf = joblib.load("relation_classifier_bonus.pkl")
    else:
        print("Loaded relationship classifier")
        clf = joblib.load('relation_classifier4.pkl')
    pre = Preprocessing()
    total = len(ink_files)
    c = 0
    gt_c = 0  # running count of ground-truth annotations seen
    for file in ink_files:
        print("Processing file : ", file, " Files remaining : ", total - c, " Files completed : ", c)
        # FIX: context manager closes the handle (original leaked it).
        with open(os.path.join(ink_dir, file)) as f:
            soup = bs.BeautifulSoup(f, 'html.parser')
        trace_groups = soup.find_all('tracegroup')
        symbol_list = []
        # loop to isolate symbols (index 0 is the whole-expression group)
        for tracegroup in trace_groups[1:]:
            traceview = tracegroup.find_all('traceview')
            trace_id = []
            # loop to get strokes in a single symbol
            for t in traceview:
                trace_id.append(t['tracedataref'])
            gt = tracegroup.annotation.text
            gt_c += 1
            X = []
            Y = []
            # extract stroke coordinates; one sub-list per stroke
            for id in trace_id:
                traces = soup.findAll("trace", {'id': id})
                for trace in traces:
                    coords = trace.text.strip().split(",")
                    x = []
                    y = []
                    for coord in coords:
                        trace_parts = coord.strip().split(' ')
                        x.append(float(trace_parts[0]))
                        y.append(float(trace_parts[1]))
                    X.append(x)
                    Y.append(y)
            X, Y = pre.dopreprocess(x=X, y=Y, parser=True)
            if gt == ",":
                # a literal comma would break the comma-separated .lg format
                gt = "COMMA"
            sym_obj = Symbol(x=X, y=Y, label=gt, stroke_id=trace_id)
            symbol_list.append(sym_obj)
        # Assign each symbol its occurrence index among same-labelled symbols.
        symbol_count = {}
        for sym in symbol_list:
            if sym.symbol not in symbol_count:
                symbol_count[sym.symbol] = 1
            else:
                symbol_count[sym.symbol] += 1
            sym.sym_ct = symbol_count[sym.symbol]
        # perform line of sight
        graph, labels = line_of_sight(symbol_list, clf)
        # run edmonds on los graph
        relations = edmonds(graph)
        # write result to lg
        write_to_lg(file=file, symbol_list=symbol_list, relations=relations, labels=labels, lg_dir=lg_dir)
        c += 1
    print("System executed in ", (time.time() - start) / 60, " minutes.")
def train(ink_dir, lg_dir):
    """
    Train the symbol-relationship classifier.

    For each .lg ground-truth file, rebuild every symbol's strokes from the
    matching .inkml file, compute pairwise geometric features for each "EO"
    (edge) record, fit a random forest, and persist it with joblib.

    :param ink_dir: directory of .inkml files
    :param lg_dir: directory of .lg ground-truth files
    :return: None (side effect: writes "relation_classifier_bonus.pkl")
    """
    lg_files = os.listdir(lg_dir)
    pre = Preprocessing()
    feature_matrix = []
    targets = []
    c = 0
    total = len(lg_files)
    for file in lg_files:
        print(file, total - c, c)
        symbols = {}
        # "O" lines declare objects: id, label, and the stroke ids.
        with open(lg_dir + "/" + file) as f:
            for line in f:
                if line.startswith("O"):
                    filt_line = line.strip().split(",")
                    symbols[filt_line[1].strip()] = [
                        filt_line[2], filt_line[4:]
                    ]
        inkml_file = file.replace(".lg", ".inkml")
        with open(ink_dir + "/" + inkml_file) as f:
            soup = bs.BeautifulSoup(f, 'html.parser')
        # Rebuild each symbol's stroke coordinates from the inkml traces.
        for key in symbols:
            label = symbols[key][0]
            strokes = symbols[key][1]
            id_list = []
            X = []
            Y = []
            for id in strokes:
                st_id = id.strip()
                trace = soup.findAll("trace", {'id': st_id})
                coords = trace[0].text.strip().split(",")
                x = []
                y = []
                for coord in coords:
                    trace_parts = coord.strip().split(' ')
                    x.append(float(trace_parts[0]))
                    y.append(float(trace_parts[1]))
                X.append(x)
                Y.append(y)
                id_list.append(st_id)
            X, Y = pre.dopreprocess(x=X, y=Y, parser=True)
            symbols[key] = Symbol(label=label, x=X, y=Y, stroke_id=id_list)
        # relations section: "EO" lines name a labelled symbol pair.
        with open(lg_dir + "/" + file) as f:
            for line in f:
                if line.startswith("EO"):
                    filt_line = line.strip().split(",")
                    sym1 = symbols[filt_line[1].strip()]
                    sym2 = symbols[filt_line[2].strip()]
                    relation = filt_line[3].strip()
                    writing_slope = sym1.writing_slope(sym2)
                    writing_curve = sym1.writing_curvature(sym2)
                    bb_dist = sym1.distance_between_box(sym2)
                    # FIX: corrected local-variable typo "ofsset" -> "offset".
                    distance, horizontal_offset, vertical_distance = sym1.distance_between_average_centres(
                        sym2)
                    max_point_pair = sym1.maximal_point_distance(sym2)
                    feature_matrix.append([
                        writing_slope, writing_curve, bb_dist, distance,
                        horizontal_offset, vertical_distance, max_point_pair
                    ])
                    targets.append(relation)
        c += 1
    print("Shape of Training matrix")
    print(len(feature_matrix), "x", len(feature_matrix[0]))
    print("Unique labels : ", np.unique(targets))
    rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    rf.fit(X=feature_matrix, y=targets)
    joblib.dump(rf, "relation_classifier_bonus.pkl", protocol=pickle.HIGHEST_PROTOCOL)
    # Reload the dumped model to confirm the pickle round-trips cleanly.
    rf = joblib.load("relation_classifier_bonus.pkl")
    # NOTE(review): accuracy below is measured on the training set itself,
    # so it overstates generalization — use a held-out split to evaluate.
    score = accuracy_score(y_true=targets, y_pred=rf.predict(feature_matrix), normalize=True)
    print("accuracy of model is :", (score * 100))