def main():
    # validation log
    p = get_absolute_path('search/log/16-05-23_23-46_classification.log')
    co1, y_in1, y_out1, nt1, nc1, md1, v1, cpu1, ids1 = parse_log(p)
    plot_accuracy(co1, y_in1, nt1, cpu1)

    # example of how to filter for a specific point in the graph
    label, wid = filter_results(y_in1, co1, nt1, ids1, a_filt='<0.1', n_filt='>10')

    # training log
    p = get_absolute_path('search/log/16-05-23_23-38_classification.log')
    co2, y_in2, y_out2, nt2, nc2, md2, v2, cpu2, ids2 = parse_log(p)
    plot_accuracy(co2, y_in2, nt2, cpu2)
    label, wid = filter_results(y_in2, co2, nt2, ids2, a_filt='<0.01', l_filt='>14')

    plot_accuracy(np.append(co1, co2), np.append(y_in1, y_in2),
                  np.append(nt1, nt2), np.append(cpu1, cpu2))
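# The a_filt / n_filt / l_filt arguments above are comparison strings such as
# '<0.1' or '>10'. The actual parsing lives in filter_results (not part of
# this excerpt); a minimal sketch of how such a string can be turned into a
# predicate, purely for illustration:
import operator

def parse_filter(filt):
    # first character selects the comparison, the remainder is the threshold
    ops = {'<': operator.lt, '>': operator.gt}
    op, threshold = ops[filt[0]], float(filt[1:])
    return lambda value: op(value, threshold)

accept = parse_filter('<0.1')
assert accept(0.05) and not accept(0.2)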
def __init__(self, word):
    self.config = get_config()
    self._document = Document()
    self.html = self._document.createElement('html')
    self.head = self._document.createElement('head')
    self.body = self._document.createElement('body')
    self.html.appendChild(self.head)
    self.html.appendChild(self.body)

    title = self._document.createElement('title')
    title.appendChild(self._document.createTextNode(word))
    self.head.appendChild(title)

    css = self._document.createElement('link')
    css.setAttribute('rel', 'stylesheet')
    css.setAttribute('type', 'text/css')
    css.setAttribute('href', get_absolute_path(self.config.get('KWS.search', 'css')))
    # each element needs its own empty text node: minidom re-parents a node on
    # appendChild, so sharing one node would remove it from the first element
    css.appendChild(self._document.createTextNode(''))
    self.head.appendChild(css)

    js = self._document.createElement('script')
    js.setAttribute('type', 'text/javascript')
    js.setAttribute('src', get_absolute_path(self.config.get('KWS.search', 'js')))
    js.appendChild(self._document.createTextNode(''))
    self.head.appendChild(js)
def add_image_by_id(self, img_id, word_ids=None, all_tooltips=True):
    img_dir = get_absolute_path(self.config.get('KWS', 'images'))
    img_src = os.path.join(img_dir, img_id + '.jpg')
    svg_dir = get_absolute_path(self.config.get('KWS', 'locations'))
    svg_src = os.path.join(svg_dir, img_id + '.svg')
    self.add_image(img_src, svg_src, words=word_ids, img_id=img_id,
                   all_tooltips=all_tooltips)
def run():
    # Get the parameters from the config file
    config = fio.get_config()
    kernels = str(config.get('SVM', 'kernels')).split(',')
    train_n = int(config.get('MNIST.sample.size', 'training'))
    test_n = int(config.get('MNIST.sample.size', 'testing'))

    # Read the data
    y_train, x_train = fio.parse_mnist(
        get_absolute_path(config.get('MNIST', 'trainingset')), numlines=train_n)
    train_n = y_train.shape[0]
    y_test, x_test = fio.parse_mnist(
        get_absolute_path(config.get('MNIST', 'testset')), numlines=test_n)
    test_n = y_test.shape[0]

    print('SVM on MNIST dataset')
    print('  training set:')
    print('    # samples: %s' % train_n)
    print('    # classes: %s' % len(set(y_train)))
    print('  test set:')
    print('    # samples: %s' % test_n)
    print('    # classes: %s' % len(set(y_test)))

    # Test the different kernels
    scores = []
    for kernel in kernels:
        print('  kernel: %s' % kernel)
        score = test_kernel(x_train, y_train, x_test, y_test, kernel)
        print('    training score: %s' % score[0])
        print('    training cross-validation: %s' % score[1])
        print('    test score: %s' % score[2])
        print('    test cross-validation: %s' % score[3])
        scores.append(score)

    # Plot the results
    df = DataFrame(np.array(scores).transpose(), columns=kernels)
    ax = df.plot.bar()
    ax.set_xticklabels(['train: score', 'train: cross-val.',
                        'test: score', 'test: cross-val.'], rotation=0)
    ax.set_title('SVM classification (N-training = %s, N-test = %s)' % (train_n, test_n))
    ax.grid()
    ax.grid(which='minor')
    fig = ax.get_figure()
    path = fio.get_plot_file('SVM-scores')
    fig.savefig(path)
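# test_kernel is not part of this excerpt. A minimal sketch of what a function
# with this signature could look like, assuming it fits an SVC with the given
# kernel and returns (train score, train cross-val., test score,
# test cross-val.) to match the four values printed above -- an assumption,
# not the project's confirmed implementation:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def test_kernel_sketch(x_train, y_train, x_test, y_test, kernel):
    clf = SVC(kernel=kernel)
    clf.fit(x_train, y_train)
    return (clf.score(x_train, y_train),
            cross_val_score(clf, x_train, y_train, cv=3).mean(),
            clf.score(x_test, y_test),
            cross_val_score(clf, x_test, y_test, cv=3).mean())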
def main():
    config = fio.get_config()

    # Load the training set.
    csv_train_set_data = fio.import_csv_data(
        fio.get_absolute_path(config.get('MNIST', 'trainingset')))
    train_set_labels, train_set_data = fio.split_labels_data(csv_train_set_data, 0)
    # Rescale to [0, 1].
    train_set_data = train_set_data / 255.
    print("Train data length: %i" % len(train_set_data))

    # Load the test set.
    csv_test_set_data = fio.import_csv_data(
        fio.get_absolute_path(config.get('MNIST', 'testset')))
    print("Test data length: %i" % len(csv_test_set_data))
    test_set_labels, test_set_data = fio.split_labels_data(csv_test_set_data, 0)
    # Rescale to [0, 1].
    test_set_data = test_set_data / 255.

    # One hidden layer sized to 10% of the number of training samples.
    # Note: hidden_layer_sizes entries must be ints, and the 'algorithm'
    # parameter was renamed to 'solver' in scikit-learn 0.18.
    mlp = MLPClassifier(hidden_layer_sizes=(int(len(train_set_data) * 0.1),),
                        max_iter=30,
                        alpha=1e-4,
                        algorithm='sgd',
                        verbose=10,
                        tol=1e-4,
                        random_state=1,
                        learning_rate_init=.1)

    # Fit the scaler on the training data only and reuse it for the other sets,
    # so that all sets go through the same transformation.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(train_set_data)
    mlp.fit(X, train_set_labels)
    print("Training set score: %f" % mlp.score(X, train_set_labels))
    print("Training set loss: %f" % mlp.loss_)
    print("Test set score: %f" % mlp.score(scaler.transform(test_set_data),
                                           test_set_labels))

    # Load the evaluation set.
    evaluation_set_data = fio.import_csv_data(
        fio.get_absolute_path(config.get('Evaluation.SVM', 'mnist')))
    print("Evaluation data length: %i" % len(evaluation_set_data))
    # Rescale to [0, 1].
    evaluation_set_data = evaluation_set_data / 255.

    predictions = mlp.predict(scaler.transform(evaluation_set_data))
    export_predictions(predictions)
def main():
    config = get_config()
    kernels = str(config.get('SVM', 'kernels')).split(',')

    outdir = get_absolute_path(config.get('Evaluation', 'output'))
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    dat = parse()
    for param in kernels:
        name = 'svn_' + param + '_ts-26999'
        path = get_classifier_file(name)
        print('loading classifier %s' % name)
        clf = joblib.load(path)
        print('\tpredict...')
        lbls = clf.predict(dat)
        print('\twrite output...')
        filename = os.path.join(outdir, 'svm_' + param + '.csv')
        with open(filename, 'w+') as handle:
            for lbl in lbls:
                handle.write('%i\n' % lbl)
def load_train_and_valid(self):
    """
    Convenience method to load the training and validation data according
    to the specifications in the config.ini
    """
    dataset = DataSet.parse()

    # get the document ids of the validation subset
    config = get_config()
    tp = get_absolute_path(config.get('KWS', 'testing'))
    dids = []
    for line in open(tp, 'r'):
        did = line.strip()
        if len(did) == 3:
            dids.append(did)

    # create the boolean index for the validation samples
    index = np.array([dids.count(x.doc_id) == 1 for x in dataset.coords], dtype=bool)

    # put the data in memory
    self.valid = DataSet(dataset.Y[index], dataset.imgs[index],
                         dataset.X[index], dataset.coords[index])
    index = ~index  # the training set is the complement of the validation set
    self.train = DataSet(dataset.Y[index], dataset.imgs[index],
                         dataset.X[index], dataset.coords[index])
def parse():
    config = get_config()
    datapath = get_absolute_path(config.get('Evaluation.SVM', 'mnist'))
    dat = []
    for line in open(datapath, 'r'):
        parts = line.strip().split(',')
        dat.append([int(x) for x in parts])
    return np.array(dat)
def write_images():
    conf = get_config()
    odir = os.path.join(get_absolute_path(conf.get('Evaluation', 'output')), 'digits')
    if not os.path.exists(odir):
        os.mkdir(odir)

    vecs = parse()
    for i, vec in enumerate(vecs):
        # each flat pixel vector is a square image (e.g. 784 -> 28 x 28);
        # reshape needs an int, so truncate the square root
        d = int(np.math.sqrt(vec.shape[0]))
        r = vec.reshape([d, d])
        fn = os.path.join(odir, '%05d.jpg' % (i + 1))
        imsave(fn, r)
def parse(filename=None, items=None, id_filter=None):
    print('parse features')
    config = get_config()
    if filename is None:
        fmp = get_absolute_path(config.get('KWS.features', 'file'))
    else:
        fmp = get_absolute_path(filename)
    ids, imgs, mats = parse_feature_map(fmp, items=items, id_filter=id_filter)

    print('parse transcription')
    trans = get_transcription()
    words = []
    coords = []
    for coord in ids:
        word = get_word(coord, data=trans)
        words.append(str(word))
        coords.append(WordCoord(coord))

    return DataSet(np.array(words), imgs, mats, np.array(coords))
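# Hypothetical usage sketch (the field names Y, imgs, X, coords are the ones
# used in load_train_and_valid above): restrict the parsed data set to a
# single document, e.g. '270'.
dataset = parse()
index = np.array([x.doc_id == '270' for x in dataset.coords], dtype=bool)
doc270 = DataSet(dataset.Y[index], dataset.imgs[index],
                 dataset.X[index], dataset.coords[index])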
def get_transcription(did=None):
    config = get_config()
    trans = []
    path = get_absolute_path(config.get('KWS', 'transcription'))
    for line in open(path):
        parts = line.strip().split(' ')
        coord = WordCoord(parts[0])
        if not did or coord.get_doc() == did:
            trans.append((coord, Word(parts[1])))
    return sorted(trans, key=lambda x: str(x[0]))
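# The parser above expects one word per line: a word coordinate and the
# encoded word separated by a single space, e.g. (example values assumed):
#
#   270-01-01 o-r-d-e-r-s
#
trans = get_transcription('270')   # entries of document 270 only, sorted
coord, word = trans[0]             # each entry is a (WordCoord, Word) tuple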
def create_log(self):
    config = get_config()
    cp = get_absolute_path(config.get('KWS.classifier', 'file'))
    cd = os.path.dirname(cp)
    if not os.path.exists(cd):
        os.mkdir(cd)

    # timestamped log file next to the classifier file
    self._log = os.path.join(
        cd, datetime.now().strftime('%y-%m-%d_%H-%M_') + os.path.basename(cp))
    msg = 'Testing\nk=%i\nvertical tolerance=%i\nhorizontal tolerance=%i\n' \
          '# training samples: %i\n' % \
          (self._k, self._tol_v, self._tol_h, self.train.N)
    print(msg, end='')
    with open(self._log, 'w+') as f:
        f.write(msg)
def test(word, all_tooltips=True):
    """
    Finds all locations of a word. Optionally the locations can be saved
    in an HTML file with the same name as the word.
    """
    locations = ['270-01-01', '274-01-01', '270-02-02', '273-01-01',
                 '273-02-02', '273-05-01', '270-09-01']
    outputfile = get_absolute_path('search/vis/' + word + '.html')
    display_all_occurences(word, locations, output=outputfile,
                           all_tooltips=all_tooltips)
    webbrowser.open('file://' + outputfile)
def search_word(self, word):
    """
    Search for a word and display the results as HTML in the default
    web browser.
    """
    if self.index is None:
        self.create_index()

    if word not in self.index:
        print('"%s" was not found' % word)
        return

    d = get_absolute_path('search/vis/')
    if not os.path.exists(d):
        os.mkdir(d)

    outputfile = os.path.join(d, word + '.html')
    display_all_occurences(word, self.index[word], output=outputfile)
    print('searched "%s", found %i occurrences' % (word, len(self.index[word])))
    webbrowser.open('file://' + outputfile)
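# Hypothetical usage, assuming the enclosing class is exported as
# `WordSearcher` (the class name is not shown in this excerpt):
#
#     searcher = WordSearcher()
#     searcher.search_word('October')   # builds the index on first use,
#                                       # then opens search/vis/October.html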
def main(imgpath=None, svgpath=None, outputfile=None, retake=True, saveimgs=True):
    print('Word pre-processing')
    config = get_config()

    # create the output file, or collect the already processed word ids
    if outputfile is None:
        txtp = get_absolute_path(config.get('KWS.features', 'file'))
    else:
        txtp = get_absolute_path(outputfile)

    processed = []
    if retake and os.path.exists(txtp):
        takenext = False
        for line in open(txtp, 'r'):
            line = line.strip()
            if takenext and (len(line) >= 9):
                processed.append(line)
                takenext = False
            elif line == "###":
                takenext = True
    else:
        handle = open(txtp, 'w+')
        for param, value in config.items('KWS.prepro'):
            handle.write('%s: %s%s' % (param, value, os.linesep))
        for param, value in config.items('KWS.features'):
            handle.write('%s: %s%s' % (param, value, os.linesep))
        handle.write('###' + os.linesep)
        handle.close()

    # get the data
    if svgpath is None:
        svgd = get_absolute_path(config.get('KWS', 'locations'))
    else:
        svgd = get_absolute_path(svgpath)
    svgs = glob(os.path.join(svgd, '*.svg'))

    if imgpath is None:
        imgd = get_absolute_path(config.get('KWS', 'images'))
    else:
        imgd = get_absolute_path(imgpath)
    imgs = glob(os.path.join(imgd, '*.jpg'))

    # parse the parameters
    threshold = float(config.get('KWS.prepro', 'segmentation_threshold'))
    relative_height = float(config.get('KWS.prepro', 'relative_height'))
    skew_resolution = float(config.get('KWS.prepro', 'angular_resolution'))
    primary_peak_height = float(config.get('KWS.prepro', 'primary_peak_height'))
    secondary_peak_height = float(config.get('KWS.prepro', 'secondary_peak_height'))
    window_width = int(config.get('KWS.features', 'window_width'))
    step_size = int(config.get('KWS.features', 'step_size'))
    blocks = int(config.get('KWS.features', 'number_of_blocks'))

    svgs.sort()
    imgs.sort()
    for svgp, imgp in zip(svgs, imgs):
        svgid = os.path.basename(svgp).replace('.svg', '')
        imgid = os.path.basename(imgp).replace('.jpg', '')
        print('\t%s\n\t%s' % (svgp, imgp))
        if svgid != imgid:
            raise IOError('the ids of the image file (%s) and the svg file (%s) '
                          'are not the same' % (imgid, svgid))

        trans = get_transcription(svgid)
        print('\tdoc id: %s' % svgid)
        wids, paths = parse_svg(svgp)
        img = imread(imgp)

        for wid, path in zip(wids, paths):
            print('\tword id: %s' % wid)
            if retake and (processed.count(wid) == 1):
                print('\talready processed')
                continue

            # look up the corresponding word
            if saveimgs:
                imgfile = wid
                word = get_word(wid, data=trans)
                if word is not None:
                    imgfile = word.code2string() + '_' + imgfile
            else:
                imgfile = None

            # crop, pre-process and featurize the word image
            poly = path2polygon(path)
            roi = crop(img, poly)
            pre, sym = word_preprocessor(roi,
                                         threshold=threshold,
                                         rel_height=relative_height,
                                         skew_res=skew_resolution,
                                         ppw=primary_peak_height,
                                         spw=secondary_peak_height,
                                         save=imgfile)
            if isinstance(pre, str):
                print('\tpre-processing failed\n\t\t%s' % pre)
                continue

            fea = compute_features(pre, window_width=window_width,
                                   step_size=step_size, blocks=blocks)
            write_word_features(txtp, wid, fea, [pre.shape[0], pre.shape[1], sym])
    print('...')
import os

import numpy as np

from ip import doc_processor
from search.kws import DataSet
from utils.fio import get_config, get_absolute_path
from utils.transcription import Word, WordCoord
from dtwextension import dtwdistance

config = get_config()

# compute the feature map of the evaluation images
doc_processor.main(imgpath=config.get('Evaluation.KWS', 'images'),
                   svgpath=config.get('Evaluation.KWS', 'svg'),
                   outputfile=config.get('Evaluation.KWS', 'feature-map'))

# parse the keywords
kwp = get_absolute_path(config.get('Evaluation.KWS', 'keywords'))
words = []
coords = []
for line in open(kwp, 'r'):
    parts = line.strip().split(',')
    words.append(Word(parts[0].strip()))
    coords.append(WordCoord(parts[1].strip()))

# parse the feature maps
train = DataSet.parse(get_absolute_path(config.get('KWS.features', 'file')))
evalu = DataSet.parse(get_absolute_path(config.get('Evaluation.KWS', 'feature-map')))

outputfile = os.path.join(get_absolute_path(config.get('Evaluation', 'output')),
                          'kws-dists.csv')
handle = open(outputfile, 'w+')

for coord, word in zip(coords, words):
    index = np.array([x.id == coord.id for x in train.coords], dtype=bool)
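    # The rest of the loop body is not part of this excerpt. A hypothetical
    # sketch of how the opened csv handle and dtwdistance could be used,
    # assuming dtwdistance(a, b) returns a scalar distance between two
    # feature matrices (an assumption, not the project's confirmed code):
    query = train.X[index][0]                         # keyword instance features
    dists = [dtwdistance(query, x) for x in evalu.X]  # distance to every eval word
    handle.write('%s,%s\n' % (str(word), ','.join('%.6f' % d for d in dists)))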
# The `continue` below implies this block runs inside a loop over word
# coordinate strings; the loop header is not part of this excerpt.
for c in coordinates:  # `coordinates` is assumed, not shown here
    wordcoord = WordCoord(c)
    roi = get_image_roi(wordcoord)
    if roi.shape[0] < (primary_peak_height * 0.5):
        print('\tskipping (image height is only %s)' % roi.shape[0])
        continue

    pre, sym = word_preprocessor(roi,
                                 threshold=threshold,
                                 rel_height=relative_height,
                                 skew_res=skew_resolution,
                                 ppw=primary_peak_height,
                                 spw=secondary_peak_height,
                                 show=False)
    # recompute the symmetry with the plot enabled (overwrites sym)
    sym = word_symmetry(pre, ppw=primary_peak_height,
                        spw=secondary_peak_height, show=True)
    fea = compute_features(pre, window_width=window_width,
                           step_size=step_size, blocks=blocks)

# Read the feature map back in (same return order as in DataSet.parse)
fpath = get_absolute_path('ip/feature-map.txt')
ids, imgs, mats = parse_feature_map(fpath)
print('')