Example 1
    def __init__(self, word):
        self.config = get_config()
        self._document = Document()
        self.html = self._document.createElement('html')
        self.head = self._document.createElement('head')
        self.body = self._document.createElement('body')
        self.html.appendChild(self.head)
        self.html.appendChild(self.body)
        title = self._document.createElement('title')
        title.appendChild(self._document.createTextNode(word))
        self.head.appendChild(title)
        css = self._document.createElement('link')
        css.setAttribute('rel', 'stylesheet')
        css.setAttribute('type', 'text/css')
        css.setAttribute(
            'href', get_absolute_path(self.config.get('KWS.search', 'css')))
        # each element needs its own empty text node: a DOM node has a single
        # parent, so appending one shared node would move it between elements
        css.appendChild(self._document.createTextNode(''))
        self.head.appendChild(css)
        js = self._document.createElement('script')
        js.setAttribute('type', 'text/javascript')
        js.setAttribute('src',
                        get_absolute_path(self.config.get('KWS.search', 'js')))
        js.appendChild(self._document.createTextNode(''))
        self.head.appendChild(js)
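
The empty text nodes above keep minidom from collapsing <link> and <script> into self-closing tags when the document is serialized. A minimal, runnable sketch of the same xml.dom.minidom pattern (the element content here is illustrative, not taken from the original class):

from xml.dom.minidom import Document

doc = Document()
html = doc.createElement('html')
doc.appendChild(html)
head = doc.createElement('head')
html.appendChild(head)
title = doc.createElement('title')
title.appendChild(doc.createTextNode('example'))
head.appendChild(title)
script = doc.createElement('script')
script.appendChild(doc.createTextNode(''))  # forces <script></script> instead of <script/>
head.appendChild(script)
print(doc.toprettyxml(indent='  '))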
Example 2
    def load_train_and_valid(self):
        """
        Convenience method to load the training and validation data accoring
        to the specifications in the config.ini
        """
        dataset = DataSet.parse()
        # self.parse()

        # read the validation document ids listed in the testing file
        config = get_config()
        tp = get_absolute_path(config.get('KWS', 'testing'))
        dids = []
        for line in open(tp, 'r'):
            did = line.strip()
            if len(did) == 3:
                dids.append(did)

        # create the index for the features
        index = np.array([dids.count(x.doc_id) == 1 for x in dataset.coords],
                         dtype=bool)

        # Put the data in memory
        self.valid = DataSet(dataset.Y[index], dataset.imgs[index],
                             dataset.X[index], dataset.coords[index])
        index = ~index  # (tested if the valid doc ids are indeed the complement)
        self.train = DataSet(dataset.Y[index], dataset.imgs[index],
                             dataset.X[index], dataset.coords[index])
def main():
    config = get_config()
    kernels = str(config.get('SVM', 'kernels')).split(',')

    outdir = get_absolute_path(config.get('Evaluation', 'output'))
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    dat = parse()

    for param in kernels:
        name = 'svn_' + param + '_ts-26999'
        path = get_classifier_file(name)
        print('loading classifier %s' % name)
        clf = joblib.load(path)

        print('\tpredict...')
        lbls = clf.predict(dat)

        print('\twrite output...')
        filename = os.path.join(outdir, 'svm_' + param + '.csv')
        with open(filename, 'w') as handle:
            for lbl in lbls:
                handle.write('%i\n' % lbl)
Example 5
def save_word_image(img, wid):
    config = get_config()
    plotdir = os.path.join(get_project_root_directory(),
                           config.get('Plots', 'directory'))
    plotfile = os.path.join(plotdir, wid + '.png')
    # TODO: there is a warning about precision loss (convert from float64 to uint16)
    imsave(plotfile, img)
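
The TODO refers to the warning imsave emits when it converts a float image on its own. A hedged workaround, assuming img is a float array of arbitrary range, is to rescale and cast explicitly before saving:

import numpy as np

def to_uint8(img):
    # rescale to [0, 255] and cast, so imsave receives an integer image
    img = np.asarray(img, dtype=np.float64)
    rng = img.max() - img.min()
    if rng == 0:
        return np.zeros(img.shape, dtype=np.uint8)
    return ((img - img.min()) / rng * 255).astype(np.uint8)

# usage: imsave(plotfile, to_uint8(img))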
def parse():
    config = get_config()
    datapath = get_absolute_path(config.get('Evaluation.SVM', 'mnist'))
    dat = []
    with open(datapath, 'r') as handle:
        for line in handle:
            # each line is a comma-separated row of integer pixel values
            dat.append([int(x) for x in line.strip().split(',')])

    return np.array(dat)
Example 8
def main():
    print("%s | Start running molecules_pipeline." % asctime())
    print("================================================================================")
    start = timer()

    #mol16 = molecule.Molecule("16")
    #print(mol16.get_id())
    #print("# nodes: %d" % (len(mol16.get_nodes())))
    #print("# edges: %d" % (len(mol16.get_edges())))

    #node1 = mol16.get_nodes()[0]
    #print("Node '%s' outdegree: %d" % (node1, node1.get_outdegree()))
    #print("Node '%s' indegree: %d" % (node1, node1.get_indegree()))

    #mol40 = molecule.Molecule("40")
    #print(mol40.get_id())

    #cost_matrix = bipartite_graph.build_cost_matrix(mol16, mol40)
    #print(cost_matrix)

    #row_ind, col_ind = bipartite_graph.get_optimal_assignment(cost_matrix)
    #print("%s; %s" % (row_ind, col_ind))

    #lsa_cost = bipartite_graph.get_assignment_cost(cost_matrix, row_ind, col_ind)
    #print(lsa_cost)

    config = fio.get_config()
    train_path = config.get('molecules', 'training')
    train_molecules, train_target_values = load_data(train_path)

    #test_path = config.get('molecules', 'testing')
    #test_molecules, test_target_values = load_data(test_path)
    #test_element = test_molecules[0]

    #knn_classifier = bipartite_graph.get_knn_classifier(train_molecules, train_target_values)
    #knn_result = bipartite_graph.get_k_nearest_neighbors(knn_classifier, test_element)

    #knn_result = bipartite_graph.get_k_nearest_neighbors(train_molecules, test_element)
    #label = bipartite_graph.determine_most_frequent_label(knn_result)
    #print("Predicted label: %s" % label)

    evaluation_path = config.get('molecules', 'evaluation')
    evaluation_molecules = load_evaluation_data(evaluation_path)

    run_evaluation(evaluation_molecules, train_molecules, 3)

    end = timer()
    print("================================================================================")
    print("Duration: %f" % (end - start))
    print("%s | End of molecules_pipeline." % asctime())
Example 9
    def __init__(self):
        config = get_config()
        self._k = int(config.get('KWS.classifier', 'k'))
        self._tol_v = int(config.get('KWS.classifier', 'tol_ver'))
        self._tol_h = int(config.get('KWS.classifier', 'tol_hor'))
        self._d = None
        self.train = None
        self.valid = None
        self.data = None
        self.index = None
        self._log = None
Example 11
def export_predictions(predictions):
    config = fio.get_config()
    file_path = config.get('molecules', 'root')
    file_path += "/molecules_result.csv"
    data = []
    for molecule, label in predictions:
        data.append([molecule.get_file_number(), label])
    data.sort(key=operator.itemgetter(0))
    fio.export_csv_data(file_path, data)
Example 12
    def classify(self, mat, coord, img=None):
        """
        classify a single sample (mat).
        mat KxN is a feature matrix from an image. N features computed for K windows.
        If the tuple img (image height, image width, rank) is defined,
        the distance computation will be constrained by the image parameter.
        """
        t0 = time.time()

        config = get_config()
        prune = config.get('KWS.classifier', 'prune') == 'True'

        if img is None or not prune:
            x = self.train.X
            y = self.train.Y
            c = np.array([x.id for x in self.train.coords])
        else:
            # subset index
            h_min = img[0] - self._tol_v
            h_max = img[0] + self._tol_v
            w_min = img[1] - self._tol_h
            w_max = img[1] + self._tol_h

            i = ((self.train.h >= h_min) & (h_max >= self.train.h)) & \
                (self.train.w >= w_min) & (w_max >= self.train.w) & \
                (self.train.rank == img[2])
            # x = np.append([mat], self.train.X[i])
            x = self.train.X[i]
            y = self.train.Y[i]
            c = np.array([x.id for x in self.train.coords[i]])

        # default output ...
        lbl = '???'
        md = -1
        cnt = -1
        nc = x.shape[0]

        # ... because there are a lot of ways to fail
        if nc > 0:
            not_itself = coord != c
            # print(sum(~not_itself))
            x = x[not_itself]
            y = y[not_itself]
            nc = x.shape[0]
            if nc > 0:
                d = np.zeros((nc, ))
                for i, m in enumerate(x):
                    # d[i], _ = fastdtw(mat, m, dist=euclidean)
                    d[i] = dtwdistance(mat, m)

                lbl, md, cnt = self.vote(d, y)

        return lbl, nc, md, cnt, time.time() - t0
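
The vote method is not listed in this example. Below is a minimal sketch of a k-nearest-neighbour vote over the precomputed DTW distances; the tie-breaking and the exact return values are assumptions, chosen only to match the (label, distance, count) triple consumed above:

import numpy as np
from collections import Counter

def vote(d, y, k=5):
    # let the labels of the k smallest distances vote
    nearest = np.argsort(d)[:min(k, d.shape[0])]
    labels = y[nearest]
    label, cnt = Counter(labels).most_common(1)[0]
    # median distance of the winning label among the k neighbours
    md = float(np.median(d[nearest][labels == label]))
    return label, md, cnt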
def write_images():
    conf = get_config()
    odir = os.path.join(get_absolute_path(conf.get('Evaluation', 'output')), 'digits')
    if not os.path.exists(odir):
        os.mkdir(odir)

    vecs = parse()
    for i, vec in enumerate(vecs):
        # the digit images are square: d x d pixels
        d = int(np.sqrt(vec.shape[0]))
        r = vec.reshape([d, d])
        n = i + 1
        fn = os.path.join(odir, '%05d.jpg' % n)
        imsave(fn, r)
Example 15
def main():
    config = fio.get_config()
    # print("Config sections: %s" % config.sections())

    # Load train set.
    csv_train_set_data = fio.import_csv_data(
        fio.get_absolute_path(config.get('MNIST', 'trainingset')))
    #print("CSV train data length: %i" % len(csv_train_set_data))
    #train_set_sample_data = fio.get_random_data_sample(csv_train_set_data, 2699) # Just load 10% random data while developing.
    train_set_labels, train_set_data = fio.split_labels_data(
        csv_train_set_data, 0)
    # Rescale pixel values from [0, 255] to [0, 1].
    train_set_data = train_set_data / 255.
    print("Train data length: %i" % len(train_set_data))

    # Load test set.
    csv_test_set_data = fio.import_csv_data(
        fio.get_absolute_path(config.get('MNIST', 'testset')))
    print("Test data length: %i" % len(csv_test_set_data))
    #test_set_sample_data = fio.get_random_data_sample(csv_test_set_data, 1501) # Just load 10% random data while developing.
    test_set_labels, test_set_data = fio.split_labels_data(
        csv_test_set_data, 0)
    # Rescale.
    test_set_data = test_set_data / 255.

    ## mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
    ##                     algorithm='sgd', verbose=10, tol=1e-4, random_state=1)
    # hidden_layer_sizes must contain integers: one hidden layer with a tenth
    # as many units as there are training samples.
    mlp = MLPClassifier(hidden_layer_sizes=(int(len(train_set_data) * 0.1), ),
                        max_iter=30,
                        alpha=1e-4,
                        solver='sgd',  # named 'algorithm' in scikit-learn < 0.18
                        verbose=10,
                        tol=1e-4,
                        random_state=1,
                        learning_rate_init=.1)
    # Fit the scaler on the training data only and reuse it for the test and
    # evaluation sets, so all inputs share the same scaling.
    scaler = MinMaxScaler().fit(train_set_data)
    X = scaler.transform(train_set_data)
    mlp.fit(X, train_set_labels)

    print("Training set score: %f" % mlp.score(X, train_set_labels))
    print("Training set loss: %f" % mlp.loss_)
    print("Test set score: %f" %
          mlp.score(scaler.transform(test_set_data), test_set_labels))

    # Load evaluation set.
    evaluation_set_data = fio.import_csv_data(
        fio.get_absolute_path(config.get('Evaluation.SVM', 'mnist')))
    print("Evaluation data length: %i" % len(evaluation_set_data))
    # Rescale.
    evaluation_set_data = evaluation_set_data / 255.

    predictions = mlp.predict(scaler.transform(evaluation_set_data))
    export_predictions(predictions)
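
A scikit-learn Pipeline keeps the scaler and the classifier coupled, so the scaling fitted on the training data is reapplied transparently in score() and predict(). This is a sketch of the idea, not the original code, and the layer size is arbitrary:

from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

pipe = make_pipeline(
    MinMaxScaler(),
    MLPClassifier(hidden_layer_sizes=(100,), max_iter=30, random_state=1))
# pipe.fit(train_set_data, train_set_labels)
# print(pipe.score(test_set_data, test_set_labels))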
def run():
    # Get parameter from the config file
    config = fio.get_config()
    kernels = str(config.get('SVM', 'kernels')).split(',')
    train_n = int(config.get('MNIST.sample.size', 'training'))
    test_n = int(config.get('MNIST.sample.size', 'testing'))

    # Read the data
    y_train, x_train = fio.parse_mnist(get_absolute_path(
        config.get('MNIST', 'trainingset')),
                                       numlines=train_n)
    train_n = y_train.shape[0]
    y_test, x_test = fio.parse_mnist(get_absolute_path(
        config.get('MNIST', 'testset')),
                                     numlines=test_n)
    test_n = y_test.shape[0]

    print('SVM on MNIST dataset')
    print('   training set:')
    print('      # samples %s' % train_n)
    print('      # classes: %s' % len(set(y_train)))
    print('   test set:')
    print('      # testing samples %s' % test_n)
    print('      # classes %s' % len(set(y_test)))

    # Test different kernels
    scores = []
    for kernel in kernels:
        print('   kernel: %s' % kernel)
        score = test_kernel(x_train, y_train, x_test, y_test, kernel)
        print('      training score: %s' % score[0])
        print('      training cross validation: %s' % score[1])
        print('      test score: %s' % score[2])
        print('      test cross validation: %s' % score[3])
        scores.append(score)

    # plot the results
    df = DataFrame(np.array(scores).transpose(), columns=kernels)
    ax = df.plot.bar()
    ax.set_xticklabels([
        'train score', 'train: cross-val.', 'test: score', 'test: cross-val.'
    ],
                       rotation=0)
    ax.set_title('SVM classification (N-training = %s, N-test = %s)' %
                 (train_n, test_n))
    ax.grid()
    ax.grid(which='minor')
    # ax.legend(loc=1)
    fig = ax.get_figure()
    path = fio.get_plot_file('SVM-scores')
    fig.savefig(path)
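
test_kernel is referenced but not listed here. A rough sketch of what it might compute, judging only from the four scores consumed above; this is an assumption, not the project's implementation:

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

def test_kernel(x_train, y_train, x_test, y_test, kernel):
    clf = SVC(kernel=kernel)
    clf.fit(x_train, y_train)
    train_cv = np.mean(cross_val_score(clf, x_train, y_train, cv=3))
    test_cv = np.mean(cross_val_score(clf, x_test, y_test, cv=3))
    return (clf.score(x_train, y_train), train_cv,
            clf.score(x_test, y_test), test_cv)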
def get_transcription(did=None):
    config = get_config()
    trans = []
    path = get_absolute_path(config.get('KWS', 'transcription'))
    for line in open(path):
        parts = line.strip().split(' ')
        coord = WordCoord(parts[0])

        if not did or coord.get_doc() == did:
            trans.append((coord, Word(parts[1])))

    trans = sorted(trans, key=lambda x: str(x[0]))

    return trans
Example 19
    def create_log(self):
        config = get_config()
        cp = get_absolute_path(config.get('KWS.classifier', 'file'))
        cd = os.path.dirname(cp)
        if not os.path.exists(cd):
            os.mkdir(cd)

        self._log = os.path.join(
            cd,
            datetime.now().strftime('%y-%m-%d_%H-%M_') + os.path.basename(cp))
        msg = 'Testing\nk=%i\nvertical tolerance=%i\nhorizontal tolerance=%i\n# training samples: %i\n' % \
              (self._k, self._tol_v, self._tol_h, self.train.N)
        print(msg, end='')
        with open(self._log, 'w+') as f:
            f.write(msg)
def main():
    """
    collection of examples to use the different modules and functions
    """

    # Get the configurations
    config = fio.get_config()
    print('Config sections: %s\n' % config.sections())

    # Read KWS transcription
    trans = get_transcription()
    c = trans[0][0]
    s = trans[0][1]
    print(
        'Transcription code:\n\tid: %s\tdoc-id: %s\tline-id: %s\tword-id:%s\n\tcode: %s\tstring: %s\n'
        % (c, c.get_doc(), c.get_line(), c.get_word(), s.get_word_code(), s))

    # Read text data
    print('Reading MNIST data')
    y, x = fio.parse_mnist(config.get('MNIST', 'testset'), 100)
    print('   parsed %s lines\n' % x.shape[0])

    print('Read csv data')
    csv_data = fio.import_csv_data(config.get('MNIST', 'testset'))
    labels, data = fio.split_labels_data(csv_data, 0)
    print('   Labels length: %i' % len(labels))
    print('   Data length: %i' % len(data))
    sample_data = fio.get_random_data_sample(data, 100)
    print('   Sample data length: %i' % len(sample_data))
    print('   Sample data type: %s\n' % type(sample_data))

    # Get a path for a (internal) plot file
    pp = fio.get_plot_file('test')
    print('Get a plot path:\n\t%s\n' % pp)

    # SVM test
    test_svm.run()

    # MLP test
    mlp_main.main()
Example 25
    def parse(filename=None, items=None, id_filter=None):
        print('parse features')
        config = get_config()

        if filename is None:
            fmp = get_absolute_path(config.get('KWS.features', 'file'))
        else:
            fmp = get_absolute_path(filename)

        ids, imgs, mats = parse_feature_map(fmp,
                                            items=items,
                                            id_filter=id_filter)

        print('parse transcription')
        trans = get_transcription()
        words = []
        coords = []
        for coord in ids:
            word = get_word(coord, data=trans)
            words.append(str(word))
            coords.append(WordCoord(coord))

        return DataSet(np.array(words), imgs, mats, np.array(coords))
Example 26
def compute_central_heights():
    config = get_config()
    rh = float(config.get('KWS.prepro', 'relative_height'))
    trc = get_transcription()
    wh = []
    for coord, word in trc:
        roi = get_image_roi(coord)
        msk = create_word_mask(roi)
        img = np.copy(roi)
        img = img.max() - img
        img[msk < 1] = 0
        y = img.sum(1)
        y_max = y.max()
        y[y < y_max * rh] = 0
        nz = y.nonzero()
        wh.append(nz[0].shape[0])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(wh, 30, density=True)  # 'normed' was removed in newer matplotlib
    plt.xlabel('central peak heights of vertical word projection')
    plt.show()

    return np.array(wh)
Example 27
def export_predictions(predictions):
    config = fio.get_config()  # note: config is not used; the output path is hard-coded
    file_path = "./evaluation/mnist_mlp_result.csv"
    fio.export_csv_data(file_path, predictions)
Example 29
def main(imgpath=None,
         svgpath=None,
         outputfile=None,
         retake=True,
         saveimgs=True):
    print('Word pre-processing')
    config = get_config()

    # create an output file
    if outputfile is None:
        txtp = get_absolute_path(config.get('KWS.features', 'file'))
    else:
        txtp = get_absolute_path(os.path.join(outputfile))

    processed = []
    if retake and os.path.exists(txtp):
        takenext = False
        for line in open(txtp, 'r'):
            line = line.strip()
            if takenext and (len(line) >= 9):
                processed.append(line.strip())
                takenext = False
            elif line == "###":
                takenext = True
    else:
        handle = open(txtp, 'w+')
        for param, value in config.items('KWS.prepro'):
            handle.write('%s: %s%s' % (param, value, os.linesep))
        for param, value in config.items('KWS.features'):
            handle.write('%s: %s%s' % (param, value, os.linesep))
        handle.write('###' + os.linesep)
        handle.close()

    # get the data
    if svgpath is None:
        svgd = get_absolute_path(config.get('KWS', 'locations'))
    else:
        svgd = get_absolute_path(svgpath)
    svgs = glob(os.path.join(svgd, '*.svg'))

    if imgpath is None:
        imgd = get_absolute_path(config.get('KWS', 'images'))
    else:
        imgd = get_absolute_path(imgpath)
    imgs = glob(os.path.join(imgd, '*.jpg'))

    # parse some parameters
    threshold = float(config.get('KWS.prepro', 'segmentation_threshold'))
    relative_height = float(config.get('KWS.prepro', 'relative_height'))
    skew_resolution = float(config.get('KWS.prepro', 'angular_resolution'))
    primary_peak_height = float(config.get('KWS.prepro',
                                           'primary_peak_height'))
    secondary_peak_height = float(
        config.get('KWS.prepro', 'secondary_peak_height'))
    window_width = int(config.get('KWS.features', 'window_width'))
    step_size = int(config.get('KWS.features', 'step_size'))
    blocks = int(config.get('KWS.features', 'number_of_blocks'))
    svgs.sort()
    imgs.sort()

    for svgp, imgp in zip(svgs, imgs):
        svgid = os.path.basename(svgp).replace('.svg', '')
        imgid = os.path.basename(imgp).replace('.jpg', '')
        print('\t%s\n\t%s' % (svgp, imgp))

        if svgid != imgid:
            raise IOError(
                'the ids of the image file (%s) and the svg file (%s) are not the same'
                % (svgid, imgid))

        trans = get_transcription(svgid)

        print('\tdoc id: %s' % svgid)
        wids, paths = parse_svg(svgp)
        img = imread(imgp)
        for wid, path in zip(wids, paths):
            print('\tword id: %s' % wid)

            if retake and (processed.count(wid) == 1):
                print('\talready processed')
                continue

            # look up the corresponding word
            if saveimgs:
                imgfile = wid
                word = get_word(wid, data=trans)
                if word is not None:
                    imgfile = word.code2string() + '_' + imgfile
            else:
                imgfile = None

            # get the word image
            poly = path2polygon(path)
            roi = crop(img, poly)

            pre, sym = word_preprocessor(roi,
                                         threshold=threshold,
                                         rel_height=relative_height,
                                         skew_res=skew_resolution,
                                         ppw=primary_peak_height,
                                         spw=secondary_peak_height,
                                         save=imgfile)

            if isinstance(pre, str):
                print('\tpre-processing failed\n\t\t%s' % pre)
                continue

            fea = compute_features(pre,
                                   window_width=window_width,
                                   step_size=step_size,
                                   blocks=blocks)

            write_word_features(txtp, wid, fea,
                                [pre.shape[0], pre.shape[1], sym])
            print('...')
Example 32
    def load_input_file(self):
        # Get the configurations
        config = fio.get_config()
        images_path = config.get('molecules', 'images')
        self.file_path = str(Path(images_path) / (self.file_number + ".gxl"))
        self.parse_gxl_file()
Example 33
import os
import numpy as np

from ip import doc_processor
from search.kws import DataSet
from utils.fio import get_config, get_absolute_path
from utils.transcription import Word, WordCoord
from dtwextension import dtwdistance

config = get_config()

doc_processor.main(imgpath=config.get('Evaluation.KWS', 'images'),
                   svgpath=config.get('Evaluation.KWS', 'svg'),
                   outputfile=config.get('Evaluation.KWS', 'feature-map'))

# parse the keywords
kwp = get_absolute_path(config.get('Evaluation.KWS', 'keywords'))
words = []
coords = []
for line in open(kwp, 'r'):
    parts = line.strip().split(',')
    words.append(Word(parts[0].strip()))
    coords.append(WordCoord(parts[1].strip()))

# parse the feature maps
train = DataSet.parse(get_absolute_path(config.get('KWS.features', 'file')))
evalu = DataSet.parse(
    get_absolute_path(config.get('Evaluation.KWS', 'feature-map')))

outputfile = os.path.join(
    get_absolute_path(config.get('Evaluation', 'output')), 'kws-dists.csv')
handle = open(outputfile, 'w+')
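
The script ends with the output file opened and nothing written. One plausible continuation, purely a sketch: it assumes each keyword coordinate appears in the training feature map, and that DataSet exposes X and coords as in the examples above:

# sketch only -- not part of the original script
train_ids = [c.id for c in train.coords]
for word, coord in zip(words, coords):
    kw_mat = train.X[train_ids.index(coord.id)]  # features of the keyword instance
    dists = [dtwdistance(kw_mat, m) for m in evalu.X]
    handle.write('%s,%s\n' % (word, ','.join('%f' % d for d in dists)))
handle.close()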