Example #1
    def _parse_function_inference(self, filename, label):
        """Input parser for samples of the validation/test set."""
        # convert label number into one-hot-encoding
        one_hot = tf.one_hot(label, self.num_classes)

        tool = victorinox()
        # load and preprocess the image
        img_string = tf.read_file(filename)
        #img_decoded = tf.image.decode_png(img_string)
        img_decoded = tf.image.decode_jpeg(img_string, channels=3)
        # h = img_decoded.get_shape().as_list()[0]
        # if h is None:
        #     h = 227
        # w = img_decoded.get_shape().as_list()[1]
        # if w is None:
        #     w = 227
        # h, w = tool.get_convolvable_dim(h, w)
        # img_resized = tf.image.resize_images(img_decoded, [h, w])
        img_resized = tf.image.resize_images(img_decoded,
                                             [RESIZE_DIM[0], RESIZE_DIM[1]])
        #img_centered = tf.subtract(img_resized, IMAGENET_MEAN)
        #img_centered = tf.subtract(img_decoded, IMAGENET_MEAN)
        img_centered = tf.subtract(
            img_resized, self.mean_pixels)  #OCRD_RELOC_UNRELOC_227_MEAN)

        # RGB -> BGR
        img_bgr = img_centered[:, :, ::-1]
        #img_bgr = img_decoded[:, :, ::-1]
        #img_bgr = img_resized[:, :, ::-1]

        return img_bgr, one_hot
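
A parse function like this is normally handed to a tf.data pipeline (the commented-out ImageDataGenerator block in Example #5 hints at the same setup). Below is a minimal sketch under that assumption, using TF 1.x APIs; the standalone parse_fn, the file names and the constants are illustrative stand-ins, not code from the repository.

import tensorflow as tf

def parse_fn(filename, label):
    # same steps as above: decode, resize, mean-subtract, RGB -> BGR
    img = tf.image.decode_jpeg(tf.read_file(filename), channels=3)
    img = tf.image.resize_images(img, [227, 227])
    img = tf.subtract(img, tf.constant([127.0, 127.0, 127.0]))
    return img[:, :, ::-1], tf.one_hot(label, 21)

filenames = tf.constant(["page_0001.jpg", "page_0002.jpg"])  # hypothetical paths
labels = tf.constant([0, 1])                                 # hypothetical class ids
dataset = (tf.data.Dataset.from_tensor_slices((filenames, labels))
           .map(parse_fn)
           .batch(32))
images, one_hots = dataset.make_one_shot_iterator().get_next()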
Example #2
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tag import CRFTagger
from nltk.tokenize import RegexpTokenizer
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from glob import glob
import os
import pandas as pd
import re
import PyPDF2
from collections import Counter
import math
import random

ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

tool = victorinox()  # project-local utility class, defined elsewhere in this repository
population1_dict = {}
population2_dict = {}
population_root_path = r"corpus/population"
population_files = glob(os.path.join(population_root_path, "**/*.txt"),
                        recursive=True)
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
factory = StemmerFactory()
stemmer = factory.create_stemmer()
default_stopwords = StopWordRemoverFactory().get_stop_words()
additional_stopwords = [
    "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
]
dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
id_stopword = StopWordRemover(dictionary)
en_stopword = set(stopwords.words('english'))
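
A minimal sketch, not part of the original script, of how the objects built above are typically applied to a single Indonesian sentence; the sample text is made up.

sample = "Pada hari Senin penduduk kota itu berkumpul di alun-alun"

cleaned = id_stopword.remove(sample.lower())          # drop Indonesian stopwords (incl. the weekday names added above)
tokens = tokenizer.tokenize(cleaned)                  # keep alphabetic tokens only
stems = [stemmer.stem(t) for t in tokens]             # Sastrawi root forms
stems = [t for t in stems if t not in en_stopword]    # drop any English stopwords
tagged = ct.tag_sents([stems])                        # CRF POS tags
print(stems)
print(tagged)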
Example #3
 def classify_by_knn_tensorflow(self, csv_train, csv_test, k, batch,
                                result_csv):  #X_t, y_t, x_t, k_t):
     dftrain = pd.read_csv(csv_train, sep=",", header=None)
     dftest = pd.read_csv(csv_test, sep=",", header=None)
     npdtrain = np.array(dftrain)  #[:10])
     npdtest = np.array(dftest)  #[:5])
     X_t = npdtrain[:, :-1]
     y_t = npdtrain[:, -1]
     x_t = npdtest[:, :-1]
     gt_label = npdtest[:, -1]
     k_t = k
     X_t_shape = np.shape(X_t)
     x_t_shape = np.shape(x_t)
     neg_one = tf.constant(-1.0, dtype=tf.float32)
     self.Xtrain_ph = tf.placeholder(dtype=tf.float32,
                                     shape=[X_t_shape[0], 48],
                                     name="Xtrain")  #X_t_shape[1]
     self.Xtest_ph = tf.placeholder(dtype=tf.float32,
                                    shape=[48],
                                    name="Xtest")  #x_t_shape[1]
     self.save_batch = tf.placeholder(dtype=tf.int32, name="save_batch")
     self.k_neighbor = tf.placeholder(dtype=tf.int32, name="k_neighbor")
     with open(result_csv, "a+") as f:
         pass
     results = []
     times = []
     distances = self.get_average_of_JensenShannon_using_tensorflow(
         batch=self.save_batch, Xtrains=self.Xtrain_ph, Xtest=self.Xtest_ph)
     # to find the nearest points, we find the farthest points based on negative distances
     # we need this trick because tensorflow has top_k api and no closest_k or reverse=True api
     neg_distances = tf.multiply(distances, neg_one)
     # get the indices
     vals, indx = tf.nn.top_k(neg_distances, self.k_neighbor)
     # slice the labels of these points
     y_s = tf.gather(y_t, indx)
     # we compute the L-1 distance
     #distances = tf.reduce_sum(tf.abs(tf.subtract(X_t, x_t)), 1)
     for i in range(len(x_t)):
         x_test = x_t[i]
         gtlabel = gt_label[i]
         config = tf.ConfigProto(allow_soft_placement=True)
         with tf.Session(config=config) as sess:
             #x_test=tf.reshape(x_test,shape=[48])
             begin = time.time()
             out = sess.run(y_s,
                            feed_dict={
                                self.Xtrain_ph: X_t,
                                self.Xtest_ph: x_test,
                                self.save_batch: batch,
                                self.k_neighbor: k_t
                            })
             end = time.time()
             elapsed = end - begin
             print("TIME=%f" % (elapsed))
             print("GT/predict = %s/%s %r" % (gtlabel, out, gtlabel == out[0]))
             results.append([gtlabel, out[0]])
             times.append(elapsed)
             #print self.get_label(out)
             # config = tf.ConfigProto(allow_soft_placement=True)
             # with tf.Session(config=config) as sess:
             #     out=sess.run(y_s)
             #     print self.get_label(out)
     tool = victorinox()
     npres = np.array(results)
     tool.calculate_f1_measure(npres[:, 0], npres[:, -1])
     head, tail = os.path.split(result_csv)
     times_csv = os.path.join(head, "times.csv")
     if len(results) > 0:
         np.savetxt(result_csv, results, fmt="%s", delimiter=",")
     if len(times) > 0:
         np.savetxt(times_csv, times, fmt="%s", delimiter=",")
     return  #y_s
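
The negated-distance trick in the comments above is worth isolating: tf.nn.top_k only returns the largest entries, so negating the distances makes the k smallest distances (the nearest neighbours) surface. A tiny illustration with made-up numbers:

import tensorflow as tf

distances = tf.constant([0.40, 0.10, 0.90, 0.20])
# top_k picks the k LARGEST values; on the negated distances that means
# the k nearest neighbours.
neg_vals, idx = tf.nn.top_k(-distances, k=2)

with tf.Session() as sess:
    print(sess.run(idx))  # [1 3]: the indices of the two closest points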
Example #4
import os
import numpy as np
import pandas as pd

times_csv = "/home/andri/Documents/s2/5/master_arbeit/dataset/uw3/unrelocated/augmented/times.csv"

ocrd_aug_all = "/home/andri/Documents/s2/5/master_arbeit/dataset/ocrd/unrelocated/227/augmented.csv"
ocrd_aug_train = "/home/andri/Documents/s2/5/master_arbeit/dataset/ocrd/unrelocated/227/augmented/train.csv"
ocrd_aug_test = "/home/andri/Documents/s2/5/master_arbeit/dataset/ocrd/unrelocated/227/augmented/test.csv"
ocrd_aug_rlbwxyh_train = "ocrd_aug_rlbwxh_train.csv"
ocrd_aug_rlbwxyh_test = "ocrd_aug_rlbwxh_test.csv"
ocrd_aug_rlbwxyh_classification_result_csv = "ocrd_aug_rlbwxh_classification_result.csv"
ocrd_aug_rlbwxyh_classification_times_csv = "ocrd_aug_rlbwxh_classification_times.csv"

# g.convert_images_to_rlbwxyh_from_file_csv(img_folder_csv=ocrd_aug_test,#ocrd_aug_train,#uw3_aug_test,#uw3_aug_train,
#                                 csv_path=ocrd_aug_rlbwxyh_test,#ocrd_aug_rlbwxyh_train,#uw3_aug_rlbwxyh_test,#uw3_aug_rlbwxyh_train,#uw3_aug_rlbwxh_result,
#                                 text_resume_idx=103980,#71750,#0,#415850,#358040,#309090,#73310,#0,#9600,#0,#428100,#413100,#337600,#92400,#0,          #312100,#233100,#197300,#21300,#0,###################519800,#508500,#326900,#181700,#0,
#                                 save_batch=1)#10)#10)#00)#1)#

vic = victorinox()
uw3_dict = vic.convert_csv_to_dictionary(uw3_dct_csv)
ocrd_dict = vic.convert_csv_to_dictionary(ocrd_dct_csv)

# dff=pd.read_csv(uw3_rlbwxyh_classification_result_csv,sep=",",header=None)
dff = pd.read_csv(ocrd_aug_rlbwxyh_classification_result_csv,
                  sep=",",
                  header=None)
npd = np.array(dff)
resume_idx = len(npd)
# knn.classifyKNN_by_rlbwxyh_by_train_val_csv(train_csv=uw3_aug_rlbwxyh_train,#uw3_aug_rlbwxyh,#
#                            test_csv=uw3_aug_rlbwxyh_test,
#                            class_dict=uw3_dict,
#                            result_csv=uw3_rlbwxyh_classification_result_csv,#aug_res,
#                                             times_csv=times_csv,
#                            resume_test_idx=resume_idx,#4870,#4510,#0,#9650,#6970,#4360,#3360,#910,#610,#0,
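
convert_csv_to_dictionary is a project-local victorinox helper whose input format is not shown here. Assuming a plain two-column CSV of class id and class name, an equivalent reader could be sketched as follows (hypothetical layout, not the author's implementation):

import pandas as pd

def csv_to_dict(path):
    # assumed layout: column 0 = key, column 1 = value
    df = pd.read_csv(path, sep=",", header=None)
    return dict(zip(df[0], df[1]))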
Example #5
    def knn_test(self,
                 train_csv="/a.csv",
                 test_csv="/a.csv",
                 batch_size=32,
                 num_classes=21,
                 shuffle=True,
                 mean_pixels=[127, 127, 127]):
        tool = victorinox()

        # with tf.device('/gpu:0'):#with tf.device('/cpu:0'):
        #     tr_data = ImageDataGenerator(train_csv,
        #                                  mode='training',
        #                                  batch_size=batch_size,
        #                                  num_classes=num_classes,
        #                                  shuffle=shuffle,
        #                                  mean_pixels=mean_pixels)
        #
        #     val_data = ImageDataGenerator(test_csv,
        #                                   mode='inference',
        #                                   batch_size=batch_size,
        #                                   num_classes=num_classes,
        #                                   shuffle=False,
        #                                   mean_pixels=mean_pixels)
        #     # create an reinitializable iterator given the dataset structure
        #     iterator = Iterator.from_structure(tr_data.data.output_types,
        #                                        tr_data.data.output_shapes)
        #     next_batch = iterator.get_next()
        #     training_init_op = iterator.make_initializer(tr_data.data)
        #     validation_init_op = iterator.make_initializer(val_data.data)
        # Import MNIST data
        # from tensorflow.examples.tutorials.mnist import input_data
        # mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

        # In this example, we limit mnist data
        Xtr, Ytr = self.get_data(
            train_csv, sep=","
        )  #mnist.train.next_batch(5000)  # 5000 for training (nn candidates)
        Xte, Yte = self.get_data(
            test_csv, sep=",")  #mnist.test.next_batch(200)  # 200 for testing
        # tf Graph Input
        xtr = tf.placeholder(tf.float32, [None, np.shape(Xtr)[1]])
        xte = tf.placeholder(tf.float32, np.shape(Xte)[1])  #[784])

        from tensorflow.contrib.bayesflow.python.ops.csiszar_divergence import jensen_shannon

        # Nearest Neighbor calculation using L1 Distance
        # Calculate L1 Distance

        #distance = tf.reduce_sum(tf.abs(tf.add(xtr, tf.negative(xte))), reduction_indices=1)
        distance = tool.get_average_of_JensenShannon_using_tensorflow(xtr, xte)
        # Prediction: Get min distance index (Nearest neighbor)
        pred = tf.argmin(distance, 0)

        accuracy = 0.

        # Initialize the variables (i.e. assign their default value)
        init = tf.global_variables_initializer()

        # Start training
        with tf.Session() as sess:

            # Run the initializer
            sess.run(init)

            # loop over test data
            for i in range(len(Xte)):
                # Get nearest neighbor
                nn_index = sess.run(pred, feed_dict={xtr: Xtr, xte: Xte[i, :]})
                # Get nearest neighbor class label and compare it to its true label
                print("Test", i, "Prediction:", np.argmax(Ytr[nn_index]), \
                      "True Class:", np.argmax(Yte[i]))
                # Calculate accuracy
                if np.argmax(Ytr[nn_index]) == np.argmax(Yte[i]):
                    accuracy += 1. / len(Xte)
            print("Done!")
        print("Accuracy:", accuracy)