Example #1
def test():

    # reconstruction documents from datasets D1, D2, and D3 (cdip)
    docs = ['datasets/D1/mechanical/D{:03}'.format(i) for i in range(1, 62) if i != 3]
    docs += ['datasets/D2/mechanical/D{:03}'.format(i) for i in range(1, 21)]
    docs += ['datasets/D3/mechanical/D{:03}'.format(i) for i in range(1, 101)]
    
    # load strips
    strips_list = [Strips(path=doc, filter_blanks=True) for doc in docs]
    #random.shuffle(docs)
    #docs = docs[: MAX_NUM_DOCS]
    for doc, strips in zip(docs, strips_list):
        map_doc_strips[doc] = strips  # map_doc_strips is a global dict (cf. Example #2)
        # each individual strip of a document is assigned a unique ID;
        # map each document filename to the IDs of its strips
        # map_doc_ids = {}
        # cum = 0
        
        # map_doc_ids[doc] = list(range(cum, cum + size))
        # cum += size  
        size = len(strips.strips)
        # instances
        instance = {
            'docs': [doc],
            'accuracy': None,
            'sizes': [size],
            'opt_time': None
        }
        result = solve(instance)
        print('doc={} accuracy={:.2f}% opt_time={:.2f}s'.format(
            doc, 100 * result['accuracy'], result['opt_time']
        ))
Example #2
def test():

    # reconstruction documents of the dataset D2
    docs = ['datasets/D2/mechanical/D{:03}'.format(i) for i in range(1, 21)]
    
    # pick MAX_NUM_DOCS documents
    random.shuffle(docs)
    docs = docs[: MAX_NUM_DOCS]
    
    # load strips
    strips_list = [Strips(path=doc, filter_blanks=True) for doc in docs]
    
    # each individual strip of a document is assigned a unique ID;
    # map each document filename to the IDs of its strips
    map_doc_ids = {}
    cum = 0
    for doc, strips in zip(docs, strips_list):
        size = len(strips.strips)
        map_doc_strips[doc] = strips  # map_doc_strips is a global variable
        map_doc_ids[doc] = list(range(cum, cum + size))
        cum += size

    # main loop
    start = time()
    for k in range(1, MAX_NUM_DOCS + 1):
        # picking k documents
        picked_docs = docs[ : k]

        # number of strips of each picked document
        sizes_k = []
        for doc in picked_docs:
            sizes_k.append(len(map_doc_ids[doc]))

        # instances
        instance = {
            'docs': picked_docs,
            'accuracy': None,
            'sizes': sizes_k,
            'opt_time': None
        }
        result = solve(instance)
        print('k={} accuracy={:.2f}% opt_time={:.2f}s'.format(
            k, 100 * result['accuracy'], result['opt_time']
        ))
Example #3
def test(args, radius=15, hdisp=3, disp_noise=2, pcont=0.3):

    docs = glob.glob('{}/**/*.tif'.format(ISRI_DATASET_DIR), recursive=True)
    for f, fname in enumerate(docs, 1):
        print('Processing document {}/{}'.format(f, len(docs)))
        if os.path.basename(fname).replace('.tif', '') in ignore_images:
            print('{} is not considered to compose the dataset.'.format(fname))
            continue

        # generate temporary strips
        print('     => Shredding')
        if os.path.exists(TEMP_DIR):
            shutil.rmtree(TEMP_DIR)
        os.makedirs('{}/strips'.format(TEMP_DIR))
        image = cv2.imread(fname)
        h, w, c = image.shape
        acc = 0
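        # cut the page into args.num_strips vertical strips of near-equal width,
        # adding random noise along each cut edge to simulate shredding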
        for i in range(args.num_strips):
            dw = int((w - acc) / (args.num_strips - i))
            strip = image[:, acc:acc + dw]
            noise_left = np.random.randint(0, 255,
                                           (h, disp_noise)).astype(np.uint8)
            noise_right = np.random.randint(0, 255,
                                            (h, disp_noise)).astype(np.uint8)
            for j in range(c):  # for each channel
                strip[:, :disp_noise, j] = cv2.add(strip[:, :disp_noise, j],
                                                   noise_left)
                strip[:, -disp_noise:, j] = cv2.add(strip[:, -disp_noise:, j],
                                                    noise_right)
            cv2.imwrite('{}/strips/D001{:02}.jpg'.format(TEMP_DIR, i + 1),
                        strip)
            acc += dw

        print('     => Load strips object', end='')
        strips = Strips(path=TEMP_DIR, filter_blanks=True)
        print('done!')
Example #4
import sys
import matplotlib.pyplot as plt
import time

sys.path.append('.')
from docrec.strips.strips import Strips

# segmentation
print('=> Segmentation')
t0 = time.time()
strips = Strips(path='datasets/D2/mechanical/D002', filter_blanks=True)
strips.plot()
plt.show()
print('Strips elapsed time={:.2f} seconds'.format(time.time() - t0))
N = len(strips.strips)
fig = plt.figure(figsize=(8, 8), dpi=150)
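# render and display each pair of ground-truth neighboring strips (j == i + 1)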
for i in range(N):
    for j in range(N):
        if i + 1 == j:
            t0 = time.time()
            print(i, j, N)
            image = strips.pair(i, j, filled=True)
            print('Pairing time={:.2f} seconds'.format(time.time() - t0))
            plt.clf()
            plt.imshow(image)
            #plt.savefig('/home/thiagopx/temo/{}-{}.pdf'.format(i, j))
            plt.show()
#strips.plot()
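# The block below is an illustrative sketch (not part of the original script): it
# saves each neighboring pair to disk instead of displaying it; the output
# directory 'test/test_pairs' is only an assumed location.
import os
os.makedirs('test/test_pairs', exist_ok=True)
for i in range(N - 1):
    pair_image = strips.pair(i, i + 1, filled=True)
    plt.imsave('test/test_pairs/{:02}-{:02}.png'.format(i, i + 1), pair_image)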

Example #5
parser.add_argument(
    '-j', '--j', action='store', dest='j', required=False, type=int,
    default=3, help='Index of strip Sj.'
)
args = parser.parse_args()

# model
images_ph = tf.placeholder(tf.float32, name='images_ph', shape=(None, 3, input_size, input_size)) # channels first
images_adjust_op = tf.image.convert_image_dtype(images_ph, tf.float32)
logits_op = squeezenet(images_ph, 'val', 2, channels_first=True)
probs_op = tf.nn.softmax(logits_op)
predictions_op = tf.argmax(logits_op, 1)

# pair
i, j = args.i, args.j
strips = Strips(path=args.doc, filter_blanks=True)
si, sj = strips.strips[i], strips.strips[j]
hi, wi, _ = si.image.shape
hj, wj, _ = sj.image.shape
min_y = radius_search + radius_feat
max_y = min(hi, hj) - 1 - radius_search - radius_feat
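# smooth the right/left border offset profiles with a small weighted-average kernel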
smi = np.correlate(si.offsets_r, [0.05, 0.1, 0.7, 0.1, 0.05], mode='same')
smj = np.correlate(sj.offsets_l, [0.05, 0.1, 0.7, 0.1, 0.05], mode='same')
support = np.hstack([si.filled_image(), sj.filled_image()])
hs, ws, _ = support.shape
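# black spacer (31 px wide), presumably used to separate the two strips in the
# composed visualization (its use is not shown in this excerpt)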
blank = np.zeros((hs, 31, 3), dtype=np.uint8)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    params_fname = open('best_model.txt').read()
    load(params_fname, sess, model_scope='SqueezeNet')
Example #6
def test():

    global solver
    global extra_args

    # parameters processing
    parser = argparse.ArgumentParser(
        description='Testing reconstruction of mixed documents.')
    parser.add_argument('-d',
                        '--dataset',
                        action='store',
                        dest='dataset',
                        required=False,
                        type=str,
                        default='cdip',
                        help='Dataset [D1, D2, or cdip].')
    parser.add_argument('-np',
                        '--nproc',
                        action='store',
                        dest='nproc',
                        required=False,
                        type=int,
                        default=10,
                        help='Number of processes.')
    args = parser.parse_args()

    assert args.dataset in ['D1', 'D2', 'cdip']

    # reconstruction instances
    if args.dataset == 'D1':
        docs = [
            'datasets/D1/mechanical/D{:03}'.format(i) for i in range(1, 62)
            if i != 3
        ]
    elif args.dataset == 'D2':
        docs = [
            'datasets/D2/mechanical/D{:03}'.format(i) for i in range(1, 21)
        ]
    else:
        docs = [
            'datasets/D3/mechanical/D{:03}'.format(i) for i in range(1, 101)
        ]  # cdip
    # shuffle documents
    random.shuffle(docs)  # the other scripts must reproduce this same shuffle (same seed)
    ndocs = len(docs)

    # global compatibility matrix
    result_glob = json.load(
        open('results/exp1_proposed/{}_matrix.json'.format(args.dataset), 'r'))
    compatibilities = np.array(
        result_glob['compatibilities']
    )  # generated with permuted documents (same seed)

    # strips ids for each document
    cum = 0
    ids_strips = {}
    for doc in docs:
        size = len(Strips(path=doc, filter_blanks=True).strips)
        ids_strips[doc] = list(range(cum, cum + size))
        cum += size

    # results / initial configuration
    ndocs_per_iter = [1, 2, 3, 4, 5] + list(range(10, ndocs + 1, 5))
    ndocs_per_iter.reverse()  # process in reverse order
    results = {
        'matrix_id': '{}_matrix'.format(args.dataset),
        # 'k_init': 0,                                # index of ndocs_per_iter where the process should start
        'backup_iter': 0,  # last iteration
        'state': None,  # random number generator state
        'data': {str(k): [] for k in ndocs_per_iter}  # experimental results data
    }

    results_fname = 'results/exp1_proposed/{}.json'.format(args.dataset)
    if os.path.exists(results_fname):
        results = json.load(open(results_fname))
    it = results['backup_iter']

    state = results['state']
    if state is not None:
        state = (state[0], tuple(state[1]), state[2])
        random.setstate(state)

    # main loop
    with Pool(processes=args.nproc) as pool:
        start = time()
        total = sum([ndocs - k + 1 for k in ndocs_per_iter])
        combs = [(k, offset) for k in ndocs_per_iter
                 for offset in range(ndocs - k + 1)]
        instances = []
        for k, offset in combs[it:]:
            # picking documents
            picked_docs = docs[offset:offset + k]

            # union of ids
            ids_strips_k = []
            sizes_k = []
            for doc in sorted(picked_docs):
                ids_strips_k += ids_strips[doc]
                sizes_k.append(len(ids_strips[doc]))

            # crop compatibilities
            compatibilities_k = compatibilities[ids_strips_k][:, ids_strips_k]

            # shuffle strips
            N = compatibilities_k.shape[0]
            init_perm_k = list(range(N))
            random.shuffle(init_perm_k)

            # shuffle compatibilities
            compatibilities_k = compatibilities_k[init_perm_k][:, init_perm_k]

            # update instances
            instance = {
                'k': str(k),
                'offset': offset,
                'docs': picked_docs,
                'solution': None,
                'accuracy': None,
                'init_perm': init_perm_k,
                'sizes': sizes_k,
                'compatibilities': compatibilities_k,
                'opt_time': None
            }
            instances.append(instance)

            # run when the buffer holds args.nproc instances or when the last iteration is reached
            if (len(instances) == args.nproc) or (it + len(instances)
                                                  == total):
                print('Iterations {}-{}/{}'.format(it + 1, it + len(instances),
                                                   total),
                      end=' ')
                results_buffer = pool.map(solve, instances)
                it += len(instances)
                instances = []  # reset instances
                elapsed = time() - start
                predicted = elapsed * (total - it) / it
                print(':: elapsed={:.2f}s :: predicted={:.2f}s'.format(
                    elapsed, predicted))
                for result in results_buffer:
                    results['data'][result['k']].append(result)
                    print(
                        '    => k={} offset={} accuracy={:.2f}% opt_time={:.2f}s'
                        .format(result['k'], result['offset'],
                                100 * result['accuracy'], result['opt_time']))

                # dump results and current state
                results['backup_iter'] = it
                results['state'] = random.getstate()
                json.dump(results, open(results_fname, 'w'))
Example #7
    logits_op = model.output
    conv10_op = model.view

    probs_op = tf.nn.softmax(logits_op)
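    # compatibility = max positive-class probability; displacement = argmax index
    # minus args.vshift (each batch element presumably corresponds to one vertical shift)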
    comp_op = tf.reduce_max(probs_op[:, 1])
    disp_op = tf.argmax(probs_op[:, 1]) - args.vshift

    sess.run(tf.global_variables_initializer())
    best_epoch = json.load(open('traindata/{}/info.json'.format(args.arch), 'r'))['best_epoch']
    weights_path = 'traindata/{}/model/{}.{}'.format(args.arch, best_epoch, model_file_ext)
    model.load_weights(weights_path)

    base_path = 'illustration/heatmap/{}'.format(args.arch)
    os.makedirs(base_path, exist_ok=True)

    strips_regular = Strips(path=args.doc, filter_blanks=True)
    strips_shuffled = strips_regular.copy().shuffle()

    N = strips_regular.size()
    for strips, strips_type in zip([strips_regular, strips_shuffled], ['regular', 'shuffled']):

        # features
        features = []
        for strip in strips.strips:
            left, right = extract_features(strip, (input_size_h, input_size_w))
            features.append((left, right))

        # convert the strip images to BGR
        for strip in strips.strips:
            strip.image = strip.image[..., ::-1]
Example #8
import numpy as np

from docrec.strips.strips import Strips


def features(points):
    feat = []
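    # each value is the dot product of consecutive displacement vectors
    # (p1->p2).(p2->p3): a measure of how straight the polyline is at p2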
    for (x3, y3), (x2, y2), (x1, y1) in zip(points[2:], points[1:-1],
                                            points[:-2]):
        val = (y3 - y2) * (y2 - y1) + (x3 - x2) * (x2 - x1)
        feat.append(val)
    return np.array(feat)
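# illustrative check (not part of the original script): for three collinear points
# the consecutive segments are parallel, e.g. features([(0, 0), (1, 1), (2, 2)]) -> array([2])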


doc = 'datasets/D2/mechanical/D002'
i = 21
j = 22
strips = Strips(path=doc, filter_blanks=True)
si, sj = strips.strips[i], strips.strips[j]
hi, wi, _ = si.image.shape
hj, wj, _ = sj.image.shape

offset = wi
stride_feat = 5
radius_feat = 20
radius_search = 5
num_points = 100
#num_feat = 2 * int(radius_feat / stride_feat)
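# valid y-range for sampling points so that the search and feature windows stay
# inside both strips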
min_y = radius_search + radius_feat
max_y = min(hi, hj) - 1 - radius_search - radius_feat
stride = int((max_y - min_y) / (num_points - 1))
smi = np.correlate(si.offsets_r, [0.05, 0.1, 0.7, 0.1, 0.05], mode='same')
smj = np.correlate(sj.offsets_l, [0.05, 0.1, 0.7, 0.1, 0.05], mode='same')
Example #9
    Pipeline(sleit, solvers_min)
]

# reconstruction instances
docs1 = [
    'datasets/D1/mechanical/D{:03}'.format(i) for i in range(1, 62) if i != 3
]
docs2 = ['datasets/D2/mechanical/D{:03}'.format(i) for i in range(1, 21)]
docs = docs1 + docs2

processed = 1
total = len(docs) * len(pipelines)
results = dict()
for doc in docs:
    t0 = time.time()
    strips = Strips(path=doc, filter_blanks=True)
    strips.shuffle()
    init_permutation = strips.permutation()
    t_load = time.time() - t0
    results[doc] = dict(init_permutation=init_permutation,
                        time=t_load,
                        algorithms=dict())
    for pipeline in pipelines:
        print('[{:.2f}%] algorithm={} doc={} ::'.format(
            100 * processed / total, pipeline.algorithm.id(), doc),
              end='')
        processed += 1
        d = 2 if pipeline.algorithm.id() == 'marques' else 0
        pipeline.run(strips, d)
        results[doc]['algorithms'][pipeline.algorithm.id()] = dict()
        results[doc]['algorithms'][
Example #10
                     weights_path,
                     10, (3000, width),
                     num_classes=NUM_CLASSES,
                     verbose=False,
                     thresh=args.thresh)
if args.solver == 'concorde':
    solver = SolverConcorde(maximize=True, max_precision=2)
elif args.solver == 'kbh':
    solver = SolverKBH(maximize=True)
else:
    solver = SolverLS(maximize=True)
pipeline = Pipeline(algorithm, [solver])

# load the strips and shuffle them
print('1) Load strips')
strips = Strips(path=args.doc, filter_blanks=True)
strips.shuffle()
init_permutation = strips.permutation()
print('Shuffled order: ' + str(init_permutation))

print('2) Results')
pipeline.run(strips)
# matrix -> list (displacements between neighboring strips in the solution)
compatibilities = pipeline.algorithm.compatibilities
displacements = pipeline.algorithm.displacements
solution = pipeline.solvers[0].solution
displacements = [
    displacements[prev][curr]
    for prev, curr in zip(solution[:-1], solution[1:])
]
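# map the solver's solution (indices into the shuffled order) back to the
# original strip labels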
corrected = [init_permutation[idx] for idx in solution]
Example #11
    print('Processing document {}/{}'.format(f, len(docs)))
    if os.path.basename(fname).replace('.tif', '') in ignore_images:
        print('{} is not considered to compose the dataset.'.format(fname))
        continue

    # generate temporary strips
    print('     => Shredding')
    if os.path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.makedirs('{}/strips'.format(TEMP_DIR))
    image = cv2.imread(fname)
    h, w, c = image.shape
    acc = 0
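    # cut the page into 30 vertical strips of near-equal width, adding random
    # noise along each cut edge to simulate shredding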
    for i in range(30):
        dw = int((w - acc) / (30 - i))
        strip = image[:, acc:acc + dw]
        noise_left = np.random.randint(0, 255,
                                       (h, disp_noise)).astype(np.uint8)
        noise_right = np.random.randint(0, 255,
                                        (h, disp_noise)).astype(np.uint8)
        for j in range(c):  # for each channel
            strip[:, :disp_noise, j] = cv2.add(strip[:, :disp_noise, j],
                                               noise_left)
            strip[:, -disp_noise:, j] = cv2.add(strip[:, -disp_noise:, j],
                                                noise_right)
        cv2.imwrite('{}/strips/D001{:02}.jpg'.format(TEMP_DIR, i + 1), strip)
        acc += dw

    print('     => Load strips object')
    strips = Strips(path=TEMP_DIR, filter_blanks=True)
Example #12
# solver = [SolverLS(maximize=True)]
solvers = [
    SolverConcorde(maximize=False, max_precision=2),
    SolverKBH(maximize=False)
]

# reconstruction instances
docs1 = [
    'datasets/D1/mechanical/D{:03}'.format(i) for i in range(1, 62) if i != 3
]
docs2 = ['datasets/D2/mechanical/D{:03}'.format(i) for i in range(1, 21)]
docs = docs1 + docs2

# load and shuffle the strips of each document
strips_all = {
    doc: Strips(path=doc, filter_blanks=True).shuffle()
    for doc in docs
}

processed = 1
total = len(docs) * len(algorithms)
records = []
for algorithm in algorithms:
    d = 2 if algorithm.id() == 'marques' else 0
    for doc, strips in strips_all.items():
        print('[{:.2f}%] algorithm={} doc={}'.format(100 * processed / total,
                                                     algorithm.id(), doc))
        processed += 1
        init_permutation = strips.permutation()
        compatibilities = algorithm(strips=strips, d=d).compatibilities
        qc = Qc(compatibilities,
Example #13
predictions_op = tf.argmax(logits_op, 1)

# segmentation
print('Processing document {}'.format(args.doc))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    params_fname = open('best_model.txt').read()
    load(params_fname, sess, model_scope='SqueezeNet')
    classes = ['negative', 'positive']

    t0_global = time.time()
    print('=> Segmentation')
    t0 = time.time()
    #strips = StripsText(path=args.doc, filter_blanks=True)
    strips = Strips(path=args.doc, filter_blanks=True)
    print('Segmentation elapsed time={:.2f} seconds'.format(time.time() - t0))

    N = len(strips.strips)
    pcont = 0.2
    scores = np.zeros((N, N), dtype=np.int32)
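    # score each pair of ground-truth neighboring strips (j = i + 1)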
    for i in range(N - 1):
        border = strips.strips[i].offsets_r
        j = i + 1

        print('=> Scoring [{}][{}]'.format(i, j))
        image = strips.pair(i, j, accurate=True, filled=True)
        h, w, _ = image.shape
        _, image_bin = cv2.threshold(cv2.cvtColor(image, cv2.COLOR_RGB2GRAY),
                                     0, 1, cv2.THRESH_BINARY +
                                     cv2.THRESH_OTSU)  # range [0, 1]
Example #14
import sys
import os
import cv2

sys.path.append('.')
from docrec.strips.strips import Strips

docs = ['datasets/D2/mechanical/D{:03}'.format(i) for i in range(1, 21)][:1]
for doc in docs:
    print('Processing document {}'.format(doc))
    strips = Strips(path=doc, filter_blanks=True)
    image = strips.reconstruction_image()
    cv2.imwrite('test/test_recimage/{}.jpg'.format(os.path.basename(doc)),
                image)
Example #15
def main():

    # parameters processing
    parser = argparse.ArgumentParser(
        description='Testing reconstruction of mixed documents.')
    parser.add_argument('-d',
                        '--dataset',
                        action='store',
                        dest='dataset',
                        required=False,
                        type=str,
                        default='cdip',
                        help='Dataset [D1, D2, or cdip].')
    parser.add_argument('-m',
                        '--model-id',
                        action='store',
                        dest='model_id',
                        required=False,
                        type=str,
                        default=None,
                        help='Model identifier (tag).')
    args = parser.parse_args()

    assert args.dataset in ['D1', 'D2', 'cdip']

    # save directory
    save_dir = 'results/exp1_proposed'
    os.makedirs(save_dir, exist_ok=True)

    results_fname = '{}/{}_matrix.json'.format(save_dir, args.dataset)

    # system setup
    weights_path = json.load(
        open('traindata/{}/info.json'.format(args.model_id),
             'r'))['best_model']
    algorithm = Proposed('sn',
                         weights_path,
                         10, (3000, 32),
                         num_classes=2,
                         verbose=True,
                         thresh_method='sauvola',
                         seed=SEED)

    # reconstruction instances
    if args.dataset == 'D1':
        docs = [
            'datasets/D1/mechanical/D{:03}'.format(i) for i in range(1, 62)
            if i != 3
        ]
    elif args.dataset == 'D2':
        docs = [
            'datasets/D2/mechanical/D{:03}'.format(i) for i in range(1, 21)
        ]
    else:
        docs = [
            'datasets/D3/mechanical/D{:03}'.format(i) for i in range(1, 101)
        ]  # cdip
    # shuffle documents
    random.shuffle(docs)
    ndocs = len(docs)

    # load the strips of all documents and mix them (no shuffling here)
    strips_list = [Strips(path=doc, filter_blanks=True) for doc in docs]
    strips = MixedStrips(strips_list, shuffle=False)
    t0 = time()
    algorithm.run(strips, 0)
    comp_time = time() - t0
    print('ndocs={} inf_time={:.2f}s comp_time={:.2f}s'.format(
        ndocs, algorithm.inference_time, comp_time))
    results = {
        'init_perm': strips.init_perm,
        'sizes': strips.sizes,
        'compatibilities': algorithm.compatibilities.tolist(),
        'displacements': algorithm.displacements.tolist(),
        'inf_time': algorithm.inference_time,
        'comp_time': comp_time
    }
    json.dump(results, open(results_fname, 'w'))