Beispiel #1
0
                    if args.v:
                        print 'error reading scandata file: ', scandata_file
                    continue

                scandata_pages = scandata.find('pageData').findall('page')

                if args.v:
                    print 'Loaded', len(
                        scandata_pages), ' scandata pages in', clock() - t, 's'

                ia_page_index = 0
                scandata_index = 0

                for page in scandata_pages:

                    if (skipScanDataPage(page)):
                        scandata_index += 1
                        continue

                    # db_item = collection.find_one({'scan_id': row[0], 'ia_page_num': ia_page_index})

                    # if (db_item is None):

                    info = {
                        'scan_id': row[0],
                        'scandata_index': scandata_index,
                        'ia_page_num': ia_page_index,
                        'page_num': ia_page_index + 1,
                        'leaf_num': page.get('leafNum'),
                        'has_illustration': {
                            'gold_standard': False
Beispiel #2
0
                scan_file_string = scan_file_string.replace('xmlns="http://archive.org/scribe/xml"', '')
                scandata = ET.fromstring(scan_file_string)
            except:
                if args.v:
                    print 'error reading scandata file: ', scandata_file
                continue

            scandata_pages = scandata.find('pageData').findall('page')

            if args.v:
                print 'Loaded', len(scandata_pages), ' scandata pages in', clock() - t, 's'

            ia_page_index = 0
            scandata_index = 0

        while(scandata_index < len(scandata_pages) and skipScanDataPage(scandata_pages[scandata_index])):
            if args.v:
                print 'Skipping ', scandata_index
            scandata_index += 1

        db_item = collection.find_one({'scan_id': row[0], 'ia_page_num': ia_page_index})

        if (db_item is None):

            info = {
                'scan_id': row[0],
                'scandata_index': scandata_index,
                'ia_page_num': ia_page_index,
                'page_num': ia_page_index + 1,
                'leaf_num': scandata_pages[scandata_index].get('leafNum') if (scandata_index < len(scandata_pages)) else '',
                'has_illustration': {
            print 'parsing scandata'
            t = clock()
            scandata = ET.parse(scandata_file)
            scandata_pages = scandata.find('pageData').findall('page')
            print 'found', len(scandata_pages), 'pages from scan data in', clock() - t, 's'

            print 'parsing abbyy'
            t = clock()
            abbyy = ET.parse(gzip.open(abbyy_file))
            abbyy_pages = abbyy.findall('{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}page')
            print 'found', len(abbyy_pages), 'pages from abbyy data in', clock() - t, 's'

            ia_page_index = 0
            scandata_index = 0

        while(skipScanDataPage(scandata_pages[scandata_index])):
            scandata_index += 1

        result = processPage(
            scan_id,
            ia_page_index,
            scandata_pages[scandata_index],
            abbyy_pages[scandata_index],
            False
        )

        output_writer.writerow([
            row[0],
            row[3],
            result['abbyy_processing'],
            result['n_picture_blocks'],
Beispiel #4
0
                except:
                    if args.v:
                        print 'error reading scandata file: ', scandata_file
                    continue

                scandata_pages = scandata.find('pageData').findall('page')

                if args.v:
                    print 'Loaded', len(scandata_pages), ' scandata pages in', clock() - t, 's'

                ia_page_index = 0
                scandata_index = 0

                for page in scandata_pages:

                    if (skipScanDataPage(page)):
                        scandata_index += 1
                        continue

                    # db_item = collection.find_one({'scan_id': row[0], 'ia_page_num': ia_page_index})

                    # if (db_item is None):

                    info = {
                        'scan_id': row[0],
                        'scandata_index': scandata_index,
                        'ia_page_num': ia_page_index,
                        'page_num': ia_page_index + 1,
                        'leaf_num': page.get('leafNum'),
                        'has_illustration': {
                            'gold_standard': False
            print "parsing scandata"
            t = clock()
            scandata = ET.parse(scandata_file)
            scandata_pages = scandata.find("pageData").findall("page")
            print "found", len(scandata_pages), "pages from scan data in", clock() - t, "s"

            print "parsing abbyy"
            t = clock()
            abbyy = ET.parse(gzip.open(abbyy_file))
            abbyy_pages = abbyy.findall("{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}page")
            print "found", len(abbyy_pages), "pages from abbyy data in", clock() - t, "s"

            ia_page_index = 0
            scandata_index = 0

        while skipScanDataPage(scandata_pages[scandata_index]):
            scandata_index += 1

        result = processPage(scan_id, ia_page_index, scandata_pages[scandata_index], abbyy_pages[scandata_index], False)

        output_writer.writerow(
            [row[0], row[3], result["abbyy_processing"], result["n_picture_blocks"], result["coverage"]]
        )

        ia_page_index += 1
        scandata_index += 1

    output_file.close()
    control_file.close()

    print "Finished in", (clock() - t0)
Beispiel #6
0
def runCSV():

    import argparse
    import gzip
    from helpers import skipScanDataPage
    from xml.etree import cElementTree as ET
    import sys

    ap = argparse.ArgumentParser(description='picture block processing')
    ap.add_argument('scan', type=str, help='scan id')
    ap.add_argument('--page', type=int, help='page #', default=None)
    ap.add_argument('--render', type=bool, help='render blocks', default=False)

    args = ap.parse_args()

    #scan_id = 'hallofshells00hard'
    scan_id = args.scan

    abbyy_file = 'scandata/%s/%s_abbyy.gz' % (scan_id, scan_id)
    scandata_file = 'scandata/%s/%s_scandata.xml' % (scan_id, scan_id)

    print 'parsing scandata'
    t = clock()
    scandata = ET.parse(scandata_file)
    scandata_pages = scandata.find('pageData').findall('page')
    print 'found', len(scandata_pages), 'pages from scan data in', clock() - t, 's'

    print 'parsing abbyy'
    t = clock()
    abbyy = ET.parse(gzip.open(abbyy_file))
    abbyy_pages = abbyy.findall('{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}page')
    print 'found', len(abbyy_pages), 'pages from abbyy data in', clock() - t, 's'

    results = []
    ia_page_index = 0
    for i in range(0, len(scandata_pages)):
        if skipScanDataPage(scandata_pages[i]):
            continue
        if (args.page == None):
            # process all pages
            results.append(processPage(
                scan_id,
                ia_page_index,
                scandata_pages[i],
                abbyy_pages[i],
                args.render
            ))
        elif (i == args.page):
            break
        ia_page_index += 1

    if (args.page != None):
        print processPage(scan_id, ia_page_index, scandata_pages[i], abbyy_pages[i], args.render)
        sys.exit()

    import csv
    output_filename = 'output/pictureblocks/%s-pictureblocks.csv' % scan_id
    if not os.path.exists(os.path.dirname(output_filename)):
        os.mkdir(os.path.dirname(output_filename))

    output_file = open(output_filename, 'w')
    writer = csv.writer(output_file)
    writer.writerow([
        'IA page',
        'Image detected',
        'Processing time',
        '# of picture blocks',
        '% coverage',
        'intersection'
    ])

    for p in range(0, len(results)):
        #print p
        writer.writerow([
            p,
            results[p]['image_detected'],
            results[p]['abbyy_processing'],
            results[p]['n_picture_blocks'],
            results[p]['coverage'],
            results[p]['blocks_intersect']
        ])
        if (results[p]['image_detected']):
            print 'Image detected on page', p

    output_file.close()

    if (args.render):
        print 'Avg image processing time:', average(benchmarks['image_processing']), 's'
Beispiel #7
0
def runCSV():
    """Process one scan's pages into a per-page picture-block CSV report.

    Parses the scan's scandata XML and gzipped ABBYY FineReader XML, runs
    processPage() on every non-skipped page (or just --page), and writes
    per-page results to output/pictureblocks/<scan>-pictureblocks.csv.

    Relies on module-level names defined elsewhere in this file:
    processPage, clock, os, average, benchmarks.
    """

    import argparse
    import gzip
    from helpers import skipScanDataPage
    from xml.etree import cElementTree as ET
    import sys

    ap = argparse.ArgumentParser(description='picture block processing')
    ap.add_argument('scan', type=str, help='scan id')
    ap.add_argument('--page', type=int, help='page #', default=None)
    # NOTE(review): type=bool is an argparse trap -- bool('False') is True,
    # so "--render False" still enables rendering; only omitting the flag
    # yields False.
    ap.add_argument('--render', type=bool, help='render blocks', default=False)

    args = ap.parse_args()

    # Example scan id kept for reference:
    #scan_id = 'hallofshells00hard'
    scan_id = args.scan

    # Both input files live under scandata/<scan_id>/.
    abbyy_file = 'scandata/%s/%s_abbyy.gz' % (scan_id, scan_id)
    scandata_file = 'scandata/%s/%s_scandata.xml' % (scan_id, scan_id)

    print 'parsing scandata'
    t = clock()
    scandata = ET.parse(scandata_file)
    scandata_pages = scandata.find('pageData').findall('page')
    print 'found', len(
        scandata_pages), 'pages from scan data in', clock() - t, 's'

    print 'parsing abbyy'
    t = clock()
    # ABBYY output is gzip-compressed XML; page elements are namespaced.
    abbyy = ET.parse(gzip.open(abbyy_file))
    abbyy_pages = abbyy.findall(
        '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}page')
    print 'found', len(
        abbyy_pages), 'pages from abbyy data in', clock() - t, 's'

    results = []
    # ia_page_index counts only pages that are NOT skipped, so it can lag
    # behind the raw scandata index i.
    ia_page_index = 0
    for i in range(0, len(scandata_pages)):
        if skipScanDataPage(scandata_pages[i]):
            continue
        if (args.page == None):
            # process all pages
            results.append(
                processPage(scan_id, ia_page_index, scandata_pages[i],
                            abbyy_pages[i], args.render))
        elif (i == args.page):
            break
        ia_page_index += 1

    if (args.page != None):
        # Single-page mode: print that page's result and exit.
        # NOTE(review): if --page names a skipped or out-of-range page, the
        # loop above never breaks and i is just the last index visited
        # (NameError when scandata is empty) -- TODO confirm intended.
        print processPage(scan_id, ia_page_index, scandata_pages[i],
                          abbyy_pages[i], args.render)
        sys.exit()

    import csv
    output_filename = 'output/pictureblocks/%s-pictureblocks.csv' % scan_id
    # NOTE(review): os.mkdir raises OSError if the intermediate 'output'
    # directory does not already exist (os.makedirs would not).
    if not os.path.exists(os.path.dirname(output_filename)):
        os.mkdir(os.path.dirname(output_filename))

    output_file = open(output_filename, 'w')
    writer = csv.writer(output_file)
    writer.writerow([
        'IA page', 'Image detected', 'Processing time', '# of picture blocks',
        '% coverage', 'intersection'
    ])

    # One CSV row per processed (non-skipped) page.
    for p in range(0, len(results)):
        #print p
        writer.writerow([
            p, results[p]['image_detected'], results[p]['abbyy_processing'],
            results[p]['n_picture_blocks'], results[p]['coverage'],
            results[p]['blocks_intersect']
        ])
        if (results[p]['image_detected']):
            print 'Image detected on page', p

    output_file.close()

    if (args.render):
        # benchmarks/average are module-level names defined elsewhere.
        print 'Avg image processing time:', average(
            benchmarks['image_processing']), 's'