Exemple #1
0
def get_runs(text_id):
    """Collect the OCR runs found on disk for one text, best score first.

    The text's directory (as returned by ``get_absolute_textpath``) is
    listed, entries whose name contains ``'output'`` are kept, and the first
    16 characters of each name are used as the run key (stored under
    ``'date'``). For every key the first directory matching
    ``<key>*selected_hocr_output`` is inspected.

    Args:
        text_id: Identifier handed unchanged to ``get_absolute_textpath``.

    Returns:
        A ``(path, runs)`` tuple. ``path`` is the text directory with a
        trailing ``'/'``. ``runs`` is a list of dicts with the keys
        ``'date'``, ``'score'`` (raw contents of ``best_scores_sum.txt``, or
        ``0`` when that file cannot be opened), ``'classifier'`` (third item
        returned by ``parse_dir``) and ``'link'`` (always ``''``), ordered by
        descending numeric score. Runs whose selected-hocr directory is
        missing or holds no ``*html`` entry are left out.
    """
    import os
    import glob

    def score_sort_key(run):
        # An empty or malformed score file must not abort the whole listing;
        # such runs are ordered as if their score were 0.
        try:
            return float(run['score'])
        except (TypeError, ValueError):
            return 0.0

    # Single-argument print(...) emits the same text under Python 2 and 3.
    path_to_runs = get_absolute_textpath(text_id)
    print("path to runs %s" % (path_to_runs,))
    run_dirs = os.listdir(path_to_runs)
    print("run dirs %s" % (run_dirs,))
    run_dirs = [elem for elem in run_dirs if 'output' in elem]
    # De-duplicate the 16-character prefixes shared by a run's directories.
    run_dates = sorted(set(elem[0:16] for elem in run_dirs))
    print(run_dates)

    run_info = []
    for run_date in run_dates:
        dir_for_glob = path_to_runs + '/' + run_date
        sel_hocr_dir_glob = dir_for_glob + '*selected_hocr_output'
        print(sel_hocr_dir_glob)
        matches = glob.glob(sel_hocr_dir_glob)
        if not matches:
            # Previously an IndexError; a run without a selected-hocr
            # directory is now skipped like a run without html output.
            print("no directory matches %s" % (sel_hocr_dir_glob,))
            continue
        sel_hocr_dir = matches[0]
        print("sel_hocr_dir %s" % (sel_hocr_dir,))

        run = {'date': run_date}
        try:
            # 'with' guarantees the handle is closed once the score is read.
            with open(sel_hocr_dir + '/best_scores_sum.txt') as score_file:
                run['score'] = score_file.read()
        except IOError:
            run['score'] = 0
        (_unused_a, _unused_b, classifier) = parse_dir(sel_hocr_dir)
        run['classifier'] = classifier
        # NOTE(review): the result was never used; the call is retained only
        # because this block cannot see whether the helper has side effects.
        possible_output_dirs(dir_for_glob)
        sel_hocr_dir_output = glob.glob(sel_hocr_dir + '/*html')
        # The glob results are filesystem paths, not URLs; building a real
        # link would require stripping the path to 'static' first.
        run['link'] = ''  # url_for('side_by_side_view', html_path = first_output)
        if sel_hocr_dir_output:
            run_info.append(run)
        else:
            print("there's nothing in %s" % (sel_hocr_dir,))

    sorted_run_info = sorted(run_info, reverse=True, key=score_sort_key)
    return (path_to_runs + '/', sorted_run_info)
Exemple #2
0
def get_runs(text_id):
    """Collect the OCR runs found on disk for one text, best score first.

    The text's directory (as returned by ``get_absolute_textpath``) is
    listed, entries whose name contains ``'output'`` are kept, and the first
    16 characters of each name are used as the run key (stored under
    ``'date'``). For every key the first directory matching
    ``<key>*selected_hocr_output`` is inspected.

    Args:
        text_id: Identifier handed unchanged to ``get_absolute_textpath``.

    Returns:
        A ``(path, runs)`` tuple. ``path`` is the text directory with a
        trailing ``'/'``. ``runs`` is a list of dicts with the keys
        ``'date'``, ``'score'`` (raw contents of ``best_scores_sum.txt``, or
        ``0`` when that file cannot be opened), ``'classifier'`` (third item
        returned by ``parse_dir``) and ``'link'`` (always ``''``), ordered by
        descending numeric score. Runs whose selected-hocr directory is
        missing or holds no ``*html`` entry are left out.
    """
    import os
    import glob

    def score_sort_key(run):
        # An empty or malformed score file must not abort the whole listing;
        # such runs are ordered as if their score were 0.
        try:
            return float(run['score'])
        except (TypeError, ValueError):
            return 0.0

    # Single-argument print(...) emits the same text under Python 2 and 3.
    path_to_runs = get_absolute_textpath(text_id)
    print("path to runs %s" % (path_to_runs,))
    run_dirs = os.listdir(path_to_runs)
    print("run dirs %s" % (run_dirs,))
    run_dirs = [elem for elem in run_dirs if 'output' in elem]
    # De-duplicate the 16-character prefixes shared by a run's directories.
    run_dates = sorted(set(elem[0:16] for elem in run_dirs))
    print(run_dates)

    run_info = []
    for run_date in run_dates:
        dir_for_glob = path_to_runs + '/' + run_date
        sel_hocr_dir_glob = dir_for_glob + '*selected_hocr_output'
        print(sel_hocr_dir_glob)
        matches = glob.glob(sel_hocr_dir_glob)
        if not matches:
            # Previously an IndexError; a run without a selected-hocr
            # directory is now skipped like a run without html output.
            print("no directory matches %s" % (sel_hocr_dir_glob,))
            continue
        sel_hocr_dir = matches[0]
        print("sel_hocr_dir %s" % (sel_hocr_dir,))

        run = {'date': run_date}
        try:
            # 'with' guarantees the handle is closed once the score is read.
            with open(sel_hocr_dir + '/best_scores_sum.txt') as score_file:
                run['score'] = score_file.read()
        except IOError:
            run['score'] = 0
        (_unused_a, _unused_b, classifier) = parse_dir(sel_hocr_dir)
        run['classifier'] = classifier
        # NOTE(review): the result was never used; the call is retained only
        # because this block cannot see whether the helper has side effects.
        possible_output_dirs(dir_for_glob)
        sel_hocr_dir_output = glob.glob(sel_hocr_dir + '/*html')
        # The glob results are filesystem paths, not URLs; building a real
        # link would require stripping the path to 'static' first.
        run['link'] = ''  # url_for('side_by_side_view', html_path = first_output)
        if sel_hocr_dir_output:
            run_info.append(run)
        else:
            print("there's nothing in %s" % (sel_hocr_dir,))

    sorted_run_info = sorted(run_info, reverse=True, key=score_sort_key)
    return (path_to_runs + '/', sorted_run_info)
Exemple #3
0
    except Exception as e:
        print e
    print "done downloading. Extracting ..."
    try:
        tar = tarfile.open(name=tar_file_name, mode='r:gz')
    except Exception as e:
        print('ERROR file %s not a valid tar.gz file.' % image_tar_url)
        print e
    try:
        tar.extractall(path=get_image_dir_path())
    except Exception as e:
        print('ERROR couldn\'t put images in %s.' % get_image_dir_path())
        print e


# Script body: take one or more tar.gz paths from the command line and open
# each in turn. The per-archive processing continues past this excerpt.
textpath = get_absolute_textpath('')
# NOTE(review): db.create_all() runs before the arguments are validated;
# presumably it only creates missing tables -- confirm.
db.create_all()
page_count = 0
if len(sys.argv) < 2:
    sys.exit('Usage: %s tar-gzip-file' % sys.argv[0])
for file_in in sys.argv[1:]:
    # A missing file aborts the whole run; an unreadable archive is skipped.
    if not os.path.isfile(file_in):
        sys.exit('ERROR: tar file %s was not found!' % file_in)
    try:
        tar = tarfile.open(name=file_in, mode='r:gz')
    except Exception as e:
        # 'except Exception' instead of a bare 'except' so that Ctrl-C and
        # sys.exit() are not swallowed; the cause is printed for diagnosis.
        print('ERROR file %s not a valid tar.gz file.' % file_in)
        print(e)
        continue
    (route, file_name) = os.path.split(file_in)
    print(file_name)
    #(name_label, date, identifier, file_type, classifier,stuff)
#!/usr/bin/python
from lace import  db, Archivetext, Ocrrun, Outputpage, Hocrtype, POSSIBLE_HOCR_VIEWS
import glob, os, sys, tarfile
from lace import get_absolute_textpath, APP_ROOT
from populate_db import get_page_scores, collect_archive_text_info, get_runs, int_for_hocr_type_string, string_for_hocr_type_int
# Script body: look up one Archivetext by the archive number given on the
# command line and mark it, together with its dependent rows, for deletion.
DEBUG = True
textpath = get_absolute_textpath('')
db.create_all()
page_count = 0
# At least one argument is required; only sys.argv[1] is used below.
if len(sys.argv) < 2:
    sys.exit('Usage: %s archive_id' % sys.argv[0])
# First Archivetext row whose archive_number equals the argument, or None.
t = db.session.query(Archivetext).filter_by(archive_number=sys.argv[1]).first()
print t
if not t:
    print "there is no archive id " + sys.argv[1] + " in lace"
else:
    # Walk Archivetext -> Ocrrun -> Hocrtype -> Outputpage, following the
    # archivetext_id / ocrrun_id / hocrtype_id columns at each level.
    db.session.delete(t)
    print "deleting",t
    runs = db.session.query(Ocrrun).filter_by(archivetext_id = t.id)
    for run in runs:
        print "deleting", run
        print run
        db.session.delete(run)
        hocr_types = db.session.query(Hocrtype).filter_by(ocrrun_id = run.id)
        for hocr_type in hocr_types:
            print "deleting",hocr_type
            db.session.delete(hocr_type)
            pages = db.session.query(Outputpage).filter_by(hocrtype_id = hocr_type.id)
            print "and", pages.count(), "pages"
            # NOTE(review): only the print is visible in this excerpt; no
            # delete of the page and no session commit appear here -- confirm
            # both happen further down.
            for page in pages:
                print "deleting page", page
    except Exception as e:
        print e
    print "done downloading. Extracting ..."
    try:
        tar = tarfile.open(name=tar_file_name, mode="r:gz")
    except Exception as e:
        print ("ERROR file %s not a valid tar.gz file." % image_tar_url)
        print e
    try:
        tar.extractall(path=get_image_dir_path())
    except Exception as e:
        print ("ERROR couldn't put images in %s." % get_image_dir_path())
        print e


# Script body: take one or more tar.gz paths from the command line and open
# each in turn. The per-archive processing continues past this excerpt.
textpath = get_absolute_textpath("")
# NOTE(review): db.create_all() runs before the arguments are validated;
# presumably it only creates missing tables -- confirm.
db.create_all()
page_count = 0
if len(sys.argv) < 2:
    sys.exit("Usage: %s tar-gzip-file" % sys.argv[0])
for file_in in sys.argv[1:]:
    # A missing file aborts the whole run; an unreadable archive is skipped.
    if not os.path.isfile(file_in):
        sys.exit("ERROR: tar file %s was not found!" % file_in)
    try:
        tar = tarfile.open(name=file_in, mode="r:gz")
    except Exception as e:
        # 'except Exception' instead of a bare 'except' so that Ctrl-C and
        # sys.exit() are not swallowed; the cause is printed for diagnosis.
        print("ERROR file %s not a valid tar.gz file." % file_in)
        print(e)
        continue
    (route, file_name) = os.path.split(file_in)
    print(file_name)
    # (name_label, date, identifier, file_type, classifier,stuff)