def _run_sort_score(run):
    """Return ``run['score']`` as a float for sorting, or 0.0 if not numeric."""
    try:
        return float(run['score'])
    except (TypeError, ValueError):
        # An empty or malformed best_scores_sum.txt must not abort the
        # whole listing; rank such a run like one with no score file.
        return 0.0


def get_runs(text_id):
    """List the OCR runs found on disk for a text, best score first.

    The directory returned by ``get_absolute_textpath(text_id)`` is scanned
    for entries whose name contains ``'output'``. The first 16 characters of
    each entry name are used as the run's date stamp, and every distinct
    stamp is treated as one run. For each run, the first directory matching
    ``<stamp>*selected_hocr_output`` is inspected.

    Args:
        text_id: identifier passed straight to ``get_absolute_textpath``.

    Returns:
        A tuple ``(path_to_runs + '/', runs)``. ``runs`` is a list of dicts
        sorted by numeric score, highest first, each with the keys:

        * ``'date'``: the 16-character stamp;
        * ``'score'``: raw contents of ``best_scores_sum.txt`` in the
          selected directory, or ``0`` when that file cannot be opened;
        * ``'classifier'``: third element of ``parse_dir(selected_dir)``;
        * ``'link'``: always the empty string.

        Runs with no ``*selected_hocr_output`` directory, or whose selected
        directory contains no ``*html`` entries, are left out.
    """
    import os
    import glob

    path_to_runs = get_absolute_textpath(text_id)
    # Diagnostics use the single-argument print(...) form so the emitted
    # text is the same whether print is a statement or a function.
    print("path to runs %s" % (path_to_runs,))
    run_dirs = os.listdir(path_to_runs)
    print("run dirs %s" % (run_dirs,))
    run_dirs = [elem for elem in run_dirs if 'output' in elem]
    # Collapse the directory names to their unique date stamps.
    run_dates = sorted(set(elem[0:16] for elem in run_dirs))
    print(run_dates)

    run_info = []
    for run_date in run_dates:
        dir_for_glob = path_to_runs + '/' + run_date
        sel_hocr_dir_glob = dir_for_glob + '*selected_hocr_output'
        print(sel_hocr_dir_glob)
        matches = glob.glob(sel_hocr_dir_glob)
        if not matches:
            # Previously an IndexError here aborted the whole listing.
            print("no selected_hocr_output directory for %s" % (run_date,))
            continue
        sel_hocr_dir = matches[0]
        print("sel_hocr_dir %s" % (sel_hocr_dir,))

        run = {'date': run_date}
        try:
            # The context manager closes the handle even if read() fails.
            with open(sel_hocr_dir + '/best_scores_sum.txt') as score_file:
                run['score'] = score_file.read()
        except IOError:
            run['score'] = 0

        (_, _, classifier) = parse_dir(sel_hocr_dir)
        run['classifier'] = classifier

        # NOTE(review): the result of this call was never used; the call is
        # kept in case possible_output_dirs has side effects - confirm and
        # remove if it is pure.
        possible_output_dirs(dir_for_glob)

        # The link is intentionally left empty: the html entries found below
        # are filesystem paths, not URLs, so they cannot be linked as they are.
        run['link'] = ''

        sel_hocr_dir_output = glob.glob(sel_hocr_dir + '/*html')
        if sel_hocr_dir_output:
            run_info.append(run)
        else:
            print("there's nothing in %s" % (sel_hocr_dir,))

    sorted_run_info = sorted(run_info, reverse=True, key=_run_sort_score)
    return (path_to_runs + '/', sorted_run_info)
def _run_sort_score(run):
    """Return ``run['score']`` as a float for sorting, or 0.0 if not numeric."""
    try:
        return float(run['score'])
    except (TypeError, ValueError):
        # An empty or malformed best_scores_sum.txt must not abort the
        # whole listing; rank such a run like one with no score file.
        return 0.0


def get_runs(text_id):
    """List the OCR runs found on disk for a text, best score first.

    The directory returned by ``get_absolute_textpath(text_id)`` is scanned
    for entries whose name contains ``'output'``. The first 16 characters of
    each entry name are used as the run's date stamp, and every distinct
    stamp is treated as one run. For each run, the first directory matching
    ``<stamp>*selected_hocr_output`` is inspected.

    Args:
        text_id: identifier passed straight to ``get_absolute_textpath``.

    Returns:
        A tuple ``(path_to_runs + '/', runs)``. ``runs`` is a list of dicts
        sorted by numeric score, highest first, each with the keys:

        * ``'date'``: the 16-character stamp;
        * ``'score'``: raw contents of ``best_scores_sum.txt`` in the
          selected directory, or ``0`` when that file cannot be opened;
        * ``'classifier'``: third element of ``parse_dir(selected_dir)``;
        * ``'link'``: always the empty string.

        Runs with no ``*selected_hocr_output`` directory, or whose selected
        directory contains no ``*html`` entries, are left out.
    """
    import os
    import glob

    path_to_runs = get_absolute_textpath(text_id)
    # Diagnostics use the single-argument print(...) form so the emitted
    # text is the same whether print is a statement or a function.
    print("path to runs %s" % (path_to_runs,))
    run_dirs = os.listdir(path_to_runs)
    print("run dirs %s" % (run_dirs,))
    run_dirs = [elem for elem in run_dirs if 'output' in elem]
    # Collapse the directory names to their unique date stamps.
    run_dates = sorted(set(elem[0:16] for elem in run_dirs))
    print(run_dates)

    run_info = []
    for run_date in run_dates:
        dir_for_glob = path_to_runs + '/' + run_date
        sel_hocr_dir_glob = dir_for_glob + '*selected_hocr_output'
        print(sel_hocr_dir_glob)
        matches = glob.glob(sel_hocr_dir_glob)
        if not matches:
            # Previously an IndexError here aborted the whole listing.
            print("no selected_hocr_output directory for %s" % (run_date,))
            continue
        sel_hocr_dir = matches[0]
        print("sel_hocr_dir %s" % (sel_hocr_dir,))

        run = {'date': run_date}
        try:
            # The context manager closes the handle even if read() fails.
            with open(sel_hocr_dir + '/best_scores_sum.txt') as score_file:
                run['score'] = score_file.read()
        except IOError:
            run['score'] = 0

        (_, _, classifier) = parse_dir(sel_hocr_dir)
        run['classifier'] = classifier

        # NOTE(review): the result of this call was never used; the call is
        # kept in case possible_output_dirs has side effects - confirm and
        # remove if it is pure.
        possible_output_dirs(dir_for_glob)

        # The link is intentionally left empty: the html entries found below
        # are filesystem paths, not URLs, so they cannot be linked as they are.
        run['link'] = ''

        sel_hocr_dir_output = glob.glob(sel_hocr_dir + '/*html')
        if sel_hocr_dir_output:
            run_info.append(run)
        else:
            print("there's nothing in %s" % (sel_hocr_dir,))

    sorted_run_info = sorted(run_info, reverse=True, key=_run_sort_score)
    return (path_to_runs + '/', sorted_run_info)
except Exception as e: print e print "done downloading. Extracting ..." try: tar = tarfile.open(name=tar_file_name, mode='r:gz') except Exception as e: print('ERROR file %s not a valid tar.gz file.' % image_tar_url) print e try: tar.extractall(path=get_image_dir_path()) except Exception as e: print('ERROR couldn\'t put images in %s.' % get_image_dir_path()) print e textpath = get_absolute_textpath('') db.create_all() page_count = 0 if len(sys.argv) < 2: sys.exit('Usage: %s tar-gzip-file' % sys.argv[0]) for file_in in sys.argv[1:]: if not os.path.isfile(file_in): sys.exit('ERROR: tar file %s was not found!' % file_in) try: tar = tarfile.open(name=file_in, mode='r:gz') except: print('ERROR file %s not a valid tar.gz file.' % file_in) continue (route, file_name) = os.path.split(file_in) print file_name #(name_label, date, identifier, file_type, classifier,stuff)
#!/usr/bin/python from lace import db, Archivetext, Ocrrun, Outputpage, Hocrtype, POSSIBLE_HOCR_VIEWS import glob, os, sys, tarfile from lace import get_absolute_textpath, APP_ROOT from populate_db import get_page_scores, collect_archive_text_info, get_runs, int_for_hocr_type_string, string_for_hocr_type_int DEBUG = True textpath = get_absolute_textpath('') db.create_all() page_count = 0 if len(sys.argv) < 2: sys.exit('Usage: %s archive_id' % sys.argv[0]) t = db.session.query(Archivetext).filter_by(archive_number=sys.argv[1]).first() print t if not t: print "there is no archive id " + sys.argv[1] + " in lace" else: db.session.delete(t) print "deleting",t runs = db.session.query(Ocrrun).filter_by(archivetext_id = t.id) for run in runs: print "deleting", run print run db.session.delete(run) hocr_types = db.session.query(Hocrtype).filter_by(ocrrun_id = run.id) for hocr_type in hocr_types: print "deleting",hocr_type db.session.delete(hocr_type) pages = db.session.query(Outputpage).filter_by(hocrtype_id = hocr_type.id) print "and", pages.count(), "pages" for page in pages: print "deleting page", page
except Exception as e: print e print "done downloading. Extracting ..." try: tar = tarfile.open(name=tar_file_name, mode="r:gz") except Exception as e: print ("ERROR file %s not a valid tar.gz file." % image_tar_url) print e try: tar.extractall(path=get_image_dir_path()) except Exception as e: print ("ERROR couldn't put images in %s." % get_image_dir_path()) print e textpath = get_absolute_textpath("") db.create_all() page_count = 0 if len(sys.argv) < 2: sys.exit("Usage: %s tar-gzip-file" % sys.argv[0]) for file_in in sys.argv[1:]: if not os.path.isfile(file_in): sys.exit("ERROR: tar file %s was not found!" % file_in) try: tar = tarfile.open(name=file_in, mode="r:gz") except: print ("ERROR file %s not a valid tar.gz file." % file_in) continue (route, file_name) = os.path.split(file_in) print file_name # (name_label, date, identifier, file_type, classifier,stuff)