def term_group(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    if not request["q"]:
        dump = json.dumps({"original_query": "", "term_groups": []})
    else:
        hits = db.query(
            request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata
        )
        parsed = parse_query(request.q)
        group = group_terms(parsed)
        all_groups = split_terms(group)
        term_groups = []
        for g in all_groups:
            term_group = ""
            not_started = False
            for kind, term in g:
                if kind == "NOT":
                    if not_started is False:
                        not_started = True
                        term_group += " NOT "
                elif kind == "OR":
                    term_group += "|"
                elif kind == "TERM":
                    term_group += " %s " % term
                elif kind == "QUOTE":
                    term_group += " %s " % term
            term_group = term_group.strip()
            term_groups.append(term_group)
        dump = json.dumps({"term_groups": term_groups, "original_query": request.original_q})
    yield dump.encode("utf8")
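# Illustrative sketch (not part of the module): how the inner loop above assembles a
# display string from one group of (kind, term) tuples. The helper name and the sample
# group are made up for illustration; real groups come from parse_query/group_terms.
def _assemble_term_group(group):
    text = ""
    not_started = False
    for kind, term in group:
        if kind == "NOT":
            if not not_started:
                not_started = True
                text += " NOT "
        elif kind == "OR":
            text += "|"
        elif kind in ("TERM", "QUOTE"):
            text += " %s " % term
    return text.strip()

# _assemble_term_group([("TERM", "love"), ("OR", ""), ("TERM", "hate"), ("NOT", ""), ("TERM", "fear")])
# -> 'love | hate  NOT  fear'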
def bibliography_results(request, config):
    """Fetch bibliography results"""
    db = DB(config.db_path + "/data/")
    if request.no_metadata:
        hits = db.get_all(db.locals["default_object_level"], request["sort_order"])
    else:
        hits = db.query(sort_order=request["sort_order"], **request.metadata)
    # Request from the simple landing page report, which gets all biblio entries in load order
    if request.simple_bibliography == "all":
        hits.finish()
        start = 1
        end = len(hits)
        page_num = end
    else:
        start, end, page_num = page_interval(request.results_per_page, hits, request.start, request.end)
    bibliography_object = {
        "description": {"start": start, "end": end, "n": page_num, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals["default_object_level"],
    }
    results = []
    result_type = "doc"
    for hit in hits[start - 1 : end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals["metadata_fields"]:
            metadata_fields[metadata] = hit[metadata]
        result_type = hit.object_type
        if request.simple_bibliography == "all":
            citation = citations(hit, citation_hrefs, config, report="simple_landing")
        else:
            citation = citations(hit, citation_hrefs, config, report="bibliography", result_type=result_type)
        if config.dictionary_bibliography is False or result_type == "doc":
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "object_type": result_type,
                }
            )
        else:
            context = get_text_obj(hit, config, request, db.locals["token_regex"], images=False)
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "context": context,
                    "object_type": result_type,
                }
            )
    bibliography_object["results"] = results
    bibliography_object["results_length"] = len(hits)
    bibliography_object["query_done"] = hits.done
    bibliography_object["result_type"] = result_type
    return bibliography_object, hits
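# Illustrative sketch (not part of the module): the rough shape of the object returned by
# bibliography_results, with made-up values. Callers typically serialize it straight to JSON.
_example_bibliography_object = {
    "description": {"start": 1, "end": 25, "n": 1, "results_per_page": 25},
    "query": {"report": "bibliography"},
    "default_object": "doc",
    "results": [
        {
            "citation": "<hypothetical citation markup>",
            "citation_links": {"doc": "/navigate/1"},
            "philo_id": (1, 0, 0, 0, 0, 0, 0),
            "metadata_fields": {"author": "Example Author", "title": "Example Title"},
            "object_type": "doc",
        }
    ],
    "results_length": 1,
    "query_done": True,
    "result_type": "doc",
}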
def kwic_results(request, config):
    """Fetch KWIC results"""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    start, end, n = page_interval(request.results_per_page, hits, request.start, request.end)
    kwic_object = {
        "description": {"start": start, "end": end, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
    }
    kwic_object["results"] = []
    for hit in hits[start - 1 : end]:
        kwic_result = kwic_hit_object(hit, config, db)
        kwic_object["results"].append(kwic_result)
    kwic_object["results_length"] = len(hits)
    kwic_object["query_done"] = hits.done
    return kwic_object
def lookup_word_service(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    cursor = db.dbh.cursor()
    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        context_size = config['concordance_length'] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes
        hit_span = hit.bytes[-1] - hit.bytes[0]
        length = context_size + hit_span + context_size
        bytes, start_byte = adjust_bytes(bytes, length)
        end_byte = start_byte + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":
        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        start_byte, end_byte = int(text_obj.start_byte), int(text_obj.end_byte)
        filename = text_obj.filename
        # print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id, request.selected, start_byte, end_byte, filename
    else:
        pass
    # print >> sys.stderr, "TOKEN", token, "BYTES: ", start_byte, end_byte, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, start_byte, end_byte, filename).encode('utf8')
def landing_page_bibliography(request, config):
    db = DB(config.db_path + "/data/")
    object_level = request.object_level
    if object_level and object_level in ["doc", "div1", "div2", "div3"]:
        hits = db.get_all(object_level)
    else:
        hits = db.get_all(db.locals["default_object_level"])
    results = []
    c = db.dbh.cursor()
    for hit in hits:
        hit_object = {}
        for field in db.locals["metadata_fields"]:
            hit_object[field] = hit[field] or ""
        if object_level == "doc":
            hit_object["philo_id"] = hit.philo_id[0]
        else:
            hit_object["philo_id"] = "/".join([str(i) for i in hit.philo_id])
        doc_id = str(hit.philo_id[0]) + " 0 0 0 0 0 0"
        next_doc_id = str(hit.philo_id[0] + 1) + " 0 0 0 0 0 0"
        c.execute('select rowid from toms where philo_id="%s"' % doc_id)
        doc_row = c.fetchone()["rowid"]
        c.execute('select rowid from toms where philo_id="%s"' % next_doc_id)
        try:
            next_doc_row = c.fetchone()["rowid"]
        except TypeError:  # if this is the last doc, just get the last rowid in the table.
            c.execute("select max(rowid) from toms;")
            next_doc_row = c.fetchone()[0]
        try:
            c.execute(
                'select * from toms where rowid between %d and %d and head is not null and head !="" limit 1'
                % (doc_row, next_doc_row)
            )
        except sqlite3.OperationalError:  # no type field in DB
            c.execute(
                'select * from toms where rowid between ? and ? and head is not null and head !="" limit 1',
                (doc_row, next_doc_row),
            )
        try:
            start_head = c.fetchone()["head"].decode("utf-8")
            start_head = start_head.lower().title().encode("utf-8")
        except Exception as e:
            print(repr(e), file=sys.stderr)
            start_head = ""
        try:
            c.execute(
                'select head from toms where rowid between %d and %d and head is not null and head !="" order by rowid desc limit 1'
                % (doc_row, next_doc_row)
            )
        except sqlite3.OperationalError:  # no type field in DB
            c.execute(
                'select head from toms where rowid between %d and %d and head is not null and head !="" order by rowid desc limit 1'
                % (doc_row, next_doc_row)
            )
        try:
            end_head = c.fetchone()["head"]
            end_head = end_head.decode("utf-8").lower().title().encode("utf-8")
        except:
            end_head = ""
        hit_object["start_head"] = start_head
        hit_object["end_head"] = end_head
        results.append(hit_object)
    return results
def get_start_end_date(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "text/html; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    start_date, end_date = start_end_date(db, config, start_date=request.start_date, end_date=request.end_date)
    request.metadata["year"] = "{}-{}".format(start_date, end_date)
    request["start_date"] = ""
    request["end_date"] = ""
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    total_results = len(hits)
    yield json.dumps({"start_date": start_date, "end_date": end_date, "total_results": total_results}).encode("utf8")
def term_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0]).encode("utf8")
def generate_word_frequency(request, config):
    """Reads through a hitlist, looks up request["field"] in each hit, and builds up a list
    of unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start:]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break
        table = {}
        for k, v in counts.items():
            url = make_absolute_query_link(
                config,
                request,
                start="0",
                end="0",
                report="word_property_filter",
                word_property=field,
                word_property_value=k,
            )
            table[k] = {"count": v, "url": url}
        frequency_object["results"] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])
    return frequency_object
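# Illustrative sketch (not part of the module): the time-bounded counting pattern used above,
# reduced to the standard library. The 5-second budget mirrors the hard-coded cutoff in
# generate_word_frequency; the helper name and input list are made up.
import timeit
from collections import Counter

def _count_with_budget(values, start=0, budget_seconds=5):
    counts = Counter()
    done = start
    start_time = timeit.default_timer()
    for value in values[start:]:
        counts[value or "NULL"] += 1
        done += 1
        if timeit.default_timer() - start_time > budget_seconds:
            break  # caller re-issues the request with start=done to resume
    return counts, done

# counts, done = _count_with_budget(["noun", "verb", "", "noun"])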
def get_more_context(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    context_size = config['concordance_length'] * 3
    hit_context = get_concordance_text(db, hits[hit_num], config.db_path, context_size)
    yield json.dumps(hit_context).encode('utf8')
def concordance_results(request, config):
    """Fetch concordance results."""
    db = DB(config.db_path + "/data/")
    if request.collocation_type:
        first_hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        second_hits = db.query(request["left"], request["method"], request["arg"], **request.metadata)
        hits = CombinedHitlist(first_hits, second_hits)
    else:
        hits = db.query(
            request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata
        )
    start, end, page_num = page_interval(request["results_per_page"], hits, request.start, request.end)
    concordance_object = {
        "description": {"start": start, "end": end, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals["default_object_level"],
    }
    formatting_regexes = []
    if config.concordance_formatting_regex:
        for pattern, replacement in config.concordance_formatting_regex:
            compiled_regex = re.compile(r"%s" % pattern)
            formatting_regexes.append((compiled_regex, replacement))
    results = []
    for hit in hits[start - 1 : end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals["metadata_fields"]:
            metadata_fields[metadata] = hit[metadata]
        citation = citations(hit, citation_hrefs, config, report="concordance")
        context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
        if formatting_regexes:
            for formatting_regex, replacement in formatting_regexes:
                context = formatting_regex.sub(r"%s" % replacement, context)
        result_obj = {
            "philo_id": hit.philo_id,
            "citation": citation,
            "citation_links": citation_hrefs,
            "context": context,
            "metadata_fields": metadata_fields,
            "bytes": hit.bytes,
        }
        results.append(result_obj)
    concordance_object["results"] = results
    concordance_object["results_length"] = len(hits)
    concordance_object["query_done"] = hits.done
    return concordance_object
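# Illustrative sketch (not part of the module): how concordance_formatting_regex pairs are
# pre-compiled once and then applied to every context string, as in the loop above. The
# pattern/replacement pair below is invented for illustration.
import re

_example_formatting_regexes = [(re.compile(pattern), replacement) for pattern, replacement in [(r"\s+", " ")]]

def _format_context(context):
    for compiled, replacement in _example_formatting_regexes:
        context = compiled.sub(replacement, context)
    return context

# _format_context("a   concordance\tcontext")  ->  'a concordance context'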
def get_all_page_images(philo_id, config, current_obj_imgs):
    """Get all page images"""
    if current_obj_imgs[0]:  # We know there are images
        db = DB(config.db_path + "/data/")
        cursor = db.dbh.cursor()
        approx_id = str(philo_id[0]) + " 0 0 0 0 0 0 %"
        try:
            cursor.execute(
                'select * from pages where philo_id like ? and facs is not null and facs != ""', (approx_id,)
            )
            current_obj_imgs = set(current_obj_imgs)
            all_imgs = [tuple(i["facs"].split()) for i in cursor]
        except sqlite3.OperationalError:
            all_imgs = []
        if not all_imgs:
            try:
                cursor.execute(
                    'select * from pages where philo_id like ? and id is not null and id != ""', (approx_id,)
                )
                current_obj_imgs = set(current_obj_imgs)
                all_imgs = [tuple(i["id"].split()) for i in cursor]
            except sqlite3.OperationalError:
                return []
        return all_imgs
    else:
        return []
def group_by_metadata(request, config):
    citation_types = json.loads(request.citation)
    db = DB(config.db_path + "/data/")
    cursor = db.dbh.cursor()
    query = """select * from toms where philo_type="doc" and %s=?""" % request.group_by_field
    cursor.execute(query, (request.query,))
    result_group = []
    for doc in cursor:
        obj = db[doc["philo_id"]]
        links = citation_links(db, config, obj)
        citation = citations(obj, links, config, report="landing_page", citation_type=citation_types)
        result_group.append({"metadata": get_all_metadata(db, doc), "citation": citation})
    return json.dumps(
        {
            "display_count": request.display_count,
            "content_type": request.group_by_field,
            "content": [{"prefix": request.query, "results": result_group}],
        }
    )
def get_notes(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    text_object = generate_text_object(request, config, note=True)
    yield json.dumps(text_object).encode('utf8')
def alignment_to_text(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    link = byte_range_to_link(db, config, request)
    yield dumps({"link": link}).encode("utf8")
def retrieve_similar_docs(philo_db: str, philo_id: str, num: int = 20):
    philo_id = philo_id.strip()
    try:
        annoy_id = int(PHILO_ID_TO_ANNOY[philo_db][philo_id])
    except KeyError:
        db = DB(f"{PHILO_PATHS[philo_db]}/data")
        hit = db[philo_id]
        text = get_text(hit, hit.start_byte, hit.end_byte - hit.start_byte, PHILO_PATHS[philo_db])
        return submit_passage(text.decode("utf8"), num=num)
    newsims = INDEX.get_nns_by_item(annoy_id, num + 1, include_distances=True)
    results = process_annoy_results(newsims)
    return results[1:]
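# Illustrative sketch (not part of the module), assuming the `annoy` package is installed:
# how an index like INDEX above is built and queried. The vector size, metric, and vectors
# are made up; the first neighbour of an item is the item itself, which is why
# retrieve_similar_docs drops the first result.
from annoy import AnnoyIndex

def _build_and_query_demo():
    index = AnnoyIndex(4, "angular")      # 4-dimensional vectors, angular distance
    index.add_item(0, [1.0, 0.0, 0.0, 0.0])
    index.add_item(1, [0.9, 0.1, 0.0, 0.0])
    index.add_item(2, [0.0, 0.0, 1.0, 0.0])
    index.build(10)                       # 10 trees
    return index.get_nns_by_item(0, 3, include_distances=True)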
def get_total_results(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
        else:
            hits = db.query(sort_order=request["sort_order"], **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    total_results = len(hits)
    yield json.dumps(total_results).encode('utf8')
def metadata_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    metadata = request.term
    field = request.field
    yield autocomplete_metadata(metadata, field, db).encode("utf8")
def get_all_graphics(philo_id, config):
    db = DB(config.db_path + "/data/")
    cursor = db.dbh.cursor()
    approx_id = str(philo_id[0]) + " 0 0 0 0 0 0 %"
    try:
        cursor.execute(
            'SELECT facs FROM graphics WHERE philo_id LIKE ? AND facs IS NOT NULL AND facs != "" ORDER BY ROWID',
            (approx_id,),
        )
        graphics = [i["facs"].split() for i in cursor if i["facs"]]
        return graphics
    except sqlite3.OperationalError:
        return []
def time_series_tester(config):
    db = DB(config.db_path + "/data/")
    c = db.dbh.cursor()
    try:
        c.execute("SELECT COUNT(*) FROM toms WHERE %s IS NOT NULL" % config.time_series_year_field)
        count = c.fetchone()[0]
        if count > 0:
            return True
        else:
            return False
    except sqlite3.OperationalError:
        return False
def term_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    term = request.term
    if isinstance(term, list):
        term = term[-1]
    all_words = format_query(term, db, config)[:100]
    yield json.dumps(all_words).encode("utf8")
def get_text_object(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    path = config.db_path
    # Pad the philo_id out to the full 7 components expected by the database.
    zeros = 7 - len(request.philo_id.split())
    if zeros:
        request.philo_id += zeros * " 0"
    obj = ObjectWrapper(request["philo_id"].split(), db)
    text_object = generate_text_object(request, config)
    yield json.dumps(text_object).encode("utf8")
def login_access(environ, request, config, headers):
    db = DB(config.db_path + "/data/")
    if request.authenticated:
        access = True
    else:
        if request.username and request.password:
            access = check_login_info(config, request)
            if access:
                incoming_address = environ["REMOTE_ADDR"]
                token = make_token(incoming_address, db)
                if token:
                    h, ts = token
                    headers.append(("Set-Cookie", "hash=%s" % h))
                    headers.append(("Set-Cookie", "timestamp=%s" % ts))
        else:
            access = False
    return access, headers
def get_sorted_kwic(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    input_object = json.loads(environ['wsgi.input'].read().decode('utf8', 'ignore'))
    all_results = input_object['results']
    query_string = input_object['query_string']
    sort_keys = [i for i in input_object["sort_keys"] if i]
    environ['QUERY_STRING'] = query_string
    request = WSGIHandler(environ, config)
    sorted_hits = get_sorted_hits(
        all_results, sort_keys, request, config, db, input_object['start'], input_object['end']
    )
    yield json.dumps(sorted_hits).encode('utf8')
def get_first_page(philo_id, config):
    """This function will fetch the first page of any given text object
    in case there's no <pb> starting the object"""
    db = DB(config.db_path + "/data/")
    c = db.dbh.cursor()
    if len(philo_id) < 9:
        c.execute("select start_byte, end_byte from toms where philo_id=?", (" ".join([str(i) for i in philo_id]),))
        result = c.fetchone()
        start_byte = result["start_byte"]
        approx_id = str(philo_id[0]) + " 0 0 0 0 0 0 %"
        try:
            c.execute("select * from pages where philo_id like ? and end_byte >= ? limit 1", (approx_id, start_byte))
        except:
            return {"filename": "", "start_byte": ""}
    else:
        c.execute("select * from pages where philo_id like ? limit 1", (" ".join([str(i) for i in philo_id]),))
    page_result = c.fetchone()
    try:
        filename = page_result["facs"]
    except (IndexError, TypeError):
        filename = ""
    if not filename:
        try:
            filename = page_result["id"] or ""
        except (IndexError, TypeError):
            pass
    try:
        n = page_result["n"] or ""
        page = {
            "filename": filename.split(),
            "n": n,
            "start_byte": page_result["start_byte"],
            "end_byte": page_result["end_byte"],
        }
        return page
    except:
        # Let's play it safe
        return {"filename": "", "start_byte": ""}
def main(object_level, db_path):
    metadata_fields = {}
    doc_filenames = {}
    database = DB(os.path.join(db_path, "data"))
    cursor = database.dbh.cursor()
    cursor.execute("SELECT philo_id, filename FROM toms WHERE philo_type='doc'")
    for philo_id, filename in cursor:
        doc_id = philo_id.split()[0]
        doc_filenames[doc_id] = filename
    cursor.execute("SELECT * FROM toms WHERE philo_type=?", (object_level,))
    for result in cursor:
        fields = result
        philo_id = "_".join(fields["philo_id"].split()[:object_levels[object_level]])
        metadata_fields[philo_id] = {}
        for field in database.locals["metadata_fields"]:
            metadata_fields[philo_id][field] = result[field] or ""
        doc_id = result["philo_id"].split()[0]
        metadata_fields[philo_id]["filename"] = doc_filenames[doc_id]
    with open("metadata.json", "w") as metadata_file:
        json.dump(metadata_fields, metadata_file)
def process_annoy_results(newsims) -> List[Dict[str, Union[str, Dict[str, str]]]]:
    simscores = list(newsims[1])
    matchdocs = list(newsims[0])
    results: List[Dict[str, Union[str, Dict[str, str]]]] = []
    db_cache = {philo_db: DB(f'{config["path"]}/data') for philo_db, config in APP_CONFIG["philoDBs"].items()}
    for doc, score in zip(matchdocs, simscores):
        doc_id = ANNOY_TO_PHILO_ID[str(doc)]
        hit = db_cache[doc_id["philo_db"]][doc_id["philo_id"]]
        results.append(
            {
                "philo_db": doc_id["philo_db"],
                "metadata": {
                    "author": hit.author,
                    "title": hit.title,
                    "date": hit.year,
                    "head": hit.head,
                    "philo_id": doc_id["philo_id"],
                },
                "score": score,
            }
        )
    return results
def get_tei_header(request, config):
    path = config.db_path
    db = DB(path + "/data")
    obj = ObjectWrapper(request["philo_id"].split(), db)
    filename = path + "/data/TEXT/" + obj.filename
    parser = etree.XMLParser(remove_blank_text=True, recover=True)
    xml_tree = etree.parse(filename, parser)
    header = xml_tree.find("teiHeader")
    try:
        header_text = etree.tostring(header, pretty_print=True).decode("utf8")
    except TypeError:
        # Workaround for when lxml doesn't find the header for whatever reason:
        # fall back to slicing the header out of the raw file with a regex.
        header_text = ""
        with open(filename, encoding="utf8") as file:
            file_content = file.read()
        try:
            start_header_index = re.search(r"<teiheader", file_content, re.I).start()
            end_header_index = re.search(r"</teiheader>", file_content, re.I).end()
        except AttributeError:  # tag not found
            return ""
        header_text = file_content[start_header_index:end_header_index]
    # Escape angle brackets so the header displays as text rather than being parsed as markup.
    return header_text.replace("<", "&lt;").replace(">", "&gt;")
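# Illustrative sketch (not part of the module): the regex fallback above, applied to an
# in-memory string instead of a file. The helper name and sample document are made up.
import re

def _extract_tei_header(file_content):
    try:
        start = re.search(r"<teiheader", file_content, re.I).start()
        end = re.search(r"</teiheader>", file_content, re.I).end()
    except AttributeError:  # tag not found
        return ""
    return file_content[start:end]

# _extract_tei_header("<TEI><teiHeader><fileDesc/></teiHeader><text/></TEI>")
# -> '<teiHeader><fileDesc/></teiHeader>'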
def main(db_path):
    """Grab words from words table and dump to file"""
    philo_db = DB(db_path)
    words_and_ids_path = os.path.join(db_path, "words_and_philo_ids")
    status = os.system("mkdir -p %s" % words_and_ids_path)
    if status != 0:
        print("Could not create %s. Please check your write permissions to the parent directory" % words_and_ids_path)
        sys.exit(status)
    cursor = philo_db.dbh.cursor()
    cursor.execute('SELECT philo_name, philo_id, start_byte, end_byte from words')
    current_doc_id = "1"
    current_words = []
    for word, philo_id, start_byte, end_byte in cursor:
        doc_id = philo_id.split()[0]
        word_obj = {"token": word, "position": philo_id, "start_byte": start_byte, "end_byte": end_byte}
        if doc_id != current_doc_id:
            with open(os.path.join(words_and_ids_path, current_doc_id), "w") as output:
                output.write("\n".join(current_words))
            print("Processed document %s" % current_doc_id, flush=True)
            current_words = []
            current_doc_id = doc_id
        current_words.append(json.dumps(word_obj))
    if current_words:
        with open(os.path.join(words_and_ids_path, current_doc_id), "w") as output:
            output.write("\n".join(current_words))
        print("Processed document %s" % current_doc_id, flush=True)
def collocation_results(request, config):
    """Fetch collocation results"""
    db = DB(config.db_path + "/data/")
    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request["collocate_distance"]), **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"], **request.metadata)
    hits.finish()
    collocation_object = {"query": dict([i for i in request])}
    try:
        collocate_distance = int(request["collocate_distance"])
    except ValueError:  # getting an empty string since the keyword is not specified in the URL
        collocate_distance = None
    if request.colloc_filter_choice == "nofilter":
        filter_list = []
    else:
        filter_list = build_filter_list(request, config)
    collocation_object["filter_list"] = filter_list
    filter_list = set(filter_list)

    # Build list of search terms to filter out
    query_words = []
    for group in get_expanded_query(hits):
        for word in group:
            word = word.replace('"', "")
            query_words.append(word)
    query_words = set(query_words)
    filter_list = filter_list.union(query_words)

    if request["collocate_distance"]:
        hits = db.query(
            request["q"], "proxy", int(request["collocate_distance"]), raw_results=True, **request.metadata
        )
    else:
        hits = db.query(request["q"], "proxy", request["arg"], raw_results=True, **request.metadata)
    hits.finish()

    stored_sentence_id = None
    stored_sentence_counts = defaultdict(int)
    sentence_hit_count = 1
    hits_done = request.start or 0
    max_time = request.max_time or 10
    all_collocates = defaultdict(lambda: {"count": 0})
    cursor = db.dbh.cursor()
    start_time = timeit.default_timer()
    try:
        for hit in hits[hits_done:]:
            word_id = " ".join([str(i) for i in hit[:6]]) + " " + str(hit[7])
            query = """select parent, rowid from words where philo_id='%s' limit 1""" % word_id
            cursor.execute(query)
            result = cursor.fetchone()
            parent = result["parent"]
            if parent != stored_sentence_id:
                rowid = int(result["rowid"])
                sentence_hit_count = 1
                stored_sentence_id = parent
                stored_sentence_counts = defaultdict(int)
                if collocate_distance:
                    begin_rowid = rowid - collocate_distance
                    if begin_rowid < 0:
                        begin_rowid = 0
                    end_rowid = rowid + collocate_distance
                    row_query = """select philo_name from words where parent='%s' and rowid between %d and %d""" % (
                        parent,
                        begin_rowid,
                        end_rowid,
                    )
                else:
                    row_query = """select philo_name from words where parent='%s'""" % (parent,)
                cursor.execute(row_query)
                for i in cursor:
                    collocate = i["philo_name"]
                    if collocate not in filter_list:
                        stored_sentence_counts[collocate] += 1
            else:
                sentence_hit_count += 1
            for word in stored_sentence_counts:
                if stored_sentence_counts[word] < sentence_hit_count:
                    continue
                all_collocates[word]["count"] += 1
            hits_done += 1
            elapsed = timeit.default_timer() - start_time
            # avoid timeouts by splitting the query if more than request.max_time
            # (in seconds) has been spent in the loop
            if elapsed > int(max_time):
                break
    except IndexError:
        collocation_object["hits_done"] = len(hits)
    collocation_object["collocates"] = all_collocates
    collocation_object["results_length"] = len(hits)
    if hits_done < collocation_object["results_length"]:
        collocation_object["more_results"] = True
        collocation_object["hits_done"] = hits_done
    else:
        collocation_object["more_results"] = False
        collocation_object["hits_done"] = collocation_object["results_length"]
    return collocation_object
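# Illustrative sketch (not part of the module): the windowed collocate counting that the loop
# above performs with SQL rowids, reduced to a plain token list. The helper name, window size,
# and sample sentence are made up.
from collections import defaultdict

def _count_collocates(tokens, query_word, distance=3, filter_list=frozenset()):
    collocates = defaultdict(int)
    for position, token in enumerate(tokens):
        if token != query_word:
            continue
        window = tokens[max(position - distance, 0): position + distance + 1]
        for collocate in window:
            if collocate != query_word and collocate not in filter_list:
                collocates[collocate] += 1
    return collocates

# _count_collocates("the cat sat on the mat near the cat".split(), "cat", distance=2)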
def group_by_range(request_range, request, config):
    db = DB(config.db_path + "/data/")
    metadata_queried = request.group_by_field
    citation_types = json.loads(request.citation)
    is_date = False
    try:
        int(request_range[0])
        int(request_range[1])
        is_date = True
    except ValueError:
        pass
    cursor = db.dbh.cursor()
    if is_date:
        content_type = "date"
        query_range = set(range(int(request_range[0]), int(request_range[1])))
        cursor.execute('select * from toms where philo_type="doc"')
    else:
        content_type = metadata_queried
        query_range = set(range(ord(request_range[0]), ord(request_range[1]) + 1))  # Ordinal avoids unicode issues...
        try:
            cursor.execute(
                'select *, count(*) as count from toms where philo_type="doc" group by %s' % metadata_queried
            )
        except sqlite3.OperationalError:
            return json.dumps({"display_count": request.display_count, "content_type": content_type, "content": []})
    content = {}
    date_count = defaultdict(int)
    for doc in cursor:
        normalized_test_value = ""
        if doc[metadata_queried] is None:
            continue
        if is_date:
            try:
                initial = int(doc[metadata_queried])
                test_value = initial
                date_count[initial] += 1
            except:
                continue
        else:
            try:
                initial_letter = doc[metadata_queried][0].lower()
            except IndexError:  # we have an empty string
                continue
            try:
                test_value = ord(initial_letter)
                normalized_test_value = ord(
                    "".join([i for i in unicodedata.normalize("NFKD", initial_letter) if not unicodedata.combining(i)])
                )
            except TypeError:
                continue
            initial = initial_letter.upper()
        # Are we within the range?
        if test_value in query_range or normalized_test_value in query_range:
            if normalized_test_value in query_range:
                initial = "".join(
                    [i for i in unicodedata.normalize("NFKD", initial_letter) if not unicodedata.combining(i)]
                ).upper()
            obj = db[doc["philo_id"]]
            links = citation_links(db, config, obj)
            citation = citations(obj, links, config, report="landing_page", citation_type=citation_types)
            if initial not in content:
                content[initial] = []
            if is_date:
                try:
                    normalized_field = unaccent.smash_accents(doc["title"]).lower()
                except:
                    normalized_field = None
                content[initial].append(
                    {
                        "metadata": get_all_metadata(db, doc),
                        "citation": citation,
                        "count": date_count[initial],
                        "normalized": normalized_field,
                    }
                )
            else:
                content[initial].append(
                    {
                        "metadata": get_all_metadata(db, doc),
                        "citation": citation,
                        "count": doc["count"],
                        "normalized": unaccent.smash_accents(doc[metadata_queried]).lower(),
                    }
                )
    results = []
    for prefix, result_set in sorted(content.items(), key=itemgetter(0)):
        results.append({"prefix": prefix, "results": sorted(result_set, key=lambda x: x["normalized"])})
    return json.dumps({"display_count": request.display_count, "content_type": content_type, "content": results})
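# Illustrative sketch (not part of the module): the accent-insensitive initial-letter
# normalization used above, using only unicodedata. The helper name and sample titles are
# made up; the real code compares ordinals against a precomputed query_range.
import unicodedata

def _initial_letter(value):
    first = value[0].lower()
    stripped = "".join(c for c in unicodedata.normalize("NFKD", first) if not unicodedata.combining(c))
    return (stripped or first).upper()

# _initial_letter("Émile")  -> 'E'
# _initial_letter("Zadig")  -> 'Z'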
def check_access(environ, config):
    db = DB(config.db_path + "/data/")
    incoming_address, match_domain = get_client_info(environ)
    if config.access_file:
        if os.path.isabs(config.access_file):
            access_file = config.access_file
        else:
            access_file = os.path.join(config.db_path, "data", config.access_file)
        if not os.path.isfile(access_file):
            print(
                f"ACCESS FILE DOES NOT EXIST. UNAUTHORIZED ACCESS TO: {incoming_address} from domain {match_domain}",
                file=sys.stderr,
            )
            return ()
    else:
        print("UNAUTHORIZED ACCESS TO: %s from domain %s" % (incoming_address, match_domain), file=sys.stderr)
        return ()
    # Load access config file. If loading fails, don't grant access.
    try:
        access_config = load_module("access_config", access_file)
    except Exception as e:
        print("ACCESS ERROR", repr(e), file=sys.stderr)
        print("UNAUTHORIZED ACCESS TO: %s from domain %s" % (incoming_address, match_domain), file=sys.stderr)
        return ()
    # Let's first check if the IP is local and grant access if it is.
    for ip_range in ip_ranges:
        if ip_range.search(incoming_address):
            return make_token(incoming_address, db)
    try:
        domain_list = set(access_config.domain_list)
    except:
        domain_list = []
    try:
        allowed_ips = set([])
        for ip in access_config.allowed_ips:
            split_numbers = ip.split(".")
            if len(split_numbers) == 4:
                if re.search(r"\d+-\d+", split_numbers[3]):
                    for last_num in range(
                        int(split_numbers[3].split("-")[0]), int(split_numbers[3].split("-")[1]) + 1
                    ):
                        allowed_ips.add(".".join(split_numbers[:3]) + "." + str(last_num))
                elif re.search(r"\d+-\Z", split_numbers[3]):  # open-ended range such as "10-"
                    for last_num in range(int(split_numbers[3].split("-")[0]), 255):
                        allowed_ips.add(".".join(split_numbers[:3]) + "." + str(last_num))
                else:
                    allowed_ips.add(ip)
            else:
                allowed_ips.add(ip)
    except Exception as e:
        print(repr(e), file=sys.stderr)
        allowed_ips = []
    try:
        blocked_ips = set(access_config.blocked_ips)
    except:
        blocked_ips = []
    if incoming_address not in blocked_ips:
        if match_domain in domain_list:
            return make_token(incoming_address, db)
        else:
            for domain in domain_list:
                if domain in match_domain:
                    return make_token(incoming_address, db)
        for ip_range in allowed_ips:
            if re.search(r"^%s.*" % ip_range, incoming_address):
                print("PASS", file=sys.stderr)
                return make_token(incoming_address, db)
    # If no token returned, we block access.
    print("UNAUTHORIZED ACCESS TO: %s from domain %s" % (incoming_address, match_domain), file=sys.stderr)
    return ()
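# Illustrative sketch (not part of the module): how an allowed_ips entry with a range in its
# last octet (e.g. "10.0.0.5-8") expands into individual addresses, mirroring the loop above.
# The helper name and addresses are made up.
import re

def _expand_ip_spec(ip):
    parts = ip.split(".")
    if len(parts) == 4 and re.search(r"\d+-\d+", parts[3]):
        low, high = (int(n) for n in parts[3].split("-"))
        return {".".join(parts[:3]) + "." + str(last) for last in range(low, high + 1)}
    return {ip}

# _expand_ip_spec("10.0.0.5-8") -> {'10.0.0.5', '10.0.0.6', '10.0.0.7', '10.0.0.8'}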
def get_neighboring_words(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    try:
        index = int(request.hits_done)
    except:
        index = 0
    max_time = int(request.max_time)
    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    cursor = db.dbh.cursor()
    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        query = 'select rowid, philo_name, parent from words where philo_id="%s" limit 1' % word_id
        cursor.execute(query)
        results = cursor.fetchone()
        highlighted_text = kwic_hit_object(hit, config, db)["highlighted_text"]
        highlighted_text = highlighted_text.translate(remove_punctuation_map)
        highlighted_text = highlighted_text.strip()
        result_obj = {"left": "", "right": "", "index": index, "q": highlighted_text}
        left_rowid = results["rowid"] - 10
        right_rowid = results["rowid"] + 10
        cursor.execute(
            'select philo_name, philo_id from words where rowid between ? and ?', (left_rowid, results["rowid"] - 1)
        )
        result_obj["left"] = []
        for i in cursor:
            result_obj["left"].append(i['philo_name'])
        result_obj["left"].reverse()
        result_obj["left"] = ' '.join(result_obj["left"])
        cursor.execute(
            'select philo_name, philo_id from words where rowid between ? and ?', (results["rowid"] + 1, right_rowid)
        )
        result_obj["right"] = []
        for i in cursor:
            result_obj["right"].append(i['philo_name'])
        result_obj["right"] = ' '.join(result_obj["right"])
        for metadata in config.kwic_metadata_sorting_fields:
            result_obj[metadata] = hit[metadata].lower()
        kwic_words.append(result_obj)
        index += 1
        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:
            # avoid timeouts by splitting the query if more than max_time seconds have been spent in the loop
            break
    yield json.dumps({"results": kwic_words, "hits_done": index}).encode('utf8')
def generate_time_series(request, config):
    db = DB(config.db_path + "/data/")
    time_series_object = {"query": dict([i for i in request]), "query_done": False}
    # Invalid date range
    if request.start_date == "invalid" or request.end_date == "invalid":
        time_series_object["results_length"] = 0
        time_series_object["more_results"] = False
        time_series_object["new_start_date"] = 0
        time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
        return time_series_object
    start_date, end_date = get_start_end_date(
        db, config, start_date=request.start_date or None, end_date=request.end_date or None
    )
    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure the last date gets included in the loop below by adding one to the last step
    for start in range(start_date, end_date + 1, interval):
        end = start + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))
    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}
        # Get date total count
        if interval != 1:
            end_range = start_range + (int(request["year_interval"]) - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                config.time_series_year_field,
                start_range,
                end_range,
            )
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)
        cursor = db.dbh.cursor()
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break
    time_series_object["results_length"] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object["more_results"] = False
    else:
        time_series_object["more_results"] = True
        time_series_object["new_start_date"] = last_date_done + int(request.year_interval)
    time_series_object["results"] = {"absolute_count": absolute_count, "date_count": date_counts}
    return time_series_object
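# Illustrative sketch (not part of the module): the date-range construction used above, with
# made-up bounds. Each tuple pairs the range's start year with the label queried against the
# year metadata field.
def _build_date_ranges(start_date, end_date, interval):
    date_ranges = []
    for start in range(start_date, end_date + 1, interval):
        end = min(start + interval - 1, end_date)
        date_ranges.append((start, "%d-%d" % (start, end)))
    return date_ranges

# _build_date_ranges(1700, 1725, 10)
# -> [(1700, '1700-1709'), (1710, '1710-1719'), (1720, '1720-1725')]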
def filter_words_by_property(request, config):
    """Filter words by property"""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    concordance_object = {"query": dict([i for i in request])}
    # Do these need to be captured in wsgi_handler?
    word_property = request["word_property"]
    word_property_value = request["word_property_value"]
    word_property_total = request["word_property_total"]
    new_hitlist = []
    results = []
    position = 0
    more_pages = False
    if request.start == 0:
        start = 1
    else:
        start = request.start
    for hit in hits:
        # get my chunk of text
        hit_val = get_word_attrib(hit, word_property, db)
        if hit_val == word_property_value:
            position += 1
            if position < start:
                continue
            new_hitlist.append(hit)
            citation_hrefs = citation_links(db, config, hit)
            metadata_fields = {}
            for metadata in db.locals["metadata_fields"]:
                metadata_fields[metadata] = hit[metadata]
            citation = citations(hit, citation_hrefs, config)
            context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
            result_obj = {
                "philo_id": hit.philo_id,
                "citation": citation,
                "citation_links": citation_hrefs,
                "context": context,
                "metadata_fields": metadata_fields,
                "bytes": hit.bytes,
                "collocate_count": 1,
            }
            results.append(result_obj)
        if len(new_hitlist) == (request.results_per_page):
            more_pages = True
            break
    end = start + len(results) - 1
    if len(results) < request.results_per_page:
        word_property_total = end
    else:
        word_property_total = end + 1
    concordance_object["results"] = results
    concordance_object["query_done"] = hits.done
    concordance_object["results_length"] = word_property_total
    concordance_object["description"] = {
        "start": start,
        "end": end,
        "results_per_page": request.results_per_page,
        "more_pages": more_pages,
    }
    return concordance_object