def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if not request["q"]:
        dump = json.dumps({"original_query": "", "term_groups": []})
    else:
        hits = db.query(request["q"], request["method"], request["arg"],
                        sort_order=request["sort_order"], **request.metadata)
        parsed = parse_query(request.q)
        group = group_terms(parsed)
        all_groups = split_terms(group)
        term_groups = []
        for g in all_groups:
            term_group = ''
            not_started = False
            for kind, term in g:
                if kind == 'NOT':
                    if not_started is False:
                        not_started = True
                        term_group += ' NOT '
                elif kind == 'OR':
                    term_group += '|'
                elif kind == "TERM":
                    term_group += ' %s ' % term
                elif kind == "QUOTE":
                    term_group += ' %s ' % term
            term_group = term_group.strip()
            term_groups.append(term_group)
        dump = json.dumps({"term_groups": term_groups, "original_query": request.original_q})
    yield dump.encode('utf8')
def kwic_results(request, config):
    """Fetch KWIC results"""
    db = DB(config.db_path + '/data/')
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    start, end, n = page_interval(request.results_per_page, hits, request.start, request.end)
    kwic_object = {
        "description": {
            "start": start,
            "end": end,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request])
    }
    kwic_object['results'] = []
    for hit in hits[start - 1:end]:
        kwic_result = kwic_hit_object(hit, config, db)
        kwic_object['results'].append(kwic_result)
    kwic_object['results_length'] = len(hits)
    kwic_object["query_done"] = hits.done
    return kwic_object
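kwic_results (and the other paginated reports in this listing) delegate the page math to a page_interval helper that is not included here. As a rough illustration of the contract these callers appear to assume (a 1-based start, an end clamped to the current hitlist length, and the interval returned as a third value), a hypothetical stand-in might look like the following; the real PhiloLogic helper may behave differently.

def page_interval_sketch(results_per_page, hits, start, end):
    # Hypothetical stand-in for page_interval, inferred only from how the
    # functions in this listing call it; not the library's implementation.
    try:
        start = int(start)
    except (TypeError, ValueError):
        start = 0
    try:
        end = int(end)
    except (TypeError, ValueError):
        end = 0
    if start <= 0:
        start = 1
    if end < start:
        end = start + int(results_per_page) - 1
    if end > len(hits):  # len() reflects the hits retrieved so far
        end = len(hits)
    return start, end, end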
def lookup_word_service(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(db, environ)
    cursor = db.dbh.cursor()
    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        context_size = config["concordance_length"] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes
        hit_span = hit.bytes[-1] - hit.bytes[0]
        length = context_size + hit_span + context_size
        bytes, byte_start = adjust_bytes(bytes, length)
        byte_end = byte_start + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":
        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        byte_start, byte_end = int(text_obj.byte_start), int(text_obj.byte_end)
        filename = text_obj.filename
        # print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id, request.selected, byte_start, byte_end, filename
    else:
        pass
    # print >> sys.stderr, "TOKEN", token, "BYTES: ", byte_start, byte_end, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, byte_start, byte_end, filename)
def lookup_word_service(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    cursor = db.dbh.cursor()
    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        context_size = config['concordance_length'] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes
        hit_span = hit.bytes[-1] - hit.bytes[0]
        length = context_size + hit_span + context_size
        bytes, start_byte = adjust_bytes(bytes, length)
        end_byte = start_byte + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":
        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        start_byte, end_byte = int(text_obj.start_byte), int(text_obj.end_byte)
        filename = text_obj.filename
        # print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id, request.selected, start_byte, end_byte, filename
    else:
        pass
    # print >> sys.stderr, "TOKEN", token, "BYTES: ", start_byte, end_byte, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, start_byte, end_byte, filename)
def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    parsed = parse_query(request.q)
    group = group_terms(parsed)
    all_groups = split_terms(group)
    term_groups = []
    for g in all_groups:
        term_group = ''
        not_started = False
        for kind, term in g:
            if kind == 'NOT':
                if not_started is False:
                    not_started = True
                    term_group += ' NOT '
            elif kind == 'OR':
                term_group += '|'
            elif kind == "TERM":
                term_group += ' %s ' % term
            elif kind == "QUOTE":
                term_group += ' %s ' % term
        term_group = term_group.strip()
        term_groups.append(term_group)
    yield json.dumps(term_groups)
def get_metadata_token_count(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    frequencies = json.loads(environ['wsgi.input'].read())
    count = 0
    for label, m in frequencies.iteritems():
        query_metadata = {}
        for metadata in m['metadata']:
            query_metadata[metadata] = m['metadata'][metadata].encode('utf-8')
        hits = db.query(**query_metadata)
        total_count = 0
        for hit in hits:
            total_count += int(hit['word_count'])
        try:
            frequencies[label]['count'] = round(float(m['count']) / total_count * 1000000, 3)
        except ZeroDivisionError:
            count += 1
            frequencies[label]['count'] = 0
    yield json.dumps(frequencies)
def landing_page_bibliography(request, config):
    db = DB(config.db_path + '/data/')
    object_level = request.object_level
    if object_level and object_level in ["doc", "div1", "div2", "div3"]:
        hits = db.get_all(object_level)
    else:
        hits = db.get_all(db.locals['default_object_level'])
    results = []
    c = db.dbh.cursor()
    for hit in hits:
        hit_object = {}
        for field in db.locals['metadata_fields']:
            hit_object[field] = hit[field] or ''
        if object_level == "doc":
            hit_object['philo_id'] = hit.philo_id[0]
        else:
            hit_object['philo_id'] = '/'.join([str(i) for i in hit.philo_id])
        doc_id = str(hit.philo_id[0]) + ' 0 0 0 0 0 0'
        next_doc_id = str(hit.philo_id[0] + 1) + ' 0 0 0 0 0 0'
        c.execute('select rowid from toms where philo_id="%s"' % doc_id)
        doc_row = c.fetchone()['rowid']
        c.execute('select rowid from toms where philo_id="%s"' % next_doc_id)
        try:
            next_doc_row = c.fetchone()['rowid']
        except TypeError:  # if this is the last doc, just get the last rowid in the table.
            c.execute('select max(rowid) from toms;')
            next_doc_row = c.fetchone()[0]
        try:
            c.execute('select * from toms where rowid between %d and %d and head is not null and head !="" limit 1'
                      % (doc_row, next_doc_row))
        except sqlite3.OperationalError:  # no type field in DB
            c.execute('select * from toms where rowid between ? and ? and head is not null and head !="" limit 1',
                      (doc_row, next_doc_row))
        try:
            start_head = c.fetchone()['head'].decode('utf-8')
            start_head = start_head.lower().title().encode('utf-8')
        except Exception as e:
            print(repr(e), file=sys.stderr)
            start_head = ''
        try:
            c.execute('select head from toms where rowid between %d and %d and head is not null and head !="" order by rowid desc limit 1'
                      % (doc_row, next_doc_row))
        except sqlite3.OperationalError:  # no type field in DB
            c.execute('select head from toms where rowid between %d and %d and head is not null and head !="" order by rowid desc limit 1'
                      % (doc_row, next_doc_row))
        try:
            end_head = c.fetchone()['head']
            end_head = end_head.decode('utf-8').lower().title().encode('utf-8')
        except:
            end_head = ''
        hit_object['start_head'] = start_head
        hit_object['end_head'] = end_head
        results.append(hit_object)
    return results
def bibliography_results(request, config):
    """Fetch bibliography results"""
    db = DB(config.db_path + '/data/')
    if request.no_metadata:
        hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
    else:
        hits = db.query(sort_order=request["sort_order"], **request.metadata)
    if request.simple_bibliography == "all":
        # request from simple landing page report which gets all biblio in load order
        hits.finish()
        start = 1
        end = len(hits)
        page_num = end
    else:
        start, end, page_num = page_interval(request.results_per_page, hits, request.start, request.end)
    bibliography_object = {
        "description": {
            "start": start,
            "end": end,
            "n": page_num,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request]),
        "default_object": db.locals['default_object_level']
    }
    results = []
    result_type = "doc"
    for hit in hits[start - 1:end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata]
        result_type = hit.object_type
        if request.simple_bibliography == "all":
            citation = citations(hit, citation_hrefs, config, report="simple_landing")
        else:
            citation = citations(hit, citation_hrefs, config, report="bibliography", result_type=result_type)
        if config.dictionary_bibliography is False or result_type == "doc":
            results.append({
                'citation': citation,
                'citation_links': citation_hrefs,
                'philo_id': hit.philo_id,
                "metadata_fields": metadata_fields,
                "object_type": result_type
            })
        else:
            context = get_text_obj(hit, config, request, db.locals["token_regex"], images=False)
            results.append({
                'citation': citation,
                'citation_links': citation_hrefs,
                'philo_id': hit.philo_id,
                "metadata_fields": metadata_fields,
                "context": context,
                "object_type": result_type
            })
    bibliography_object["results"] = results
    bibliography_object['results_length'] = len(hits)
    bibliography_object['query_done'] = hits.done
    bibliography_object['result_type'] = result_type
    return bibliography_object, hits
def concordance_results(request, config):
    """Fetch concordance results."""
    db = DB(config.db_path + '/data/')
    if request.collocation_type:
        first_hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        second_hits = db.query(request["left"], request["method"], request["arg"], **request.metadata)
        hits = CombinedHitlist(first_hits, second_hits)
    else:
        hits = db.query(request["q"], request["method"], request["arg"],
                        sort_order=request["sort_order"], **request.metadata)
    start, end, page_num = page_interval(request['results_per_page'], hits, request.start, request.end)
    concordance_object = {
        "description": {
            "start": start,
            "end": end,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request]),
        "default_object": db.locals['default_object_level']
    }
    formatting_regexes = []
    if config.concordance_formatting_regex:
        for pattern, replacement in config.concordance_formatting_regex:
            compiled_regex = re.compile(r'%s' % pattern)
            formatting_regexes.append((compiled_regex, replacement))
    results = []
    for hit in hits[start - 1:end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata]
        citation = citations(hit, citation_hrefs, config, report="concordance")
        context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
        if formatting_regexes:
            for formatting_regex, replacement in formatting_regexes:
                context = formatting_regex.sub(r'%s' % replacement, context)
        result_obj = {
            "philo_id": hit.philo_id,
            "citation": citation,
            "citation_links": citation_hrefs,
            "context": context,
            "metadata_fields": metadata_fields,
            "bytes": hit.bytes
        }
        results.append(result_obj)
    concordance_object["results"] = results
    concordance_object['results_length'] = len(hits)
    concordance_object["query_done"] = hits.done
    return concordance_object
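The formatting_regexes loop in concordance_results simply precompiles (pattern, replacement) pairs from the configuration and applies them in order to each context string. A standalone illustration of that pipeline, with invented patterns that are not from any real configuration:

import re

# Toy illustration of the concordance_formatting_regex pipeline above;
# the two patterns here (strip tags, collapse whitespace) are made up.
formatting_regexes = []
for pattern, replacement in [(r'<[^>]+>', ''), (r'\s+', ' ')]:
    formatting_regexes.append((re.compile(pattern), replacement))
context = '<p>some   <b>highlighted</b>   passage</p>'
for formatting_regex, replacement in formatting_regexes:
    context = formatting_regex.sub(replacement, context)
print(context.strip())  # -> 'some highlighted passage'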
def word_property_filter(environ, start_response):
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response('200 OK', headers)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    filter_results = filter_words_by_property(hits, config.db_path, request, db, config)
    yield json.dumps(filter_results)
def collocation(environ, start_response):
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response('200 OK', headers)
    hits = db.query(request["q"], "cooc", request["arg"], **request.metadata)
    hits.finish()
    collocation_object = fetch_collocation(hits, request, db, config)
    yield json.dumps(collocation_object)
def get_frequency(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field, word_frequency_object = generate_word_frequency(hits, request, db, config)
    yield json.dumps(word_frequency_object, indent=2)
def term_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + "/data/")
    q = WSGIHandler(db, environ)
    hits = db.query(q["q"], q["method"], q["arg"], **q.metadata)
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0])
def generate_word_frequency(request, config):
    """reads through a hitlist. looks up request["field"] in each hit, and
    builds up a list of unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start:]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break
        table = {}
        for k, v in counts.items():
            url = make_absolute_query_link(
                config,
                request,
                start="0",
                end="0",
                report="word_property_filter",
                word_property=field,
                word_property_value=k,
            )
            table[k] = {"count": v, "url": url}
        frequency_object["results"] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])
    return frequency_object
def get_more_context(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(db, environ)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    context_size = config["concordance_length"] * 3
    hit_context = r.fetch_concordance(db, hits[hit_num], config.db_path, context_size)
    yield json.dumps(hit_context)
def generate_word_frequency(request, config):
    """reads through a hitlist. looks up request["field"] in each hit, and
    builds up a list of unique values and their frequencies."""
    db = DB(config.db_path + '/data/')
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start:]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break
        table = {}
        for k, v in counts.iteritems():
            url = make_absolute_query_link(config, request, start="0", end="0",
                                           report="word_property_filter",
                                           word_property=field,
                                           word_property_value=k)
            table[k] = {'count': v, 'url': url}
        frequency_object['results'] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object['more_results'] = False
        else:
            frequency_object['more_results'] = True
    except IndexError:
        frequency_object['results'] = {}
        frequency_object['more_results'] = False
    frequency_object['results_length'] = len(hits)
    frequency_object['query'] = dict([i for i in request])
    return frequency_object
def term_list(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield simplejson.dumps(expanded_terms[0])
def term_list(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0])
def term_list(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0]).encode('utf8')
def get_bibliography(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    object_level = request.object_level
    if object_level and object_level in object_levels:
        hits = db.get_all(object_level)
    else:
        hits = db.get_all(db.locals['default_object_level'])
    results = []
    c = db.dbh.cursor()
    for hit in hits:
        hit_object = {}
        for field in db.locals['metadata_fields']:
            hit_object[field] = hit[field] or ''
        if object_level == "doc":
            hit_object['philo_id'] = hit.philo_id[0]
        else:
            hit_object['philo_id'] = '/'.join([str(i) for i in hit.philo_id])
        doc_id = str(hit.philo_id[0]) + ' 0 0 0 0 0 0'
        next_doc_id = str(hit.philo_id[0] + 1) + ' 0 0 0 0 0 0'
        c.execute('select rowid from toms where philo_id="%s"' % doc_id)
        doc_row = c.fetchone()['rowid']
        c.execute('select rowid from toms where philo_id="%s"' % next_doc_id)
        try:
            next_doc_row = c.fetchone()['rowid']
        except TypeError:  # if this is the last doc, just get the last rowid in the table.
            c.execute('select max(rowid) from toms;')
            next_doc_row = c.fetchone()[0]
        c.execute('select head from toms where rowid between %d and %d and head is not null limit 1'
                  % (doc_row, next_doc_row))
        try:
            start_head = c.fetchone()['head']
            start_head = start_head.decode('utf-8').lower().title().encode('utf-8')
        except:
            start_head = ''
        c.execute('select head from toms where rowid between %d and %d and head is not null order by rowid desc limit 1'
                  % (doc_row, next_doc_row))
        try:
            end_head = c.fetchone()['head']
            end_head = end_head.decode('utf-8').lower().title().encode('utf-8')
        except:
            end_head = ''
        hit_object['start_head'] = start_head
        hit_object['end_head'] = end_head
        results.append(hit_object)
    yield json.dumps(results)
def get_more_context(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    context_size = config['concordance_length'] * 3
    hit_context = get_concordance_text(db, hits[hit_num], config.db_path, context_size)
    yield json.dumps(hit_context).encode('utf8')
def get_more_context(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    context_size = config['concordance_length'] * 3
    hit_context = get_concordance_text(db, hits[hit_num], config.db_path, context_size)
    yield simplejson.dumps(hit_context)
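Every environ/start_response function in this listing is an ordinary WSGI application, so each one can be smoke-tested locally without the full web stack. A minimal harness using only the standard library, with get_more_context as an arbitrary example endpoint and the query string purely illustrative (this assumes the module's imports resolve against a built PhiloLogic database):

from wsgiref.simple_server import make_server

if __name__ == '__main__':
    # Sketch only: serve one endpoint from this listing for manual testing.
    httpd = make_server('127.0.0.1', 8080, get_more_context)
    print('Try http://127.0.0.1:8080/?q=test&hit_num=0')
    httpd.serve_forever()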
def get_first_page(philo_id, config):
    """This function will fetch the first page of any given text object,
    in case there's no <pb> starting the object"""
    db = DB(config.db_path + '/data/')
    c = db.dbh.cursor()
    if len(philo_id) < 9:
        c.execute('select start_byte, end_byte from toms where philo_id=?',
                  (' '.join([str(i) for i in philo_id]),))
        result = c.fetchone()
        start_byte = result['start_byte']
        approx_id = str(philo_id[0]) + ' 0 0 0 0 0 0 %'
        try:
            c.execute('select * from pages where philo_id like ? and end_byte >= ? limit 1',
                      (approx_id, start_byte))
        except:
            return {'filename': '', 'start_byte': ''}
    else:
        c.execute('select * from pages where philo_id like ? limit 1',
                  (' '.join([str(i) for i in philo_id]),))
    page_result = c.fetchone()
    try:
        filename = page_result["facs"]
    except (IndexError, TypeError):
        filename = ""
    if not filename:
        try:
            filename = page_result['id'] or ''
        except (IndexError, TypeError):
            pass
    try:
        n = page_result['n'] or ''
        page = {'filename': filename.split(),
                "n": n,
                'start_byte': page_result['start_byte'],
                'end_byte': page_result['end_byte']}
        return page
    except:  # Let's play it safe
        return {'filename': '', 'start_byte': ''}
def get_frequency(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    if request.q == '' and request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'])
        else:
            hits = db.query(**request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    results = r.generate_frequency(hits, request, db, config)
    yield json.dumps(results)
def search_examples(field):
    path = os.path.abspath(os.path.dirname(__file__)).replace('functions', "") + '/data/'
    db = DB(path, encoding='utf-8')
    if field == "word":
        word_path = path + '/frequencies/word_frequencies'
        word = ''
        for n, line in enumerate(open(word_path)):
            word = line.split()[0]
            if n == 100:
                break
        return word.decode('utf-8', 'ignore')
    else:
        c = db.dbh.cursor()
        object_type = db.locals['metadata_types'][field]
        try:
            if object_type != 'div':
                c.execute('select %s from toms where philo_type="%s" and %s!="" limit 1'
                          % (field, object_type, field))
            else:
                c.execute('select %s from toms where philo_type="div1" or philo_type="div2" or philo_type="div3" and %s!="" limit 1'
                          % (field, field))
        except sqlite3.OperationalError:
            example = ''
        try:
            example = c.fetchone()[0].decode('utf-8', 'ignore')
        except (TypeError, AttributeError):
            example = ''
        return example
def main(db_path):
    """Grab words from words table and dump to file"""
    philo_db = DB(db_path)
    words_and_ids_path = os.path.join(db_path, "words_and_philo_ids")
    status = os.system("mkdir -p %s" % words_and_ids_path)
    if status != 0:
        print("Could not create %s. Please check your write permissions to the parent directory" % words_and_ids_path)
        sys.exit(status)
    cursor = philo_db.dbh.cursor()
    cursor.execute('SELECT philo_name, philo_id, start_byte, end_byte from words')
    current_doc_id = "1"
    current_words = []
    for word, philo_id, start_byte, end_byte in cursor:
        doc_id = philo_id.split()[0]
        word_obj = {
            "token": word,
            "position": philo_id,
            "start_byte": start_byte,
            "end_byte": end_byte
        }
        if doc_id != current_doc_id:
            with open(os.path.join(words_and_ids_path, current_doc_id), "w") as output:
                output.write("\n".join(current_words))
            print("Processed document %s" % current_doc_id, flush=True)
            current_words = []
            current_doc_id = doc_id
        current_words.append(json.dumps(word_obj))
    if current_words:
        with open(os.path.join(words_and_ids_path, current_doc_id), "w") as output:
            output.write("\n".join(current_words))
        print("Processed document %s" % current_doc_id, flush=True)
def get_all_page_images(philo_id, config, current_obj_imgs):
    """Get all page images"""
    if current_obj_imgs[0]:
        # We know there are images
        db = DB(config.db_path + '/data/')
        c = db.dbh.cursor()
        approx_id = str(philo_id[0]) + ' 0 0 0 0 0 0 %'
        try:
            c.execute('select * from pages where philo_id like ? and facs is not null and facs != ""',
                      (approx_id,))
            current_obj_imgs = set(current_obj_imgs)
            all_imgs = [tuple(i["facs"].split()) for i in c.fetchall()]
        except sqlite3.OperationalError:
            all_imgs = []
        if not all_imgs:
            try:
                c.execute('select * from pages where philo_id like ? and id is not null and id != ""',
                          (approx_id,))
                current_obj_imgs = set(current_obj_imgs)
                all_imgs = [tuple(i["id"].split()) for i in c.fetchall()]
            except sqlite3.OperationalError:
                return []
        return all_imgs
    else:
        return []
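Several helpers here (get_first_page, get_all_page_images, and get_all_graphics below) scope a query to a whole document by matching on a philo_id prefix such as '12 0 0 0 0 0 0 %'. A self-contained illustration of that LIKE pattern against an in-memory table, with the schema reduced to the two relevant columns purely for demonstration:

import sqlite3

# Demonstrates the doc-level philo_id LIKE pattern used above on toy data.
conn = sqlite3.connect(':memory:')
conn.row_factory = sqlite3.Row
conn.execute('CREATE TABLE pages (philo_id TEXT, facs TEXT)')
conn.executemany('INSERT INTO pages VALUES (?, ?)', [
    ('12 0 0 0 0 0 0 1', 'page_12_1.jpg'),
    ('12 0 0 0 0 0 0 2', 'page_12_2.jpg'),
    ('13 0 0 0 0 0 0 1', 'page_13_1.jpg'),
])
approx_id = '12' + ' 0 0 0 0 0 0 %'  # matches every page of document 12
rows = conn.execute('SELECT facs FROM pages WHERE philo_id LIKE ? AND facs != ""',
                    (approx_id,)).fetchall()
print([row['facs'] for row in rows])  # -> ['page_12_1.jpg', 'page_12_2.jpg']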
def resolve_cite_service(environ, start_response):
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    c = db.dbh.cursor()
    q = request.q
    best_url = config['db_url']
    if " - " in q:
        milestone = q.split(" - ")[0]
    else:
        milestone = q
    milestone_segments = []
    last_segment = 0
    milestone_prefixes = []
    for separator in re.finditer(r' (?!\.)|\.(?! )', milestone):
        milestone_prefixes += [milestone[:separator.start()]]
        milestone_segments += [milestone[last_segment:separator.start()]]
        last_segment = separator.end()
    milestone_segments += [milestone[last_segment:]]
    milestone_prefixes += [milestone]
    print >> sys.stderr, "SEGMENTS", repr(milestone_segments)
    print >> sys.stderr, "PREFIXES", repr(milestone_prefixes)
    abbrev_match = None
    for pos, v in enumerate(milestone_prefixes):
        print >> sys.stderr, "QUERYING for abbrev = ", v
        abbrev_q = c.execute("SELECT * FROM toms WHERE abbrev = ?;", (v,)).fetchone()
        if abbrev_q:
            abbrev_match = abbrev_q
    print >> sys.stderr, "ABBREV", abbrev_match["abbrev"], abbrev_match["philo_id"]
    doc_obj = ObjectWrapper(abbrev_match['philo_id'].split(), db)
    nav = nav_query(doc_obj, db)
    best_match = None
    for n in nav:
        if n["head"] == request.q:
            print >> sys.stderr, "MATCH", n["philo_id"], n["n"], n["head"]
            best_match = n
            break
    if best_match:
        type_offsets = {"doc": 1, "div1": 2, "div2": 3, "div3": 4, "para": 5}
        t = best_match['philo_type']
        short_id = best_match["philo_id"].split()[:type_offsets[t]]
        best_url = f.make_absolute_object_link(config, short_id)
    print >> sys.stderr, "BEST_URL", best_url
    status = '302 Found'
    headers = [('Location', best_url)]
    start_response(status, headers)
    return ""
def generate_text_object(request, config, note=False):
    """Return text object given a philo_id"""
    # verify this isn't a page ID or if this is a note
    if len(request.philo_id.split()) == 9 and note is not True:
        width = 9
    else:
        width = 7
    db = DB(config.db_path + '/data/', width=width)
    if note:
        target = request.target.replace('#', '')
        doc_id = request.philo_id.split()[0] + ' %'
        c = db.dbh.cursor()
        c.execute('select philo_id from toms where id=? and philo_id like ? limit 1',
                  (target, doc_id))
        philo_id = c.fetchall()[0]['philo_id'].split()[:7]
        obj = db[philo_id]
    else:
        try:
            obj = db[request.philo_id]
        except ValueError:
            obj = db[' '.join(request.path_components)]
        philo_id = obj.philo_id
        if width != 9:
            while obj['philo_name'] == '__philo_virtual' and obj["philo_type"] != "div1":
                philo_id.pop()
                obj = db[philo_id]
    philo_id = list(obj.philo_id)
    while int(philo_id[-1]) == 0:
        philo_id.pop()
    text_object = {
        "query": dict([i for i in request]),
        "philo_id": ' '.join([str(i) for i in philo_id])
    }
    text_object['prev'] = neighboring_object_id(db, obj.prev, width)
    text_object['next'] = neighboring_object_id(db, obj.__next__, width)
    metadata_fields = {}
    for metadata in db.locals['metadata_fields']:
        metadata_fields[metadata] = obj[metadata]
    text_object['metadata_fields'] = metadata_fields
    if width != 9:
        citation_hrefs = citation_links(db, config, obj)
    else:
        doc_obj = db[obj.philo_id[0]]
        citation_hrefs = citation_links(db, config, doc_obj)
    citation = citations(obj, citation_hrefs, config, report="navigation")
    text_object['citation'] = citation
    text, imgs = get_text_obj(obj, config, request, db.locals["token_regex"], note=note)
    if config.navigation_formatting_regex:
        for pattern, replacement in config.navigation_formatting_regex:
            text = re.sub(r'%s' % pattern, '%s' % replacement, text)
    text_object['text'] = text
    text_object['imgs'] = imgs
    return text_object
def get_all_graphics(philo_id, config):
    db = DB(config.db_path + '/data/')
    c = db.dbh.cursor()
    approx_id = str(philo_id[0]) + ' 0 0 0 0 0 0 %'
    try:
        c.execute('SELECT facs FROM graphics WHERE philo_id LIKE ? AND facs IS NOT NULL AND facs != "" ORDER BY ROWID',
                  (approx_id,))
        graphics = [i["facs"].split() for i in c.fetchall() if i["facs"]]
        return graphics
    except sqlite3.OperationalError:
        return []
def alignment_to_text(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    link = byte_range_to_link(db, config, request)
    yield simplejson.dumps({"link": link})
def get_notes(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    text_object = generate_text_object(request, config, note=True)
    yield json.dumps(text_object).encode('utf8')
def get_total_results(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
        else:
            hits = db.query(sort_order=request["sort_order"], **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    total_results = len(hits)
    yield simplejson.dumps(total_results)
def metadata_list(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    metadata = request.term
    field = request.field
    yield autocomplete_metadata(metadata, field, db).encode('utf8')
def get_total_results(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'])
        else:
            hits = db.query(**request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    total_results = len(hits)
    yield simplejson.dumps(total_results)
def bibliography_results(request, config):
    db = DB(config.db_path + '/data/')
    if request.no_metadata:
        hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
    else:
        hits = db.query(**request.metadata)
    start, end, n = page_interval(request.results_per_page, hits, request.start, request.end)
    bibliography_object = {
        "description": {
            "start": start,
            "end": end,
            "n": n,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request]),
        "default_object": db.locals['default_object_level']
    }
    results = []
    result_type = 'doc'
    for hit in hits[start - 1:end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata]
        result_type = hit.type
        if hit.type == "doc":
            citation = citations(hit, citation_hrefs, config, report="bibliography")
        else:
            citation = citations(hit, citation_hrefs, config, report="concordance")
        results.append({
            'citation': citation,
            'citation_links': citation_hrefs,
            'philo_id': hit.philo_id,
            "metadata_fields": metadata_fields
        })
    bibliography_object["results"] = results
    bibliography_object['results_length'] = len(hits)
    bibliography_object['query_done'] = hits.done
    bibliography_object['result_type'] = result_type
    return bibliography_object, hits
def time_series_tester(config):
    db = DB(config.db_path + '/data/')
    c = db.dbh.cursor()
    try:
        c.execute("SELECT COUNT(*) FROM toms WHERE %s IS NOT NULL" % config.time_series_year_field)
        count = c.fetchone()[0]
        return count > 0
    except sqlite3.OperationalError:
        return False
def term_list(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    term = request.term
    if isinstance(term, list):
        term = term[-1]
    all_words = format_query(term, db, config)[:100]
    yield json.dumps(all_words).encode('utf8')
def get_start_end_date(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'text/html; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    start_date, end_date = start_end_date(db, config,
                                          start_date=request.start_date,
                                          end_date=request.end_date)
    yield simplejson.dumps({"start_date": start_date, "end_date": end_date})
def access_control(environ, start_response):
    path = os.path.abspath(os.path.dirname(__file__)).replace('functions', '') + '/data/'
    db = DB(path, encoding='utf-8')
    if "open_access" in db.locals:  # failsafe in case the variable is not in db.locals.py
        if db.locals['open_access']:
            return True
        elif check_previous_session(environ):
            return True
        else:
            access_value = check_access(db, environ)
            return access_value
    else:
        return True
def get_frequency(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    setattr(request, 'frequency_field', json.dumps(eval('"%s"' % request.frequency_field)))
    if request.q == '' and request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'])
        else:
            hits = db.query(**request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    results = r.generate_frequency(hits, request, db, config)
    results['results'] = sorted(results['results'].iteritems(),
                                key=lambda x: x[1]['count'], reverse=True)
    yield json.dumps(results)
def get_text_object(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    # Pad the philo_id out to the full 7 components
    zeros = 7 - len(request.philo_id.split())
    if zeros:
        request.philo_id += zeros * " 0"
    obj = ObjectWrapper(request['philo_id'].split(), db)
    text_object = generate_text_object(request, config)
    yield simplejson.dumps(text_object)
def get_sorted_kwic(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    input_object = json.loads(environ['wsgi.input'].read().decode('utf8', 'ignore'))
    all_results = input_object['results']
    query_string = input_object['query_string']
    sort_keys = [i for i in input_object["sort_keys"] if i]
    environ['QUERY_STRING'] = query_string
    request = WSGIHandler(environ, config)
    sorted_hits = get_sorted_hits(all_results, sort_keys, request, config, db,
                                  input_object['start'], input_object['end'])
    yield json.dumps(sorted_hits).encode('utf8')
def login_access(environ, request, config, headers):
    db = DB(config.db_path + '/data/')
    if request.authenticated:
        access = True
    else:
        if request.username and request.password:
            access = check_login_info(config, request)
            if access:
                incoming_address = environ['REMOTE_ADDR']
                token = make_token(incoming_address, db)
                if token:
                    h, ts = token
                    headers.append(("Set-Cookie", "hash=%s" % h))
                    headers.append(("Set-Cookie", "timestamp=%s" % ts))
        else:
            access = False
    return access, headers
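make_token is not defined in this listing; login_access only assumes it returns a (hash, timestamp) pair that gets echoed back as cookies. A hypothetical HMAC-based sketch consistent with that contract (the secret, the clock handling, and the unused db parameter are all assumptions, not PhiloLogic's implementation):

import hashlib
import hmac
import time

SECRET = b'change-me'  # would normally come from configuration, not a literal

def make_token_sketch(incoming_address, db=None):
    # Hypothetical make_token: only a sketch matching how login_access
    # consumes its (hash, timestamp) return value.
    timestamp = str(time.time())
    token_hash = hmac.new(SECRET,
                          (incoming_address + timestamp).encode('utf8'),
                          hashlib.sha256).hexdigest()
    return token_hash, timestamp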
def access_request(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    access, headers = login_access(environ, request, config, headers)
    start_response(status, headers)
    if access:
        yield simplejson.dumps({'access': True})
    else:
        incoming_address, domain_name = access_control.get_client_info(environ)
        yield simplejson.dumps({
            'access': False,
            "incoming_address": incoming_address,
            "domain_name": domain_name
        })
def group_by_metadata(request, config):
    citation_types = json.loads(request.citation)
    db = DB(config.db_path + "/data/")
    cursor = db.dbh.cursor()
    query = """select * from toms where philo_type="doc" and %s=?""" % request.group_by_field
    cursor.execute(query, (request.query,))
    result_group = []
    for doc in cursor:
        obj = db[doc["philo_id"]]
        links = citation_links(db, config, obj)
        citation = citations(obj, links, config, report="landing_page", citation_type=citation_types)
        result_group.append({"metadata": get_all_metadata(db, doc), "citation": citation})
    return json.dumps({
        "display_count": request.display_count,
        "content_type": request.group_by_field,
        "content": [{"prefix": request.query, "results": result_group}],
    })
def main(db_path):
    """Grab words from words table and dump to file"""
    philo_db = DB(db_path)
    words_and_ids_path = os.path.join(db_path, "words_and_philo_ids")
    status = os.system("mkdir -p %s" % words_and_ids_path)
    if status != 0:
        print("Could not create %s. Please check your write permissions to the parent directory" % words_and_ids_path)
        sys.exit(status)
    # Get all doc ids
    cursor = philo_db.dbh.cursor()
    cursor.execute('SELECT philo_id from toms where philo_type="doc" order by rowid')
    docs = [i["philo_id"].split()[0] for i in cursor.fetchall()]
    print("Processing %d documents" % len(docs))
    for doc_id in docs:
        words = []
        current_doc_id = "{} %".format(doc_id)
        cursor.execute("SELECT rowid from words where philo_id like ? order by rowid limit 1",
                       (current_doc_id,))
        first_rowid = cursor.fetchone()[0]
        next_doc_id = "{} %".format(int(doc_id) + 1)
        cursor.execute("SELECT rowid from words where philo_id like ? order by rowid limit 1",
                       (next_doc_id,))
        try:
            last_rowid = cursor.fetchone()[0]
            cursor.execute(
                "SELECT philo_name, philo_id from words where rowid >= ? and rowid < ? and philo_type=? and philo_type!=? order by rowid",
                (first_rowid, last_rowid, "word", "__philo_virtual"))
        except TypeError:  # we've reached the end of the table and cursor returned None
            cursor.execute(
                "SELECT philo_name, philo_id from words where rowid >= ? and philo_type=? and philo_type!=? order by rowid",
                (first_rowid, "word", "__philo_virtual"))
        for word, philo_id in cursor.fetchall():
            words.append({"token": word, "position": philo_id})
        with open(os.path.join(words_and_ids_path, doc_id), "w") as output:
            output.write("\n".join(dumps(w) for w in words))
        print("Processed document %s" % doc_id)
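Both variants of main above are meant to run as a standalone dump script against a built database directory. A typical guard and invocation (the script name is assumed for illustration):

import sys

# Assumed command-line entry point for the dump script above, e.g.:
#     python dump_words_and_philo_ids.py /path/to/database
if __name__ == '__main__':
    main(sys.argv[1])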
def collocation_results(request, config):
    """Fetch collocation results"""
    db = DB(config.db_path + '/data/')
    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request['collocate_distance']), **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"], **request.metadata)
    hits.finish()
    collocation_object = {"query": dict([i for i in request])}
    try:
        collocate_distance = int(request['collocate_distance'])
    except ValueError:  # Getting an empty string since the keyword is not specified in the URL
        collocate_distance = None
    if request.colloc_filter_choice == "nofilter":
        filter_list = []
    else:
        filter_list = build_filter_list(request, config)
    collocation_object['filter_list'] = filter_list
    filter_list = set(filter_list)
    # Build list of search terms to filter out
    query_words = []
    for group in get_expanded_query(hits):
        for word in group:
            word = word.replace('"', '')
            query_words.append(word)
    query_words = set(query_words)
    filter_list = filter_list.union(query_words)
    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request['collocate_distance']),
                        raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"], raw_results=True, **request.metadata)
    hits.finish()
    stored_sentence_id = None
    stored_sentence_counts = defaultdict(int)
    sentence_hit_count = 1
    hits_done = request.start or 0
    max_time = request.max_time or 10
    all_collocates = defaultdict(lambda: {'count': 0})
    cursor = db.dbh.cursor()
    start_time = timeit.default_timer()
    try:
        for hit in hits[hits_done:]:
            word_id = ' '.join([str(i) for i in hit[:6]]) + ' ' + str(hit[7])
            query = """select parent, rowid from words where philo_id='%s' limit 1""" % word_id
            cursor.execute(query)
            result = cursor.fetchone()
            parent = result['parent']
            if parent != stored_sentence_id:
                rowid = int(result['rowid'])
                sentence_hit_count = 1
                stored_sentence_id = parent
                stored_sentence_counts = defaultdict(int)
                if collocate_distance:
                    begin_rowid = rowid - collocate_distance
                    if begin_rowid < 0:
                        begin_rowid = 0
                    end_rowid = rowid + collocate_distance
                    row_query = """select philo_name from words where parent='%s' and rowid between %d and %d""" % (parent, begin_rowid, end_rowid)
                else:
                    row_query = """select philo_name from words where parent='%s'""" % (parent,)
                cursor.execute(row_query)
                for i in cursor.fetchall():
                    collocate = i["philo_name"]
                    if collocate not in filter_list:
                        stored_sentence_counts[collocate] += 1
            else:
                sentence_hit_count += 1
            for word in stored_sentence_counts:
                if stored_sentence_counts[word] < sentence_hit_count:
                    continue
                all_collocates[word]['count'] += 1
            hits_done += 1
            elapsed = timeit.default_timer() - start_time
            # avoid timeouts by splitting the query if more than request.max_time (in seconds) has been spent in the loop
            if elapsed > int(max_time):
                break
    except IndexError:
        collocation_object['hits_done'] = len(hits)
    collocation_object['collocates'] = all_collocates
    collocation_object["results_length"] = len(hits)
    if hits_done < collocation_object["results_length"]:
        collocation_object['more_results'] = True
        collocation_object['hits_done'] = hits_done
    else:
        collocation_object['more_results'] = False
        collocation_object['hits_done'] = collocation_object["results_length"]
    return collocation_object
def get_neighboring_words(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    try:
        index = int(request.hits_done)
    except:
        index = 0
    max_time = int(request.max_time)
    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    c = db.dbh.cursor()
    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        query = 'select rowid, philo_name, parent from words where philo_id="%s" limit 1' % word_id
        c.execute(query)
        results = c.fetchone()
        highlighted_text = kwic_hit_object(hit, config, db)["highlighted_text"]
        highlighted_text = highlighted_text.translate(remove_punctuation_map)
        highlighted_text = highlighted_text.strip()
        result_obj = {
            "left": "",
            "right": "",
            "index": index,
            "q": highlighted_text
        }
        left_rowid = results["rowid"] - 10
        right_rowid = results["rowid"] + 10
        c.execute('select philo_name, philo_id from words where rowid between ? and ?',
                  (left_rowid, results['rowid'] - 1))
        result_obj["left"] = []
        for i in c.fetchall():
            result_obj["left"].append(i['philo_name'].decode('utf-8'))
        result_obj["left"].reverse()
        result_obj["left"] = ' '.join(result_obj["left"])
        c.execute('select philo_name, philo_id from words where rowid between ? and ?',
                  (results['rowid'] + 1, right_rowid))
        result_obj["right"] = []
        for i in c.fetchall():
            result_obj["right"].append(i['philo_name'].decode('utf-8'))
        result_obj["right"] = ' '.join(result_obj["right"])
        for metadata in config.kwic_metadata_sorting_fields:
            result_obj[metadata] = hit[metadata].lower()
        kwic_words.append(result_obj)
        index += 1
        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:
            # avoid timeouts by splitting the query if more than 10 seconds has been spent in the loop
            break
    yield simplejson.dumps({"results": kwic_words, "hits_done": index})
def filter_words_by_property(request, config):
    db = DB(config.db_path + '/data/')
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    concordance_object = {"query": dict([i for i in request])}
    # Do these need to be captured in wsgi_handler?
    word_property = request["word_property"]
    word_property_value = request["word_property_value"]
    word_property_total = request["word_property_total"]
    new_hitlist = []
    results = []
    position = 0
    more_pages = False
    if request.start == 0:
        start = 1
    else:
        start = request.start
    for hit in hits:
        # get my chunk of text
        hit_val = get_word_attrib(hit, word_property, db)
        if hit_val == word_property_value:
            position += 1
            if position < start:
                continue
            new_hitlist.append(hit)
            citation_hrefs = citation_links(db, config, hit)
            metadata_fields = {}
            for metadata in db.locals['metadata_fields']:
                metadata_fields[metadata] = hit[metadata]
            citation = citations(hit, citation_hrefs, config)
            context = fetch_concordance(db, hit, config.db_path, config.concordance_length)
            result_obj = {
                "philo_id": hit.philo_id,
                "citation": citation,
                "citation_links": citation_hrefs,
                "context": context,
                "metadata_fields": metadata_fields,
                "bytes": hit.bytes,
                "collocate_count": 1
            }
            results.append(result_obj)
        if len(new_hitlist) == request.results_per_page:
            more_pages = True
            break
    end = start + len(results) - 1
    if len(results) < request.results_per_page:
        word_property_total = end
    else:
        word_property_total = end + 1
    concordance_object['results'] = results
    concordance_object["query_done"] = hits.done
    concordance_object['results_length'] = word_property_total
    concordance_object["description"] = {
        "start": start,
        "end": end,
        "results_per_page": request.results_per_page,
        "more_pages": more_pages
    }
    return concordance_object
def generate_time_series(request, config):
    db = DB(config.db_path + '/data/')
    time_series_object = {'query': dict([i for i in request]), 'query_done': False}
    start_date, end_date = get_start_end_date(db, config, start_date=None, end_date=None)
    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    for i in xrange(start_date, end_date, interval):
        start = i
        end = i + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))
    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        hits.finish()
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": len(hits), "url": url}
        # Get date total count
        if interval != 1:
            end_range = start_range + (int(request['year_interval']) - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (config.time_series_year_field, start_range, end_range)
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)
        c = db.dbh.cursor()
        c.execute(query)
        date_counts[start_range] = c.fetchone()[0] or 0
        total_hits += len(hits)
        print >> sys.stderr, "TOTAL", total_hits
        last_date_done = start_range
        elapsed = timeit.default_timer() - start_time
        # avoid timeouts by splitting the query if more than request.max_time (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break
    time_series_object['results_length'] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object['more_results'] = False
    else:
        time_series_object['more_results'] = True
    time_series_object['new_start_date'] = last_date_done + int(request.year_interval)
    time_series_object['results'] = {'absolute_count': absolute_count, 'date_count': date_counts}
    return time_series_object
def frequency_results(request, config, sorted_results=False):
    """Reads through a hitlist, looks up request.frequency_field in each hit,
    and builds up a list of unique values and their frequencies."""
    # Note: the flag was renamed from `sorted` so it no longer shadows the
    # builtin sorted() used at the end of this function.
    db = DB(config.db_path + '/data/')
    biblio_search = False
    if request.q == '' and request.no_q:
        biblio_search = True
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'], sort_order=["rowid"], raw_results=True)
        else:
            hits = db.query(sort_order=["rowid"], raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
    if sorted_results:
        hits.finish()
    c = db.dbh.cursor()
    c.execute('select philo_id, %s from toms where %s is not null' % (request.frequency_field, request.frequency_field))
    metadata_dict = {}
    for philo_id, field in c.fetchall():
        philo_id = tuple(int(s) for s in philo_id.split() if int(s))
        metadata_dict[philo_id] = field
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    obj_dict = {'doc': 1, 'div1': 2, 'div2': 3, 'div3': 4, 'para': 5, 'sent': 6, 'word': 7}
    metadata_type = db.locals["metadata_types"][request.frequency_field]
    try:
        object_level = obj_dict[metadata_type]
    except KeyError:
        # metadata_type == "div": handled separately in the loop below
        pass
    try:
        for philo_id in hits[request.start:]:
            if not biblio_search:
                philo_id = tuple(list(philo_id[:6]) + [philo_id[7]])
            if metadata_type == "div":
                key = ""
                for div in ["div1", "div2", "div3"]:
                    if philo_id[:obj_dict[div]] in metadata_dict:
                        key = metadata_dict[philo_id[:obj_dict[div]]]
                while not key:
                    if philo_id[:4] in metadata_dict:
                        key = metadata_dict[philo_id[:4]]
                        break
                    if philo_id[:5] in metadata_dict:
                        key = metadata_dict[philo_id[:5]]
                        break
                    break
                if not key:
                    last_hit_done += 1
                    continue
            else:
                try:
                    key = metadata_dict[philo_id[:object_level]]
                except KeyError:
                    last_hit_done += 1
                    continue
            if key not in counts:
                counts[key] = {"count": 0, 'metadata': {request.frequency_field: key}}
                counts[key]["url"] = make_absolute_query_link(config, request, frequency_field="", start="0", end="0",
                                                              report=request.report, script='',
                                                              **{request.frequency_field: '"%s"' % key})
                if not biblio_search:
                    query_metadata = dict([(k, v) for k, v in request.metadata.iteritems() if v])
                    query_metadata[request.frequency_field] = '"%s"' % key
                    local_hits = db.query(**query_metadata)
                    counts[key]["total_word_count"] = local_hits.get_total_word_count()
            counts[key]["count"] += 1
            # avoid timeouts by splitting the query if more than
            # request.max_time (in seconds) has been spent in the loop
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break
        frequency_object['results'] = counts
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            new_metadata = dict([(k, v) for k, v in request.metadata.iteritems() if v])
            new_metadata[request.frequency_field] = '"NULL"'
            if request.q == '' and request.no_q:
                new_hits = db.query(sort_order=["rowid"], raw_results=True, **new_metadata)
            else:
                new_hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **new_metadata)
            new_hits.finish()
            if len(new_hits):
                null_url = make_absolute_query_link(config, request, frequency_field="", start="0", end="0",
                                                    report=request.report, script='',
                                                    **{request.frequency_field: '"NULL"'})
                local_hits = db.query(**new_metadata)
                if not biblio_search:
                    frequency_object["results"]["NULL"] = {"count": len(new_hits), "url": null_url,
                                                           "metadata": {request.frequency_field: '"NULL"'},
                                                           "total_word_count": local_hits.get_total_word_count()}
                else:
                    frequency_object["results"]["NULL"] = {"count": len(new_hits), "url": null_url,
                                                           "metadata": {request.frequency_field: '"NULL"'}}
            frequency_object['more_results'] = False
        else:
            frequency_object['more_results'] = True
    except IndexError:
        frequency_object['results'] = {}
        frequency_object['more_results'] = False
    frequency_object['results_length'] = len(hits)
    frequency_object['query'] = dict([i for i in request])
    if sorted_results:
        frequency_object["results"] = sorted(frequency_object['results'].iteritems(),
                                             key=lambda x: x[1]['count'], reverse=True)
    return frequency_object
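# A possible consumer of the frequency_object returned above: computing a
# relative frequency (hits per 10,000 words) for each metadata value. This
# helper is a hypothetical illustration, not part of the module; it assumes
# the non-bibliographic case, where each entry carries "total_word_count".
def relative_frequencies(frequency_object, per=10000):
    relative = {}
    for key, value in frequency_object['results'].items():
        # skip entries with no word count (e.g. the biblio_search case)
        if value.get('total_word_count'):
            relative[key] = value['count'] / float(value['total_word_count']) * per
    return relative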
def get_neighboring_words(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    try:
        index = int(request.hits_done)
    except (ValueError, TypeError):
        index = 0
    max_time = int(request.max_time)
    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    c = db.dbh.cursor()
    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        c.execute('select rowid, philo_name, parent from words where philo_id=? limit 1', (word_id,))
        results = c.fetchone()
        parent_sentence = results['parent']
        if request.direction == "left":
            c.execute('select philo_name, philo_id from words where parent=? and rowid < ?',
                      (parent_sentence, results['rowid']))
            string = []
            for i in c.fetchall():
                string.append(i['philo_name'].decode('utf-8'))
            string.reverse()
            string = ' '.join(string)
        elif request.direction == "right":
            c.execute('select philo_name, philo_id from words where parent=? and rowid > ?',
                      (parent_sentence, results['rowid']))
            string = []
            for i in c.fetchall():
                string.append(i['philo_name'].decode('utf-8'))
            string = ' '.join(string)
        else:
            string = ""
        metadata_fields = {}
        for metadata in config.kwic_metadata_sorting_fields:
            metadata_fields[metadata] = hit[metadata].lower()
        kwic_words.append((string, index, metadata_fields))
        index += 1
        elapsed = timeit.default_timer() - start_time
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > max_time:
            break
    yield json.dumps({"results": kwic_words, "hits_done": index}).encode('utf8')
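# The dictionary-style row access above (results['parent'], i['philo_name'])
# assumes the sqlite3 connection behind db.dbh uses sqlite3.Row as its row
# factory; with the default factory, rows are plain tuples and string-keyed
# lookups raise a TypeError. A minimal sketch of that setup (the database
# path is hypothetical):
def open_words_db(path='/path/to/data/toms.db'):
    import sqlite3
    dbh = sqlite3.connect(path)
    # Row objects support both index- and name-based access, e.g. row['parent']
    dbh.row_factory = sqlite3.Row
    return dbh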
def generate_time_series(request, config):
    db = DB(config.db_path + '/data/')
    time_series_object = {'query': dict([i for i in request]), 'query_done': False}
    # Invalid date range
    if request.start_date == 'invalid' or request.end_date == 'invalid':
        time_series_object['results_length'] = 0
        time_series_object['more_results'] = False
        time_series_object['new_start_date'] = 0
        time_series_object['results'] = {'absolute_count': {}, 'date_count': {}}
        return time_series_object
    start_date, end_date = get_start_end_date(db, config,
                                              start_date=request.start_date or None,
                                              end_date=request.end_date or None)
    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure the last date gets included in the loop below by adding one to the stop value
    for start in range(start_date, end_date + 1, interval):
        end = start + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))
    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}
        # Get date total count
        if interval != 1:
            end_range = start_range + (int(request['year_interval']) - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (config.time_series_year_field, start_range, end_range)
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)
        cursor = db.dbh.cursor()
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break
    time_series_object['results_length'] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object['more_results'] = False
    else:
        time_series_object['more_results'] = True
        time_series_object['new_start_date'] = last_date_done + int(request.year_interval)
    time_series_object['results'] = {'absolute_count': absolute_count, 'date_count': date_counts}
    return time_series_object
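# A minimal sketch of how a caller might drain the full time series by
# re-issuing the request with the new_start_date returned whenever the
# function stops early to avoid a timeout. Mutating request.start_date in
# place is an assumption about the request object, for illustration only.
def fetch_full_time_series(request, config):
    absolute_count = {}
    date_count = {}
    while True:
        result = generate_time_series(request, config)
        absolute_count.update(result['results']['absolute_count'])
        date_count.update(result['results']['date_count'])
        if not result['more_results']:
            break
        # resume where the previous call left off
        request.start_date = result['new_start_date']
    return {'absolute_count': absolute_count, 'date_count': date_count}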