Code Example #1
def term_group(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    if not request["q"]:
        dump = json.dumps({"original_query": "", "term_groups": []})
    else:
        hits = db.query(
            request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata
        )
        parsed = parse_query(request.q)
        group = group_terms(parsed)
        all_groups = split_terms(group)
        term_groups = []
        for g in all_groups:
            term_group = ""
            not_started = False
            for kind, term in g:
                if kind == "NOT":
                    if not_started is False:
                        not_started = True
                        term_group += " NOT "
                elif kind == "OR":
                    term_group += "|"
                elif kind == "TERM":
                    term_group += " %s " % term
                elif kind == "QUOTE":
                    term_group += " %s " % term
            term_group = term_group.strip()
            term_groups.append(term_group)
        dump = json.dumps({"term_groups": term_groups, "original_query": request.original_q})
    yield dump.encode("utf8")
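
These report scripts are plain WSGI callables that yield encoded JSON, so any WSGI server can mount them directly. A minimal sketch of serving the endpoint above with the standard library; the module name term_group_script is an assumption for illustration:

from wsgiref.simple_server import make_server

from term_group_script import term_group  # hypothetical module holding the code above

if __name__ == "__main__":
    # term_group(environ, start_response) yields encoded JSON chunks,
    # which is exactly the iterable a WSGI server expects back.
    with make_server("localhost", 8080, term_group) as httpd:
        httpd.serve_forever()
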
Code Example #2
def bibliography_results(request, config):
    """Fetch bibliography results"""
    db = DB(config.db_path + "/data/")
    if request.no_metadata:
        hits = db.get_all(db.locals["default_object_level"], request["sort_order"])
    else:
        hits = db.query(sort_order=request["sort_order"], **request.metadata)
    if request.simple_bibliography == "all":  # simple landing page report gets all biblio in load order
        hits.finish()
        start = 1
        end = len(hits)
        page_num = end
    else:
        start, end, page_num = page_interval(request.results_per_page, hits, request.start, request.end)
    bibliography_object = {
        "description": {"start": start, "end": end, "n": page_num, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals["default_object_level"],
    }
    results = []
    result_type = "doc"
    for hit in hits[start - 1 : end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals["metadata_fields"]:
            metadata_fields[metadata] = hit[metadata]
        result_type = hit.object_type
        if request.simple_bibliography == "all":
            citation = citations(hit, citation_hrefs, config, report="simple_landing")
        else:
            citation = citations(hit, citation_hrefs, config, report="bibliography", result_type=result_type)
        if config.dictionary_bibliography is False or result_type == "doc":
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "object_type": result_type,
                }
            )
        else:
            context = get_text_obj(hit, config, request, db.locals["token_regex"], images=False)
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "context": context,
                    "object_type": result_type,
                }
            )
    bibliography_object["results"] = results
    bibliography_object["results_length"] = len(hits)
    bibliography_object["query_done"] = hits.done
    bibliography_object["result_type"] = result_type
    return bibliography_object, hits
Code Example #3
File: kwic.py Project: ARTFL-Project/PhiloLogic4
def kwic_results(request, config):
    """Fetch KWIC results"""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    start, end, n = page_interval(request.results_per_page, hits,
                                  request.start, request.end)
    kwic_object = {
        "description": {
            "start": start,
            "end": end,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request]),
    }
    kwic_object["results"] = []

    for hit in hits[start - 1:end]:
        kwic_result = kwic_hit_object(hit, config, db)
        kwic_object["results"].append(kwic_result)

    kwic_object["results_length"] = len(hits)
    kwic_object["query_done"] = hits.done

    return kwic_object
Code Example #4
def lookup_word_service(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    cursor = db.dbh.cursor()

    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        context_size = config['concordance_length'] * 3
        hit = hits[int(request.position)]
        hit_bytes = hit.bytes
        hit_span = hit_bytes[-1] - hit_bytes[0]
        length = context_size + hit_span + context_size
        hit_bytes, start_byte = adjust_bytes(hit_bytes, length)
        end_byte = start_byte + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":

        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        start_byte, end_byte = int(text_obj.start_byte), int(text_obj.end_byte)
        filename = text_obj.filename
    else:
        # Unknown report type: return an empty result instead of falling through
        # to the undefined token/byte variables below.
        yield json.dumps({}).encode('utf8')
        return
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, start_byte, end_byte, filename).encode('utf8')
Code Example #5
def landing_page_bibliography(request, config):
    db = DB(config.db_path + "/data/")
    object_level = request.object_level
    if object_level and object_level in ["doc", "div1", "div2", "div3"]:
        hits = db.get_all(object_level)
    else:
        hits = db.get_all(db.locals["default_object_level"])
    results = []
    c = db.dbh.cursor()
    for hit in hits:
        hit_object = {}
        for field in db.locals["metadata_fields"]:
            hit_object[field] = hit[field] or ""
        if object_level == "doc":
            hit_object["philo_id"] = hit.philo_id[0]
        else:
            hit_object["philo_id"] = "/".join([str(i) for i in hit.philo_id])
        doc_id = str(hit.philo_id[0]) + " 0 0 0 0 0 0"
        next_doc_id = str(hit.philo_id[0] + 1) + " 0 0 0 0 0 0"
        c.execute('select rowid from toms where philo_id="%s"' % doc_id)
        doc_row = c.fetchone()["rowid"]
        c.execute('select rowid from toms where philo_id="%s"' % next_doc_id)
        try:
            next_doc_row = c.fetchone()["rowid"]
        except TypeError:  # if this is the last doc, just get the last rowid in the table.
            c.execute("select max(rowid) from toms;")
            next_doc_row = c.fetchone()[0]
        try:
            c.execute(
                'select * from toms where rowid between %d and %d and head is not null and head !="" limit 1'
                % (doc_row, next_doc_row))
        except sqlite3.OperationalError:  # no type field in DB
            c.execute(
                'select * from toms where rowid between ? and ? and head is not null and head !="" limit 1',
                (doc_row, next_doc_row),
            )
        try:
            start_head = c.fetchone()["head"]
            if isinstance(start_head, bytes):  # older DBs may store bytes
                start_head = start_head.decode("utf-8")
            start_head = start_head.lower().title()
        except Exception as e:
            print(repr(e), file=sys.stderr)
            start_head = ""
        c.execute(
            'select head from toms where rowid between %d and %d and head is not null and head !="" order by rowid desc limit 1'
            % (doc_row, next_doc_row))
        try:
            end_head = c.fetchone()["head"]
            if isinstance(end_head, bytes):
                end_head = end_head.decode("utf-8")
            end_head = end_head.lower().title()
        except Exception:
            end_head = ""
        hit_object["start_head"] = start_head
        hit_object["end_head"] = end_head

        results.append(hit_object)
    return results
Code Example #6
def get_start_end_date(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "text/html; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    start_date, end_date = start_end_date(db,
                                          config,
                                          start_date=request.start_date,
                                          end_date=request.end_date)
    request.metadata["year"] = "{}-{}".format(start_date, end_date)
    request["start_date"] = ""
    request["end_date"] = ""
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    hits.finish()
    total_results = len(hits)
    yield json.dumps({
        "start_date": start_date,
        "end_date": end_date,
        "total_results": total_results
    }).encode("utf8")
Code Example #7
def term_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0]).encode("utf8")
Code Example #8
def generate_word_frequency(request, config):
    """reads through a hitlist. looks up request["field"] in each hit, and builds up a list of
       unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start:]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it
                # recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break

        table = {}
        for k, v in counts.items():
            url = make_absolute_query_link(
                config,
                request,
                start="0",
                end="0",
                report="word_property_filter",
                word_property=field,
                word_property_value=k,
            )
            table[k] = {"count": v, "url": url}

        frequency_object["results"] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True

    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False

    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])

    return frequency_object
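
The docstring above describes a time-sliced scan: each call counts hits for at most five seconds, then reports hits_done and more_results so the caller can resume. A sketch of the driving loop this contract implies; the helper name is hypothetical, and it assumes the request object allows attribute assignment, as the other scripts here do:

def accumulate_word_frequencies(request, config):
    """Drive generate_word_frequency to completion, merging the partial counts."""
    totals = {}
    while True:
        partial = generate_word_frequency(request, config)
        # Summing is safe because each call covers a disjoint slice of the hitlist.
        for value, entry in partial["results"].items():
            if value in totals:
                totals[value]["count"] += entry["count"]
            else:
                totals[value] = dict(entry)  # keeps the first URL seen for this value
        if not partial["more_results"]:
            return totals
        request.start = partial["hits_done"]  # resume where the last slice stopped

The same resume-from-hits_done pattern appears again in collocation_results, and generate_time_series below uses a date-based variant.
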
Code Example #9
def get_more_context(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    context_size = config['concordance_length'] * 3
    hit_context = get_concordance_text(db, hits[hit_num], config.db_path,
                                       context_size)
    yield json.dumps(hit_context).encode('utf8')
Code Example #10
def concordance_results(request, config):
    """Fetch concordances results."""
    db = DB(config.db_path + "/data/")
    if request.collocation_type:
        first_hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        second_hits = db.query(request["left"], request["method"], request["arg"], **request.metadata)
        hits = CombinedHitlist(first_hits, second_hits)
    else:
        hits = db.query(
            request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata
        )
    start, end, page_num = page_interval(request["results_per_page"], hits, request.start, request.end)

    concordance_object = {
        "description": {"start": start, "end": end, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals["default_object_level"],
    }

    formatting_regexes = []
    if config.concordance_formatting_regex:
        for pattern, replacement in config.concordance_formatting_regex:
            compiled_regex = re.compile(r"%s" % pattern)
            formatting_regexes.append((compiled_regex, replacement))
    results = []
    for hit in hits[start - 1 : end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals["metadata_fields"]:
            metadata_fields[metadata] = hit[metadata]
        citation = citations(hit, citation_hrefs, config, report="concordance")
        context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
        if formatting_regexes:
            for formatting_regex, replacement in formatting_regexes:
                context = formatting_regex.sub(r"%s" % replacement, context)
        result_obj = {
            "philo_id": hit.philo_id,
            "citation": citation,
            "citation_links": citation_hrefs,
            "context": context,
            "metadata_fields": metadata_fields,
            "bytes": hit.bytes,
        }
        results.append(result_obj)
    concordance_object["results"] = results
    concordance_object["results_length"] = len(hits)
    concordance_object["query_done"] = hits.done
    return concordance_object
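
The formatting_regexes block above compiles config.concordance_formatting_regex, a list of (pattern, replacement) pairs, and applies each substitution to the concordance snippet. A standalone sketch of that hook; the sample pairs are illustrative assumptions, not PhiloLogic defaults:

import re

concordance_formatting_regex = [
    (r"<note>.*?</note>", ""),  # drop inline editorial notes
    (r"\s{2,}", " "),  # collapse runs of whitespace
]

formatting_regexes = [(re.compile(pattern), replacement) for pattern, replacement in concordance_formatting_regex]
context = "a <note>stage direction</note>  concordance   snippet"
for formatting_regex, replacement in formatting_regexes:
    context = formatting_regex.sub(replacement, context)
print(context)  # -> "a concordance snippet"
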
Code Example #11
def get_all_page_images(philo_id, config, current_obj_imgs):
    """Get all page images"""
    if current_obj_imgs[0]:
        # We know there are images
        db = DB(config.db_path + "/data/")
        cursor = db.dbh.cursor()
        approx_id = str(philo_id[0]) + " 0 0 0 0 0 0 %"
        try:
            cursor.execute(
                'select * from pages where philo_id like ? and facs is not null and facs != ""',
                (approx_id, ))
            current_obj_imgs = set(current_obj_imgs)
            all_imgs = [tuple(i["facs"].split()) for i in cursor]
        except sqlite3.OperationalError:
            all_imgs = []
        if not all_imgs:
            try:
                cursor.execute(
                    'select * from pages where philo_id like ? and id is not null and id != ""',
                    (approx_id, ))
                current_obj_imgs = set(current_obj_imgs)
                all_imgs = [tuple(i["id"].split()) for i in cursor]
            except sqlite3.OperationalError:
                return []
        return all_imgs
    else:
        return []
Code Example #12
def group_by_metadata(request, config):
    citation_types = json.loads(request.citation)
    db = DB(config.db_path + "/data/")
    cursor = db.dbh.cursor()
    query = """select * from toms where philo_type="doc" and %s=?""" % request.group_by_field
    cursor.execute(query, (request.query, ))
    result_group = []
    for doc in cursor:
        obj = db[doc["philo_id"]]
        links = citation_links(db, config, obj)
        citation = citations(obj,
                             links,
                             config,
                             report="landing_page",
                             citation_type=citation_types)
        result_group.append({
            "metadata": get_all_metadata(db, doc),
            "citation": citation
        })
    return json.dumps({
        "display_count": request.display_count,
        "content_type": request.group_by_field,
        "content": [{"prefix": request.query, "results": result_group}],
    })
Code Example #13
def get_notes(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    text_object = generate_text_object(request, config, note=True)
    yield json.dumps(text_object).encode('utf8')
Code Example #14
def alignment_to_text(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    link = byte_range_to_link(db, config, request)
    yield dumps({"link": link}).encode("utf8")
Code Example #15
def retrieve_similar_docs(philo_db: str, philo_id: str, num: int = 20):
    philo_id = philo_id.strip()
    try:
        annoy_id = int(PHILO_ID_TO_ANNOY[philo_db][philo_id])
    except KeyError:
        db = DB(f"{PHILO_PATHS[philo_db]}/data")
        hit = db[philo_id]
        text = get_text(hit, hit.start_byte, hit.end_byte - hit.start_byte, PHILO_PATHS[philo_db])
        return submit_passage(text.decode("utf8"), num=num)
    newsims = INDEX.get_nns_by_item(annoy_id, num + 1, include_distances=True)
    results = process_annoy_results(newsims)
    return results[1:]
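
retrieve_similar_docs and process_annoy_results (further down) lean on module-level globals: an Annoy vector index plus id mappings between Annoy rows and philo_ids. A sketch of the setup they assume; the paths, the vector dimension, and the JSON file layout are assumptions for illustration, while the Annoy calls themselves are real API:

import json

from annoy import AnnoyIndex

# Vector index over document embeddings; 300 dimensions is an assumption.
INDEX = AnnoyIndex(300, "angular")
INDEX.load("docs.ann")

# str(annoy_id) -> {"philo_db": ..., "philo_id": ...}, as read by process_annoy_results.
with open("annoy_to_philo_id.json") as mapping_file:
    ANNOY_TO_PHILO_ID = json.load(mapping_file)

# philo_db -> philo_id -> annoy_id, as read by retrieve_similar_docs.
with open("philo_id_to_annoy.json") as mapping_file:
    PHILO_ID_TO_ANNOY = json.load(mapping_file)
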
Code Example #16
def get_total_results(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
        else:
            hits = db.query(sort_order=request["sort_order"], **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"],
                        **request.metadata)
    hits.finish()
    total_results = len(hits)

    yield json.dumps(total_results).encode('utf8')
Code Example #17
def metadata_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    metadata = request.term
    field = request.field
    yield autocomplete_metadata(metadata, field, db).encode("utf8")
Code Example #18
def get_all_graphics(philo_id, config):
    db = DB(config.db_path + "/data/")
    cursor = db.dbh.cursor()
    approx_id = str(philo_id[0]) + " 0 0 0 0 0 0 %"
    try:
        cursor.execute(
            'SELECT facs FROM graphics WHERE philo_id LIKE ? AND facs IS NOT NULL AND facs != "" ORDER BY ROWID',
            (approx_id, ),
        )
        graphics = [i["facs"].split() for i in cursor if i["facs"]]
        return graphics
    except sqlite3.OperationalError:
        return []
Code Example #19
def time_series_tester(config):
    db = DB(config.db_path + "/data/")
    c = db.dbh.cursor()
    try:
        c.execute("SELECT COUNT(*) FROM toms WHERE %s IS NOT NULL" %
                  config.time_series_year_field)
        count = c.fetchone()[0]
        if count > 0:
            return True
        else:
            return False
    except sqlite3.OperationalError:
        return False
Code Example #20
def term_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    term = request.term
    if isinstance(term, list):
        term = term[-1]
    all_words = format_query(term, db, config)[:100]
    yield json.dumps(all_words).encode("utf8")
Code Example #21
def get_text_object(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    path = config.db_path
    zeros = 7 - len(request.philo_id.split())  # pad the philo_id out to its full 7 components
    if zeros > 0:
        request.philo_id += zeros * " 0"
    obj = ObjectWrapper(request["philo_id"].split(), db)
    text_object = generate_text_object(request, config)
    yield json.dumps(text_object).encode("utf8")
Code Example #22
def login_access(environ, request, config, headers):
    db = DB(config.db_path + "/data/")
    if request.authenticated:
        access = True
    else:
        if request.username and request.password:
            access = check_login_info(config, request)
            if access:
                incoming_address = environ["REMOTE_ADDR"]
                token = make_token(incoming_address, db)
                if token:
                    h, ts = token
                    headers.append(("Set-Cookie", "hash=%s" % h))
                    headers.append(("Set-Cookie", "timestamp=%s" % ts))
        else:
            access = False
    return access, headers
Code Example #23
def get_sorted_kwic(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    input_object = json.loads(environ['wsgi.input'].read().decode(
        'utf8', 'ignore'))
    all_results = input_object['results']
    query_string = input_object['query_string']
    sort_keys = [i for i in input_object["sort_keys"] if i]
    environ['QUERY_STRING'] = query_string
    request = WSGIHandler(environ, config)
    sorted_hits = get_sorted_hits(all_results, sort_keys, request, config, db,
                                  input_object['start'], input_object['end'])
    yield json.dumps(sorted_hits).encode('utf8')
Code Example #24
def get_first_page(philo_id, config):
    """This function will fetch the first page of any given text object in case there's no <pb>
    starting the object"""
    db = DB(config.db_path + "/data/")
    c = db.dbh.cursor()
    if len(philo_id) < 9:
        c.execute("select start_byte, end_byte from toms where philo_id=?",
                  (" ".join([str(i) for i in philo_id]), ))
        result = c.fetchone()
        start_byte = result["start_byte"]
        approx_id = str(philo_id[0]) + " 0 0 0 0 0 0 %"
        try:
            c.execute(
                "select * from pages where philo_id like ? and end_byte >= ? limit 1",
                (approx_id, start_byte))
        except:
            return {"filename": "", "start_byte": ""}
    else:
        c.execute("select * from pages where philo_id like ? limit 1",
                  (" ".join([str(i) for i in philo_id]), ))
    page_result = c.fetchone()
    try:
        filename = page_result["facs"]
    except (IndexError, TypeError):
        filename = ""
    if not filename:
        try:
            filename = page_result["id"] or ""
        except (IndexError, TypeError):
            pass
    try:
        n = page_result["n"] or ""
        page = {
            "filename": filename.split(),
            "n": n,
            "start_byte": page_result["start_byte"],
            "end_byte": page_result["end_byte"],
        }
        return page
    except:  # Let's play it safe
        return {"filename": "", "start_byte": ""}
Code Example #25
def main(object_level, db_path):
    metadata_fields = {}
    doc_filenames = {}
    database = DB(os.path.join(db_path, "data"))
    cursor = database.dbh.cursor()
    cursor.execute(
        "SELECT philo_id, filename FROM toms WHERE philo_type='doc'")
    for philo_id, filename in cursor:
        doc_id = philo_id.split()[0]
        doc_filenames[doc_id] = filename
    cursor.execute("SELECT * FROM toms WHERE philo_type=?", (object_level, ))
    for result in cursor:
        fields = result
        philo_id = "_".join(
            fields["philo_id"].split()[:object_levels[object_level]])
        metadata_fields[philo_id] = {}
        for field in database.locals["metadata_fields"]:
            metadata_fields[philo_id][field] = result[field] or ""
        doc_id = result["philo_id"].split()[0]
        metadata_fields[philo_id]["filename"] = doc_filenames[doc_id]
    with open("metadata.json", "w") as metadata_file:
        json.dump(metadata_fields, metadata_file)
Code Example #26
def process_annoy_results(newsims) -> List[Dict[str, Union[str, Dict[str, str]]]]:
    simscores = list(newsims[1])
    matchdocs = list(newsims[0])
    results: List[Dict[str, Union[str, Dict[str, str]]]] = []
    db_cache = {philo_db: DB(f'{config["path"]}/data') for philo_db, config in APP_CONFIG["philoDBs"].items()}
    for doc, score in zip(matchdocs, simscores):
        doc_id = ANNOY_TO_PHILO_ID[str(doc)]
        hit = db_cache[doc_id["philo_db"]][doc_id["philo_id"]]
        results.append(
            {
                "philo_db": doc_id["philo_db"],
                "metadata": {
                    "author": hit.author,
                    "title": hit.title,
                    "date": hit.year,
                    "head": hit.head,
                    "philo_id": doc_id["philo_id"],
                },
                "score": score,
            }
        )
    return results
Code Example #27
def get_tei_header(request, config):
    path = config.db_path
    db = DB(path + "/data")
    obj = ObjectWrapper(request["philo_id"].split(), db)
    filename = path + "/data/TEXT/" + obj.filename
    parser = etree.XMLParser(remove_blank_text=True, recover=True)
    xml_tree = etree.parse(filename, parser)
    header = xml_tree.find("teiHeader")
    try:
        header_text = etree.tostring(header, pretty_print=True).decode("utf8")
    except TypeError:  # workaround for when lxml doesn't find the header for whatever reason
        header_text = ""
        with open(filename, encoding="utf8") as file:
            file_content = file.read()
            try:
                start_header_index = re.search(r"<teiheader", file_content,
                                               re.I).start()
                end_header_index = re.search(r"</teiheader>", file_content,
                                             re.I).end()
            except AttributeError:  # tag not found
                return ""
            header_text = file_content[start_header_index:end_header_index]
    return header_text.replace("<", "&lt;").replace(">", "&gt;")
Code Example #28
def main(db_path):
    """Grab words from words table and dump to file"""
    philo_db = DB(db_path)
    words_and_ids_path = os.path.join(db_path, "words_and_philo_ids")
    status = os.system("mkdir -p %s" % words_and_ids_path)
    if status != 0:
        print(
            "Could not create %s. Please check your write permissions to the parent directory"
            % words_and_ids_path)
        sys.exit(status)
    cursor = philo_db.dbh.cursor()
    cursor.execute(
        'SELECT philo_name, philo_id, start_byte, end_byte from words')
    current_doc_id = "1"
    current_words = []
    for word, philo_id, start_byte, end_byte in cursor:
        doc_id = philo_id.split()[0]
        word_obj = {
            "token": word,
            "position": philo_id,
            "start_byte": start_byte,
            "end_byte": end_byte
        }
        if doc_id != current_doc_id:
            with open(os.path.join(words_and_ids_path, current_doc_id),
                      "w") as output:
                output.write("\n".join(current_words))
                print("Processed document %s" % current_doc_id, flush=True)
            current_words = []
            current_doc_id = doc_id
        current_words.append(json.dumps(word_obj))
    if current_words:
        with open(os.path.join(words_and_ids_path, current_doc_id),
                  "w") as output:
            output.write("\n".join(current_words))
            print("Processed document %s" % current_doc_id, flush=True)
Code Example #29
def collocation_results(request, config):
    """Fetch collocation results"""
    db = DB(config.db_path + "/data/")
    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request["collocate_distance"]), **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"], **request.metadata)
    hits.finish()
    collocation_object = {"query": dict([i for i in request])}

    try:
        collocate_distance = int(request["collocate_distance"])
    except ValueError:  # Getting an empty string since the keyword is not specified in the URL
        collocate_distance = None

    if request.colloc_filter_choice == "nofilter":
        filter_list = []
    else:
        filter_list = build_filter_list(request, config)
    collocation_object["filter_list"] = filter_list
    filter_list = set(filter_list)

    # Build list of search terms to filter out
    query_words = []
    for group in get_expanded_query(hits):
        for word in group:
            word = word.replace('"', "")
            query_words.append(word)
    query_words = set(query_words)
    filter_list = filter_list.union(query_words)

    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request["collocate_distance"]), raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], "proxy", request["arg"], raw_results=True, **request.metadata)
    hits.finish()

    stored_sentence_id = None
    stored_sentence_counts = defaultdict(int)
    sentence_hit_count = 1
    hits_done = request.start or 0
    max_time = request.max_time or 10
    all_collocates = defaultdict(lambda: {"count": 0})
    cursor = db.dbh.cursor()
    start_time = timeit.default_timer()
    try:
        for hit in hits[hits_done:]:
            word_id = " ".join([str(i) for i in hit[:6]]) + " " + str(hit[7])
            query = """select parent, rowid from words where philo_id='%s' limit 1""" % word_id
            cursor.execute(query)
            result = cursor.fetchone()
            parent = result["parent"]
            if parent != stored_sentence_id:
                rowid = int(result["rowid"])
                sentence_hit_count = 1
                stored_sentence_id = parent
                stored_sentence_counts = defaultdict(int)
                if collocate_distance:
                    begin_rowid = rowid - collocate_distance
                    if begin_rowid < 0:
                        begin_rowid = 0
                    end_rowid = rowid + collocate_distance
                    row_query = """select philo_name from words where parent='%s' and rowid between %d and %d""" % (
                        parent,
                        begin_rowid,
                        end_rowid,
                    )
                else:
                    row_query = """select philo_name from words where parent='%s'""" % (parent,)
                cursor.execute(row_query)
                for i in cursor:
                    collocate = i["philo_name"]
                    if collocate not in filter_list:
                        stored_sentence_counts[collocate] += 1
            else:
                sentence_hit_count += 1
            for word in stored_sentence_counts:
                if stored_sentence_counts[word] < sentence_hit_count:
                    continue
                all_collocates[word]["count"] += 1
            hits_done += 1
            elapsed = timeit.default_timer() - start_time
            # avoid timeouts by splitting the query if more than request.max_time (in
            # seconds) has been spent in the loop
            if elapsed > int(max_time):
                break
    except IndexError:
        collocation_object["hits_done"] = len(hits)

    collocation_object["collocates"] = all_collocates
    collocation_object["results_length"] = len(hits)
    if hits_done < collocation_object["results_length"]:
        collocation_object["more_results"] = True
        collocation_object["hits_done"] = hits_done
    else:
        collocation_object["more_results"] = False
        collocation_object["hits_done"] = collocation_object["results_length"]

    return collocation_object
Code Example #30
def group_by_range(request_range, request, config):
    db = DB(config.db_path + "/data/")
    metadata_queried = request.group_by_field
    citation_types = json.loads(request.citation)
    is_date = False
    try:
        int(request_range[0])
        int(request_range[1])
        is_date = True
    except ValueError:
        pass

    cursor = db.dbh.cursor()
    if is_date:
        content_type = "date"
        query_range = set(range(int(request_range[0]), int(request_range[1])))
        cursor.execute('select * from toms where philo_type="doc"')
    else:
        content_type = metadata_queried
        query_range = set(range(ord(request_range[0]), ord(request_range[1]) + 1))  # ordinals avoid unicode issues
        try:
            cursor.execute(
                'select *, count(*) as count from toms where philo_type="doc" group by %s'
                % metadata_queried)
        except sqlite3.OperationalError:
            return json.dumps({
                "display_count": request.display_count,
                "content_type": content_type,
                "content": []
            })
    content = {}
    date_count = defaultdict(int)
    for doc in cursor:
        normalized_test_value = ""
        if doc[metadata_queried] is None:
            continue
        if is_date:
            try:
                initial = int(doc[metadata_queried])
                test_value = initial
                date_count[initial] += 1
            except:
                continue
        else:
            try:
                initial_letter = doc[metadata_queried][0].lower()
            except IndexError:
                # we have an empty string
                continue
            try:
                test_value = ord(initial_letter)
                normalized_test_value = ord("".join([
                    i for i in unicodedata.normalize("NFKD", initial_letter)
                    if not unicodedata.combining(i)
                ]))
            except TypeError:
                continue
            initial = initial_letter.upper()
        # Are we within the range?
        if test_value in query_range or normalized_test_value in query_range:
            if normalized_test_value in query_range:
                initial = "".join([
                    i for i in unicodedata.normalize("NFKD", initial_letter)
                    if not unicodedata.combining(i)
                ]).upper()
            obj = db[doc["philo_id"]]
            links = citation_links(db, config, obj)
            citation = citations(obj,
                                 links,
                                 config,
                                 report="landing_page",
                                 citation_type=citation_types)
            if initial not in content:
                content[initial] = []
            if is_date:
                try:
                    normalized_field = unaccent.smash_accents(doc["title"]).lower()
                except:
                    normalized_field = None
                content[initial].append({
                    "metadata": get_all_metadata(db, doc),
                    "citation": citation,
                    "count": date_count[initial],
                    "normalized": normalized_field,
                })
            else:
                content[initial].append({
                    "metadata": get_all_metadata(db, doc),
                    "citation": citation,
                    "count": doc["count"],
                    "normalized": unaccent.smash_accents(doc[metadata_queried]).lower(),
                })
    results = []
    for prefix, result_set in sorted(content.items(), key=itemgetter(0)):
        results.append({"prefix": prefix, "results": sorted(result_set, key=lambda x: x["normalized"])})
    return json.dumps({
        "display_count": request.display_count,
        "content_type": content_type,
        "content": results
    })
Code Example #31
def check_access(environ, config):
    db = DB(config.db_path + "/data/")
    incoming_address, match_domain = get_client_info(environ)

    if config.access_file:
        if os.path.isabs(config.access_file):
            access_file = config.access_file
        else:
            access_file = os.path.join(config.db_path, "data",
                                       config.access_file)
        if not os.path.isfile(access_file):
            print(
                f"ACCESS FILE DOES NOT EXIST. UNAUTHORIZED ACCESS TO: {incoming_address} from domain {match_domain}",
                file=sys.stderr,
            )
            return ()
    else:
        print("UNAUTHORIZED ACCESS TO: %s from domain %s" %
              (incoming_address, match_domain),
              file=sys.stderr)
        return ()

    # Load access config file. If loading fails, don't grant access.
    try:
        access_config = load_module("access_config", access_file)
    except Exception as e:
        print("ACCESS ERROR", repr(e), file=sys.stderr)
        print("UNAUTHORIZED ACCESS TO: %s from domain %s" %
              (incoming_address, match_domain),
              file=sys.stderr)
        return ()

    # Let's first check if the IP is local and grant access if it is.
    for ip_range in ip_ranges:
        if ip_range.search(incoming_address):
            return make_token(incoming_address, db)

    try:
        domain_list = set(access_config.domain_list)
    except:
        domain_list = []

    try:
        allowed_ips = set([])
        for ip in access_config.allowed_ips:
            split_numbers = ip.split(".")
            if len(split_numbers) == 4:
                if re.search(r"\d+-\d+", split_numbers[3]):
                    for last_num in range(
                            int(split_numbers[3].split("-")[0]),
                            int(split_numbers[3].split("-")[1]) + 1):
                        allowed_ips.add(".".join(split_numbers[:3]) + "." +
                                        str(last_num))
                elif re.search(r"\d+-\Z", split_numbers[3]):  # open-ended range such as "100-"
                    for last_num in range(int(split_numbers[3].split("-")[0]),
                                          255):
                        allowed_ips.add(".".join(split_numbers[:3]) + "." +
                                        str(last_num))
                else:
                    allowed_ips.add(ip)
            else:
                allowed_ips.add(ip)
    except Exception as e:
        print(repr(e), file=sys.stderr)
        allowed_ips = []
    try:
        blocked_ips = set(access_config.blocked_ips)
    except:
        blocked_ips = []

    if incoming_address not in blocked_ips:
        if match_domain in domain_list:
            return make_token(incoming_address, db)
        else:
            for domain in domain_list:
                if domain in match_domain:
                    return make_token(incoming_address, db)
        for ip_range in allowed_ips:
            if re.search(r"^%s.*" % ip_range, incoming_address):
                print("PASS", file=sys.stderr)
                return make_token(incoming_address, db)

    # If no token returned, we block access.
    print("UNAUTHORIZED ACCESS TO: %s from domain %s" %
          (incoming_address, match_domain),
          file=sys.stderr)
    return ()
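
check_access reads three optional attributes from the access_config module it loads: domain_list, allowed_ips, and blocked_ips. A sketch of what such a file might contain; the attribute names come from the code above, while the values are made-up examples:

# access_config.py
domain_list = ["example.edu"]  # matched against the client's domain; substring matches count

allowed_ips = [
    "128.135.12.34",  # matched as a prefix against the incoming address
    "192.168.1.1-25",  # a range in the last octet is expanded to individual addresses
]

blocked_ips = []
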
Code Example #32
def get_neighboring_words(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)

    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)

    try:
        index = int(request.hits_done)
    except:
        index = 0

    max_time = int(request.max_time)

    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    cursor = db.dbh.cursor()

    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        query = 'select rowid, philo_name, parent from words where philo_id="%s" limit 1' % word_id
        cursor.execute(query)
        results = cursor.fetchone()

        highlighted_text = kwic_hit_object(hit, config, db)["highlighted_text"]
        highlighted_text = highlighted_text.translate(remove_punctuation_map)
        highlighted_text = highlighted_text.strip()

        result_obj = {
            "left": "",
            "right": "",
            "index": index,
            "q": highlighted_text
        }

        left_rowid = results["rowid"] - 10
        right_rowid = results["rowid"] + 10

        cursor.execute('select philo_name, philo_id from words where rowid between ? and ?',
                       (left_rowid, results['rowid']-1))
        result_obj["left"] = []
        for i in cursor:
            result_obj["left"].append(i['philo_name'])
        result_obj["left"].reverse()
        result_obj["left"] = ' '.join(result_obj["left"])

        cursor.execute('select philo_name, philo_id from words where rowid between ? and ?',
                       (results['rowid']+1, right_rowid))
        result_obj["right"] = []
        for i in cursor:
            result_obj["right"].append(i['philo_name'])
        result_obj["right"] = ' '.join(result_obj["right"])

        for metadata in config.kwic_metadata_sorting_fields:
            result_obj[metadata] = hit[metadata].lower()

        kwic_words.append(result_obj)

        index += 1

        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:  # avoid timeouts by splitting the query if more than max_time seconds have been spent in the loop
            break

    yield json.dumps({"results": kwic_words, "hits_done": index}).encode('utf8')
Code Example #33
def generate_time_series(request, config):
    db = DB(config.db_path + "/data/")
    time_series_object = {"query": dict([i for i in request]), "query_done": False}

    # Invalid date range
    if request.start_date == "invalid" or request.end_date == "invalid":
        time_series_object["results_length"] = 0
        time_series_object["more_results"] = False
        time_series_object["new_start_date"] = 0
        time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
        return time_series_object

    start_date, end_date = get_start_end_date(
        db, config, start_date=request.start_date or None, end_date=request.end_date or None
    )

    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure last date gets included in for loop below by adding one to last step
    for start in range(start_date, end_date + 1, interval):
        end = start + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}

        # Get date total count
        if interval != 1:  # interval is an int (cast above), so compare against an int
            end_range = start_range + (int(request["year_interval"]) - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                config.time_series_year_field,
                start_range,
                end_range,
            )
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)

        cursor = db.dbh.cursor()
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break

    time_series_object["results_length"] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object["more_results"] = False
    else:
        time_series_object["more_results"] = True
        time_series_object["new_start_date"] = last_date_done + int(request.year_interval)
    time_series_object["results"] = {"absolute_count": absolute_count, "date_count": date_counts}

    return time_series_object
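
generate_time_series resumes differently from the hit-offset reports: when it runs out of time it reports new_start_date, which the caller feeds back in as the next start_date. A sketch of that loop; the helper name is hypothetical, and attribute assignment on the request is assumed, as elsewhere:

def accumulate_time_series(request, config):
    """Run generate_time_series to completion, merging the per-range counts."""
    absolute_count, date_count = {}, {}
    total_hits = 0
    while True:
        partial = generate_time_series(request, config)
        # Date ranges never overlap across resumed calls, so update() is safe.
        absolute_count.update(partial["results"]["absolute_count"])
        date_count.update(partial["results"]["date_count"])
        total_hits += partial["results_length"]
        if not partial["more_results"]:
            return {"absolute_count": absolute_count, "date_count": date_count, "total_hits": total_hits}
        request.start_date = partial["new_start_date"]
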
Code Example #34
def filter_words_by_property(request, config):
    """Filter words by property"""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    concordance_object = {"query": dict([i for i in request])}

    # Do these need to be captured in wsgi_handler?
    word_property = request["word_property"]
    word_property_value = request["word_property_value"]
    word_property_total = request["word_property_total"]

    new_hitlist = []
    results = []
    position = 0
    more_pages = False

    if request.start == 0:
        start = 1
    else:
        start = request.start

    for hit in hits:
        # get my chunk of text
        hit_val = get_word_attrib(hit, word_property, db)

        if hit_val == word_property_value:
            position += 1
            if position < start:
                continue
            new_hitlist.append(hit)
            citation_hrefs = citation_links(db, config, hit)
            metadata_fields = {}
            for metadata in db.locals["metadata_fields"]:
                metadata_fields[metadata] = hit[metadata]
            citation = citations(hit, citation_hrefs, config)
            context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
            result_obj = {
                "philo_id": hit.philo_id,
                "citation": citation,
                "citation_links": citation_hrefs,
                "context": context,
                "metadata_fields": metadata_fields,
                "bytes": hit.bytes,
                "collocate_count": 1,
            }
            results.append(result_obj)

        if len(new_hitlist) == (request.results_per_page):
            more_pages = True
            break

    end = start + len(results) - 1
    if len(results) < request.results_per_page:
        word_property_total = end
    else:
        word_property_total = end + 1
    concordance_object["results"] = results
    concordance_object["query_done"] = hits.done
    concordance_object["results_length"] = word_property_total
    concordance_object["description"] = {
        "start": start,
        "end": end,
        "results_per_page": request.results_per_page,
        "more_pages": more_pages,
    }
    return concordance_object