def concordance_results(request, config):
    """Fetch concordances results."""
    db = DB(config.db_path + "/data/")
    if request.collocation_type:
        first_hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        second_hits = db.query(request["left"], request["method"], request["arg"], **request.metadata)
        hits = CombinedHitlist(first_hits, second_hits)
    else:
        hits = db.query(
            request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata
        )
    start, end, page_num = page_interval(request["results_per_page"], hits, request.start, request.end)

    concordance_object = {
        "description": {"start": start, "end": end, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals["default_object_level"],
    }

    formatting_regexes = []
    if config.concordance_formatting_regex:
        for pattern, replacement in config.concordance_formatting_regex:
            compiled_regex = re.compile(r"%s" % pattern)
            formatting_regexes.append((compiled_regex, replacement))
    results = []
    for hit in hits[start - 1 : end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals["metadata_fields"]:
            metadata_fields[metadata] = hit[metadata]
        citation = citations(hit, citation_hrefs, config, report="concordance")
        context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
        if formatting_regexes:
            for formatting_regex, replacement in formatting_regexes:
                context = formatting_regex.sub(r"%s" % replacement, context)
        result_obj = {
            "philo_id": hit.philo_id,
            "citation": citation,
            "citation_links": citation_hrefs,
            "context": context,
            "metadata_fields": metadata_fields,
            "bytes": hit.bytes,
        }
        results.append(result_obj)
    concordance_object["results"] = results
    concordance_object["results_length"] = len(hits)
    concordance_object["query_done"] = hits.done
    return concordance_object
def term_group(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    if not request["q"]:
        dump = json.dumps({"original_query": "", "term_groups": []})
    else:
        hits = db.query(
            request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata
        )
        parsed = parse_query(request.q)
        group = group_terms(parsed)
        all_groups = split_terms(group)
        term_groups = []
        for g in all_groups:
            term_group = ""
            not_started = False
            for kind, term in g:
                if kind == "NOT":
                    if not_started is False:
                        not_started = True
                        term_group += " NOT "
                elif kind == "OR":
                    term_group += "|"
                elif kind == "TERM":
                    term_group += " %s " % term
                elif kind == "QUOTE":
                    term_group += " %s " % term
            term_group = term_group.strip()
            term_groups.append(term_group)
        dump = json.dumps({"term_groups": term_groups, "original_query": request.original_q})
    yield dump.encode("utf8")
def lookup_word_service(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    cursor = db.dbh.cursor()

    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        context_size = config['concordance_length'] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes
        hit_span = hit.bytes[-1] - hit.bytes[0]
        length = context_size + hit_span + context_size
        bytes, start_byte = adjust_bytes(bytes, length)
        end_byte = start_byte + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":

        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        start_byte, end_byte = int(text_obj.start_byte), int(text_obj.end_byte)
        filename = text_obj.filename
#        print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id,request.selected, start_byte, end_byte, filename
    else:
        pass
#    print >> sys.stderr, "TOKEN", token, "BYTES: ", start_byte, end_byte, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, start_byte, end_byte, filename).encode('utf8')
def bibliography_results(request, config):
    """Fetch bibliography results"""
    db = DB(config.db_path + "/data/")
    if request.no_metadata:
        hits = db.get_all(db.locals["default_object_level"], request["sort_order"])
    else:
        hits = db.query(sort_order=request["sort_order"], **request.metadata)
    if (
        request.simple_bibliography == "all"
    ):  # request from simple landing page report which gets all biblio in load order
        hits.finish()
        start = 1
        end = len(hits)
        page_num = end
    else:
        start, end, page_num = page_interval(request.results_per_page, hits, request.start, request.end)
    bibliography_object = {
        "description": {"start": start, "end": end, "n": page_num, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals["default_object_level"],
    }
    results = []
    result_type = "doc"
    for hit in hits[start - 1 : end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals["metadata_fields"]:
            metadata_fields[metadata] = hit[metadata]
        result_type = hit.object_type
        if request.simple_bibliography == "all":
            citation = citations(hit, citation_hrefs, config, report="simple_landing")
        else:
            citation = citations(hit, citation_hrefs, config, report="bibliography", result_type=result_type)
        if config.dictionary_bibliography is False or result_type == "doc":
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "object_type": result_type,
                }
            )
        else:
            context = get_text_obj(hit, config, request, db.locals["token_regex"], images=False)
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "context": context,
                    "object_type": result_type,
                }
            )
    bibliography_object["results"] = results
    bibliography_object["results_length"] = len(hits)
    bibliography_object["query_done"] = hits.done
    bibliography_object["result_type"] = result_type
    return bibliography_object, hits
Beispiel #5
0
def kwic_results(request, config):
    """Fetch KWIC results"""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    start, end, n = page_interval(request.results_per_page, hits,
                                  request.start, request.end)
    kwic_object = {
        "description": {
            "start": start,
            "end": end,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request]),
    }
    kwic_object["results"] = []

    for hit in hits[start - 1:end]:
        kwic_result = kwic_hit_object(hit, config, db)
        kwic_object["results"].append(kwic_result)

    kwic_object["results_length"] = len(hits)
    kwic_object["query_done"] = hits.done

    return kwic_object
Beispiel #6
0
def get_start_end_date(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "text/html; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    start_date, end_date = start_end_date(db,
                                          config,
                                          start_date=request.start_date,
                                          end_date=request.end_date)
    request.metadata["year"] = "{}-{}".format(start_date, end_date)
    request["start_date"] = ""
    request["end_date"] = ""
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    total_results = 0
    hits.finish()
    total_results = len(hits)
    yield json.dumps({
        "start_date": start_date,
        "end_date": end_date,
        "total_results": total_results
    }).encode("utf8")
def term_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0]).encode("utf8")
def generate_word_frequency(request, config):
    """reads through a hitlist. looks up request["field"] in each hit, and builds up a list of
       unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start:]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it
                # recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break

        table = {}
        for k, v in counts.items():
            url = make_absolute_query_link(
                config,
                request,
                start="0",
                end="0",
                report="word_property_filter",
                word_property=field,
                word_property_value=k,
            )
            table[k] = {"count": v, "url": url}

        frequency_object["results"] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True

    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False

    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])

    return frequency_object
def get_total_results(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
        else:
            hits = db.query(sort_order=request["sort_order"], **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"],
                        **request.metadata)
    total_results = 0
    hits.finish()
    total_results = len(hits)

    yield json.dumps(total_results).encode('utf8')
def generate_word_frequency(request, config):
    """reads through a hitlist. looks up request["field"] in each hit, and builds up a list of
       unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start :]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it
                # recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break

        table = {}
        for k, v in counts.items():
            url = make_absolute_query_link(
                config,
                request,
                start="0",
                end="0",
                report="word_property_filter",
                word_property=field,
                word_property_value=k,
            )
            table[k] = {"count": v, "url": url}

        frequency_object["results"] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True

    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False

    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])

    return frequency_object
def term_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0]).encode("utf8")
def get_more_context(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    context_size = config['concordance_length'] * 3
    hit_context = get_concordance_text(db, hits[hit_num], config.db_path,
                                       context_size)
    yield json.dumps(hit_context).encode('utf8')
Beispiel #13
0
def get_total_results(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'],
                              request["sort_order"])
        else:
            hits = db.query(sort_order=request["sort_order"],
                            **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"],
                        **request.metadata)
    total_results = 0
    hits.finish()
    total_results = len(hits)

    yield json.dumps(total_results).encode('utf8')
def get_more_context(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    context_size = config['concordance_length'] * 3
    hit_context = get_concordance_text(db, hits[hit_num], config.db_path,
                                       context_size)
    yield json.dumps(hit_context).encode('utf8')
def get_start_end_date(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "text/html; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace("scripts", ""))
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(environ, config)
    start_date, end_date = start_end_date(db, config, start_date=request.start_date, end_date=request.end_date)
    request.metadata["year"] = "{}-{}".format(start_date, end_date)
    request["start_date"] = ""
    request["end_date"] = ""
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    total_results = 0
    hits.finish()
    total_results = len(hits)
    yield json.dumps({"start_date": start_date, "end_date": end_date, "total_results": total_results}).encode("utf8")
Beispiel #16
0
def kwic_results(request, config):
    """Fetch KWIC results"""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    start, end, n = page_interval(request.results_per_page, hits, request.start, request.end)
    kwic_object = {
        "description": {"start": start, "end": end, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
    }
    kwic_object["results"] = []

    for hit in hits[start - 1 : end]:
        kwic_result = kwic_hit_object(hit, config, db)
        kwic_object["results"].append(kwic_result)

    kwic_object["results_length"] = len(hits)
    kwic_object["query_done"] = hits.done

    return kwic_object
def lookup_word_service(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    cursor = db.dbh.cursor()

    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"],
                        **request.metadata)
        context_size = config['concordance_length'] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes
        hit_span = hit.bytes[-1] - hit.bytes[0]
        length = context_size + hit_span + context_size
        bytes, start_byte = adjust_bytes(bytes, length)
        end_byte = start_byte + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":

        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        start_byte, end_byte = int(text_obj.start_byte), int(text_obj.end_byte)
        filename = text_obj.filename
#        print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id,request.selected, start_byte, end_byte, filename
    else:
        pass


#    print >> sys.stderr, "TOKEN", token, "BYTES: ", start_byte, end_byte, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, start_byte, end_byte,
                      filename).encode('utf8')
Beispiel #18
0
def get_neighboring_words(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)

    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)

    try:
        index = int(request.hits_done)
    except:
        index = 0

    max_time = int(request.max_time)

    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    cursor = db.dbh.cursor()

    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        query = 'select rowid, philo_name, parent from words where philo_id="%s" limit 1' % word_id
        cursor.execute(query)
        results = cursor.fetchone()

        highlighted_text = kwic_hit_object(hit, config, db)["highlighted_text"]
        highlighted_text = highlighted_text.translate(remove_punctuation_map)
        highlighted_text = highlighted_text.strip()

        result_obj = {
            "left": "",
            "right": "",
            "index": index,
            "q": highlighted_text
        }

        left_rowid = results["rowid"] - 10
        right_rowid = results["rowid"] + 10

        cursor.execute(
            'select philo_name, philo_id from words where rowid between ? and ?',
            (left_rowid, results['rowid'] - 1))
        result_obj["left"] = []
        for i in cursor:
            result_obj["left"].append(i['philo_name'])
        result_obj["left"].reverse()
        result_obj["left"] = ' '.join(result_obj["left"])

        cursor.execute(
            'select philo_name, philo_id from words where rowid between ? and ?',
            (results['rowid'] + 1, right_rowid))
        result_obj["right"] = []
        for i in cursor:
            result_obj["right"].append(i['philo_name'])
        result_obj["right"] = ' '.join(result_obj["right"])

        for metadata in config.kwic_metadata_sorting_fields:
            result_obj[metadata] = hit[metadata].lower()

        kwic_words.append(result_obj)

        index += 1

        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:  # avoid timeouts by splitting the query if more than 10 seconds has been spent in the loop
            break

    yield json.dumps({
        "results": kwic_words,
        "hits_done": index
    }).encode('utf8')
Beispiel #19
0
def filter_words_by_property(request, config):
    """Filter words by property"""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    concordance_object = {"query": dict([i for i in request])}

    # Do these need to be captured in wsgi_handler?
    word_property = request["word_property"]
    word_property_value = request["word_property_value"]
    word_property_total = request["word_property_total"]

    new_hitlist = []
    results = []
    position = 0
    more_pages = False

    if request.start == 0:
        start = 1
    else:
        start = request.start

    for hit in hits:
        # get my chunk of text
        hit_val = get_word_attrib(hit, word_property, db)

        if hit_val == word_property_value:
            position += 1
            if position < start:
                continue
            new_hitlist.append(hit)
            citation_hrefs = citation_links(db, config, hit)
            metadata_fields = {}
            for metadata in db.locals["metadata_fields"]:
                metadata_fields[metadata] = hit[metadata]
            citation = citations(hit, citation_hrefs, config)
            context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
            result_obj = {
                "philo_id": hit.philo_id,
                "citation": citation,
                "citation_links": citation_hrefs,
                "context": context,
                "metadata_fields": metadata_fields,
                "bytes": hit.bytes,
                "collocate_count": 1,
            }
            results.append(result_obj)

        if len(new_hitlist) == (request.results_per_page):
            more_pages = True
            break

    end = start + len(results) - 1
    if len(results) < request.results_per_page:
        word_property_total = end
    else:
        word_property_total = end + 1
    concordance_object["results"] = results
    concordance_object["query_done"] = hits.done
    concordance_object["results_length"] = word_property_total
    concordance_object["description"] = {
        "start": start,
        "end": end,
        "results_per_page": request.results_per_page,
        "more_pages": more_pages,
    }
    return concordance_object
def collocation_results(request, config):
    """Fetch collocation results"""
    db = DB(config.db_path + "/data/")
    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request["collocate_distance"]), **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"], **request.metadata)
    hits.finish()
    collocation_object = {"query": dict([i for i in request])}

    try:
        collocate_distance = int(request["collocate_distance"])
    except ValueError:  # Getting an empty string since the keyword is not specificed in the URL
        collocate_distance = None

    if request.colloc_filter_choice == "nofilter":
        filter_list = []
    else:
        filter_list = build_filter_list(request, config)
    collocation_object["filter_list"] = filter_list
    filter_list = set(filter_list)

    # Build list of search terms to filter out
    query_words = []
    for group in get_expanded_query(hits):
        for word in group:
            word = word.replace('"', "")
            query_words.append(word)
    query_words = set(query_words)
    filter_list = filter_list.union(query_words)

    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request["collocate_distance"]), raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], "proxy", request["arg"], raw_results=True, **request.metadata)
    hits.finish()

    stored_sentence_id = None
    stored_sentence_counts = defaultdict(int)
    sentence_hit_count = 1
    hits_done = request.start or 0
    max_time = request.max_time or 10
    all_collocates = defaultdict(lambda: {"count": 0})
    cursor = db.dbh.cursor()
    start_time = timeit.default_timer()
    try:
        for hit in hits[hits_done:]:
            word_id = " ".join([str(i) for i in hit[:6]]) + " " + str(hit[7])
            query = """select parent, rowid from words where philo_id='%s' limit 1""" % word_id
            cursor.execute(query)
            result = cursor.fetchone()
            parent = result["parent"]
            if parent != stored_sentence_id:
                rowid = int(result["rowid"])
                sentence_hit_count = 1
                stored_sentence_id = parent
                stored_sentence_counts = defaultdict(int)
                if collocate_distance:
                    begin_rowid = rowid - collocate_distance
                    if begin_rowid < 0:
                        begin_rowid = 0
                    end_rowid = rowid + collocate_distance
                    row_query = """select philo_name from words where parent='%s' and rowid between %d and %d""" % (
                        parent,
                        begin_rowid,
                        end_rowid,
                    )
                else:
                    row_query = """select philo_name from words where parent='%s'""" % (parent,)
                cursor.execute(row_query)
                for i in cursor:
                    collocate = i["philo_name"]
                    if collocate not in filter_list:
                        stored_sentence_counts[collocate] += 1
            else:
                sentence_hit_count += 1
            for word in stored_sentence_counts:
                if stored_sentence_counts[word] < sentence_hit_count:
                    continue
                all_collocates[word]["count"] += 1
            hits_done += 1
            elapsed = timeit.default_timer() - start_time
            # avoid timeouts by splitting the query if more than request.max_time (in
            # seconds) has been spent in the loop
            if elapsed > int(max_time):
                break
    except IndexError:
        collocation_object["hits_done"] = len(hits)

    collocation_object["collocates"] = all_collocates
    collocation_object["results_length"] = len(hits)
    if hits_done < collocation_object["results_length"]:
        collocation_object["more_results"] = True
        collocation_object["hits_done"] = hits_done
    else:
        collocation_object["more_results"] = False
        collocation_object["hits_done"] = collocation_object["results_length"]

    return collocation_object
def generate_time_series(request, config):
    db = DB(config.db_path + "/data/")
    time_series_object = {
        "query": dict([i for i in request]),
        "query_done": False
    }

    # Invalid date range
    if request.start_date == "invalid" or request.end_date == "invalid":
        time_series_object["results_length"] = 0
        time_series_object["more_results"] = False
        time_series_object["new_start_date"] = 0
        time_series_object["results"] = {
            "absolute_count": {},
            "date_count": {}
        }
        return time_series_object

    start_date, end_date = get_start_end_date(db,
                                              config,
                                              start_date=request.start_date
                                              or None,
                                              end_date=request.end_date
                                              or None)

    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure last date gets included in for loop below by adding one to last step
    for start in range(start_date, end_date + 1, interval):
        end = start + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        raw_results=True,
                        **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {
            "label": start_range,
            "count": hit_len,
            "url": url
        }

        # Get date total count
        if interval != "1":
            end_range = start_range + (int(request["year_interval"]) - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                config.time_series_year_field,
                start_range,
                end_range,
            )
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (
                config.time_series_year_field, start_range)

        cursor = db.dbh.cursor()
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break

    time_series_object["results_length"] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object["more_results"] = False
    else:
        time_series_object["more_results"] = True
        time_series_object["new_start_date"] = last_date_done + int(
            request.year_interval)
    time_series_object["results"] = {
        "absolute_count": absolute_count,
        "date_count": date_counts
    }

    return time_series_object
Beispiel #22
0
def frequency_results(request, config, sorted_results=False):
    """reads through a hitlist. looks up request.frequency_field in each hit, and builds up a list of
       unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    biblio_search = False
    if request.q == "" and request.no_q:
        biblio_search = True
        if request.no_metadata:
            hits = db.get_all(db.locals["default_object_level"],
                              sort_order=["rowid"],
                              raw_results=True)
        else:
            hits = db.query(sort_order=["rowid"],
                            raw_results=True,
                            **request.metadata)
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        raw_results=True,
                        **request.metadata)

    if sorted_results is True:
        hits.finish()

    cursor = db.dbh.cursor()

    cursor.execute("select philo_id, %s from toms where %s is not null" %
                   (request.frequency_field, request.frequency_field))
    metadata_dict = {}
    for i in cursor:
        philo_id, field = i
        philo_id = tuple(int(s) for s in philo_id.split() if int(s))
        metadata_dict[philo_id] = field

    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start

    obj_dict = {
        "doc": 1,
        "div1": 2,
        "div2": 3,
        "div3": 4,
        "para": 5,
        "sent": 6,
        "word": 7
    }
    metadata_type = db.locals["metadata_types"][request.frequency_field]
    try:
        object_level = obj_dict[metadata_type]
    except KeyError:
        # metadata_type == "div"
        pass

    try:
        for philo_id in hits[request.start:]:
            if not biblio_search:
                philo_id = tuple(list(philo_id[:6]) + [philo_id[7]])
            if metadata_type == "div":
                key = ""
                for div in ["div1", "div2", "div3"]:
                    if philo_id[:obj_dict[div]] in metadata_dict:
                        key = metadata_dict[philo_id[:obj_dict[div]]]
                while not key:
                    if philo_id[:4] in metadata_dict:
                        key = metadata_dict[philo_id[:4]]
                        break
                    if philo_id[:5] in metadata_dict:
                        key = metadata_dict[philo_id[:5]]
                        break
                    break
                if not key:
                    last_hit_done += 1
                    continue
            else:
                try:
                    key = metadata_dict[philo_id[:object_level]]
                except:
                    last_hit_done += 1
                    continue
            if key not in counts:
                counts[key] = {
                    "count": 0,
                    "metadata": {
                        request.frequency_field: key
                    }
                }
                counts[key]["url"] = make_absolute_query_link(
                    config,
                    request,
                    frequency_field="",
                    start="0",
                    end="0",
                    report=request.report,
                    script="",
                    **{request.frequency_field: '"%s"' % key})
                if not biblio_search:
                    query_metadata = dict([
                        (k, v) for k, v in request.metadata.items() if v
                    ])
                    query_metadata[request.frequency_field] = '"%s"' % key
                    local_hits = db.query(**query_metadata)
                    counts[key][
                        "total_word_count"] = local_hits.get_total_word_count(
                        )
            counts[key]["count"] += 1

            # avoid timeouts by splitting the query if more than
            # request.max_time (in seconds) has been spent in the loop
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5 and sorted_results is False:
                break

        frequency_object["results"] = counts
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            new_metadata = dict([(k, v) for k, v in request.metadata.items()
                                 if v])
            new_metadata[request.frequency_field] = '"NULL"'
            if request.q == "" and request.no_q:
                new_hits = db.query(sort_order=["rowid"],
                                    raw_results=True,
                                    **new_metadata)
            else:
                new_hits = db.query(request["q"],
                                    request["method"],
                                    request["arg"],
                                    raw_results=True,
                                    **new_metadata)
            new_hits.finish()
            if len(new_hits):
                null_url = make_absolute_query_link(
                    config,
                    request,
                    frequency_field="",
                    start="0",
                    end="0",
                    report=request.report,
                    script="",
                    **{request.frequency_field: '"NULL"'})
                local_hits = db.query(**new_metadata)
                if not biblio_search:
                    frequency_object["results"]["NULL"] = {
                        "count": len(new_hits),
                        "url": null_url,
                        "metadata": {
                            request.frequency_field: '"NULL"'
                        },
                        "total_word_count": local_hits.get_total_word_count(),
                    }
                else:
                    frequency_object["results"]["NULL"] = {
                        "count": len(new_hits),
                        "url": null_url,
                        "metadata": {
                            request.frequency_field: '"NULL"'
                        },
                    }
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])

    if sorted_results is True:
        frequency_object["results"] = sorted(
            frequency_object["results"].items(),
            key=lambda x: x[1]["count"],
            reverse=True)

    return frequency_object
def generate_time_series(request, config):
    db = DB(config.db_path + "/data/")
    time_series_object = {"query": dict([i for i in request]), "query_done": False}

    # Invalid date range
    if request.start_date == "invalid" or request.end_date == "invalid":
        time_series_object["results_length"] = 0
        time_series_object["more_results"] = False
        time_series_object["new_start_date"] = 0
        time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
        return time_series_object

    start_date, end_date = get_start_end_date(
        db, config, start_date=request.start_date or None, end_date=request.end_date or None
    )

    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure last date gets included in for loop below by adding one to last step
    for start in range(start_date, end_date + 1, interval):
        end = start + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}

        # Get date total count
        if interval != "1":
            end_range = start_range + (int(request["year_interval"]) - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                config.time_series_year_field,
                start_range,
                end_range,
            )
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)

        cursor = db.dbh.cursor()
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break

    time_series_object["results_length"] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object["more_results"] = False
    else:
        time_series_object["more_results"] = True
        time_series_object["new_start_date"] = last_date_done + int(request.year_interval)
    time_series_object["results"] = {"absolute_count": absolute_count, "date_count": date_counts}

    return time_series_object
def get_neighboring_words(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)

    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)

    try:
        index = int(request.hits_done)
    except:
        index = 0

    max_time = int(request.max_time)

    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    cursor = db.dbh.cursor()

    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        query = 'select rowid, philo_name, parent from words where philo_id="%s" limit 1' % word_id
        cursor.execute(query)
        results = cursor.fetchone()

        highlighted_text = kwic_hit_object(hit, config, db)["highlighted_text"]
        highlighted_text = highlighted_text.translate(remove_punctuation_map)
        highlighted_text = highlighted_text.strip()

        result_obj = {
            "left": "",
            "right": "",
            "index": index,
            "q": highlighted_text
        }

        left_rowid = results["rowid"] - 10
        right_rowid = results["rowid"] + 10

        cursor.execute('select philo_name, philo_id from words where rowid between ? and ?',
                       (left_rowid, results['rowid']-1))
        result_obj["left"] = []
        for i in cursor:
            result_obj["left"].append(i['philo_name'])
        result_obj["left"].reverse()
        result_obj["left"] = ' '.join(result_obj["left"])

        cursor.execute('select philo_name, philo_id from words where rowid between ? and ?',
                       (results['rowid']+1, right_rowid))
        result_obj["right"] = []
        for i in cursor:
            result_obj["right"].append(i['philo_name'])
        result_obj["right"] = ' '.join(result_obj["right"])

        for metadata in config.kwic_metadata_sorting_fields:
            result_obj[metadata] = hit[metadata].lower()

        kwic_words.append(result_obj)

        index += 1

        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:  # avoid timeouts by splitting the query if more than 10 seconds has been spent in the loop
            break

    yield json.dumps({"results": kwic_words, "hits_done": index}).encode('utf8')