Example #1
def concordance_results(request, config):
    """Fetch concordances results."""
    db = DB(config.db_path + '/data/')
    if request.collocation_type:
        first_hits = db.query(request["q"], request["method"], request["arg"],
                              **request.metadata)
        second_hits = db.query(request["left"], request["method"],
                               request["arg"], **request.metadata)
        hits = CombinedHitlist(first_hits, second_hits)
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        sort_order=request["sort_order"],
                        **request.metadata)
    start, end, page_num = page_interval(request['results_per_page'], hits,
                                         request.start, request.end)

    concordance_object = {
        "description": {
            "start": start,
            "end": end,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request]),
        "default_object": db.locals['default_object_level']
    }

    formatting_regexes = []
    if config.concordance_formatting_regex:
        for pattern, replacement in config.concordance_formatting_regex:
            compiled_regex = re.compile(r'%s' % pattern)
            formatting_regexes.append((compiled_regex, replacement))
    results = []
    for hit in hits[start - 1:end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata]
        citation = citations(hit, citation_hrefs, config, report="concordance")
        context = get_concordance_text(db, hit, config.db_path,
                                       config.concordance_length)
        if formatting_regexes:
            for formatting_regex, replacement in formatting_regexes:
                context = formatting_regex.sub(r'%s' % replacement, context)
        result_obj = {
            "philo_id": hit.philo_id,
            "citation": citation,
            "citation_links": citation_hrefs,
            "context": context,
            "metadata_fields": metadata_fields,
            "bytes": hit.bytes
        }
        results.append(result_obj)
    concordance_object["results"] = results
    concordance_object['results_length'] = len(hits)
    concordance_object["query_done"] = hits.done
    return concordance_object
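For orientation: concordance_results consumes the request and config objects that the WSGI examples further down construct. A minimal sketch of wiring it into an endpoint, assuming the WebConfig and WSGIHandler signatures used in Example #7 (the endpoint name here is made up):

def concordance_service(environ, start_response):
    # Hypothetical endpoint; the setup mirrors the WSGI handlers below.
    start_response('200 OK', [('Content-type', 'application/json; charset=UTF-8')])
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    request = WSGIHandler(environ, config)
    yield json.dumps(concordance_results(request, config)).encode('utf8')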
Example #2
def concordance_results(request, config):
    """Fetch concordances results."""
    db = DB(config.db_path + '/data/')
    if request.collocation_type:
        first_hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        second_hits = db.query(request["left"], request["method"], request["arg"], **request.metadata)
        hits = CombinedHitlist(first_hits, second_hits)
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        sort_order=request["sort_order"],
                        **request.metadata)
    start, end, page_num = page_interval(request['results_per_page'], hits, request.start, request.end)

    concordance_object = {
        "description": {"start": start,
                        "end": end,
                        "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals['default_object_level']
    }

    formatting_regexes = []
    if config.concordance_formatting_regex:
        for pattern, replacement in config.concordance_formatting_regex:
            compiled_regex = re.compile(r'%s' % pattern)
            formatting_regexes.append((compiled_regex, replacement))
    results = []
    for hit in hits[start - 1:end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata]
        citation = citations(hit, citation_hrefs, config, report="concordance")
        context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
        if formatting_regexes:
            for formatting_regex, replacement in formatting_regexes:
                context = formatting_regex.sub(r'%s' % replacement, context)
        result_obj = {
            "philo_id": hit.philo_id,
            "citation": citation,
            "citation_links": citation_hrefs,
            "context": context,
            "metadata_fields": metadata_fields,
            "bytes": hit.bytes
        }
        results.append(result_obj)
    concordance_object["results"] = results
    concordance_object['results_length'] = len(hits)
    concordance_object["query_done"] = hits.done
    return concordance_object
Example #3
def kwic_results(request, config):
    """Fetch KWIC results"""
    db = DB(config.db_path + '/data/')
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    start, end, n = page_interval(request.results_per_page, hits,
                                  request.start, request.end)
    kwic_object = {
        "description": {
            "start": start,
            "end": end,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request])
    }
    kwic_object['results'] = []

    for hit in hits[start - 1:end]:
        kwic_result = kwic_hit_object(hit, config, db)
        kwic_object['results'].append(kwic_result)

    kwic_object['results_length'] = len(hits)
    kwic_object["query_done"] = hits.done

    return kwic_object
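Both report functions share a pagination convention: page_interval returns a 1-based, inclusive (start, end) interval, which hits[start - 1:end] converts into a 0-based Python slice. A small standalone illustration (plain Python, not PhiloLogic code):

# Second page of 25 results: page_interval would yield start=26, end=50.
hits = list(range(100))      # stand-in for a hitlist
start, end = 26, 50
page = hits[start - 1:end]   # slice covers items 25..49
assert len(page) == 25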
Example #4
def lookup_word_service(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    cursor = db.dbh.cursor()

    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        context_size = config['concordance_length'] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes
        hit_span = hit.bytes[-1] - hit.bytes[0]
        length = context_size + hit_span + context_size
        bytes, start_byte = adjust_bytes(bytes, length)
        end_byte = start_byte + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":

        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        start_byte, end_byte = int(text_obj.start_byte), int(text_obj.end_byte)
        filename = text_obj.filename
#        print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id,request.selected, start_byte, end_byte, filename
    else:
        pass
#    print >> sys.stderr, "TOKEN", token, "BYTES: ", start_byte, end_byte, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, start_byte, end_byte, filename)
Example #5
def get_metadata_token_count(environ,start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),("Access-Control-Allow-Origin","*")]
    start_response(status,headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    results = ''
    frequencies = json.loads(environ['wsgi.input'].read())
    word_counts = []
    c = db.dbh.cursor()
    count = 0
    for label, m in frequencies.iteritems():
        args = []
        query_metadata = {}
        for metadata in m['metadata']:
            query_metadata[metadata] = m['metadata'][metadata].encode('utf-8')
        hits = db.query(**query_metadata)
        total_count = 0
        for hit in hits:
            total_count += int(hit['word_count'])
        try:
            frequencies[label]['count'] = round(float(m['count']) / total_count * 1000000, 3)
        except:
            count += 1
            frequencies[label]['count'] = 0    
        
    yield json.dumps(frequencies)
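The try block above normalizes each label's raw count to occurrences per million words (falling back to 0 when the metadata query matches no words). The arithmetic, worked through with made-up numbers:

# 42 hits against a corpus slice of 3,000,000 words:
round(float(42) / 3000000 * 1000000, 3)   # -> 14.0 per million words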
Example #6
def get_frequency(environ,start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),("Access-Control-Allow-Origin","*")]
    start_response(status,headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    if request.q == '' and request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'])
        else:
            hits = db.query(**request.metadata)
    else:
        hits = db.query(request["q"],request["method"],request["arg"],**request.metadata)
    results = r.generate_frequency(hits, request, db, config)
    yield json.dumps(results)
Example #7
def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if not request["q"]:
        dump = json.dumps({"original_query": "", "term_groups": []})
    else:
        hits = db.query(request["q"], request["method"], request["arg"], sort_order=request["sort_order"], **request.metadata)
        parsed = parse_query(request.q)
        group = group_terms(parsed)
        all_groups = split_terms(group)
        term_groups = []
        for g in all_groups:
            term_group = ''
            not_started = False
            for kind, term in g:
                if kind == 'NOT':
                    if not_started is False:
                        not_started = True
                        term_group += ' NOT '
                elif kind == 'OR':
                    term_group += '|'
                elif kind == "TERM":
                    term_group += ' %s ' % term
                elif kind == "QUOTE":
                    term_group += ' %s ' % term
            term_group = term_group.strip()
            term_groups.append(term_group)
        dump = json.dumps({"term_groups": term_groups, "original_query": request.original_q})
    yield dump.encode('utf8')
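The inner loop only renders parsed tokens back into display strings: TERM and QUOTE tokens are space-joined, OR becomes '|', and the first NOT in a group inserts ' NOT '. An isolated illustration with a made-up token group (real groups come from parse_query, group_terms, and split_terms):

group = [('TERM', 'pacte'), ('OR', ''), ('TERM', 'convention')]
term_group = ''
for kind, term in group:
    if kind == 'OR':
        term_group += '|'
    elif kind in ('TERM', 'QUOTE'):
        term_group += ' %s ' % term
print(term_group.strip())   # -> pacte | convention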
Example #8
def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    parsed = parse_query(request.q)
    group = group_terms(parsed)
    all_groups = split_terms(group)
    term_groups = []
    for g in all_groups:
        term_group = ''
        not_started = False
        for kind, term in g:
            if kind == 'NOT':
                if not_started is False:
                    not_started = True
                    term_group += ' NOT '
            elif kind == 'OR':
                term_group += '|'
            elif kind == "TERM":
                term_group += ' %s ' % term
            elif kind == "QUOTE":
                term_group += ' %s ' % term
        term_group = term_group.strip()
        term_groups.append(term_group)
    yield json.dumps(term_groups)
Example #9
def lookup_word_service(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(db, environ)
    cursor = db.dbh.cursor()

    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        context_size = config["concordance_length"] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes
        hit_span = hit.bytes[-1] - hit.bytes[0]
        length = context_size + hit_span + context_size
        bytes, byte_start = adjust_bytes(bytes, length)
        byte_end = byte_start + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":

        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        byte_start, byte_end = int(text_obj.byte_start), int(text_obj.byte_end)
        filename = text_obj.filename
    #        print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id,request.selected, byte_start, byte_end, filename
    else:
        pass
    #    print >> sys.stderr, "TOKEN", token, "BYTES: ", byte_start, byte_end, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, byte_start, byte_end, filename)
Example #10
def lookup_word_service(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    cursor = db.dbh.cursor()

    if request.report == "concordance":
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        context_size = config['concordance_length'] * 3
        hit = hits[int(request.position)]
        bytes = hit.bytes
        hit_span = hit.bytes[-1] - hit.bytes[0]
        length = context_size + hit_span + context_size
        bytes, start_byte = adjust_bytes(bytes, length)
        end_byte = start_byte + length
        filename = hit.filename
        token = request.selected
    elif request.report == "navigation":

        token = request.selected
        philo_id = request.philo_id.split(" ")
        text_obj = db[philo_id]
        start_byte, end_byte = int(text_obj.start_byte), int(text_obj.end_byte)
        filename = text_obj.filename
#        print >> sys.stderr, "WORD LOOKUP FROM NAVIGATION", request.philo_id,request.selected, start_byte, end_byte, filename
    else:
        pass
#    print >> sys.stderr, "TOKEN", token, "BYTES: ", start_byte, end_byte, "FILENAME: ", filename, "POSITION", request.position
    token_n = 0
    yield lookup_word(db, cursor, token, token_n, start_byte, end_byte, filename)
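In the concordance branch, the byte window handed to adjust_bytes is the hit's own span padded by three times the configured concordance length on each side. With assumed numbers:

context_size = 300 * 3                            # concordance_length of 300
hit_span = 24                                     # hit.bytes[-1] - hit.bytes[0]
length = context_size + hit_span + context_size   # 1824-byte window around the hit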
Example #11
def bibliography_results(request, config):
    """Fetch bibliography results"""
    db = DB(config.db_path + '/data/')
    if request.no_metadata:
        hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
    else:
        hits = db.query(sort_order=request["sort_order"], **request.metadata)
    if request.simple_bibliography == "all": # request from simple landing page report which gets all biblio in load order
        hits.finish()
        start = 1
        end = len(hits)
        page_num = end
    else:
        start, end, page_num = page_interval(request.results_per_page, hits, request.start, request.end)
    bibliography_object = {
        "description": {
            "start": start,
            "end": end,
            "n": page_num,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request]),
        "default_object": db.locals['default_object_level']
    }
    results = []
    result_type = "doc"
    for hit in hits[start - 1:end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata]
        result_type = hit.object_type
        if request.simple_bibliography == "all":
            citation = citations(hit, citation_hrefs, config, report="simple_landing")
        else:
            citation = citations(hit, citation_hrefs, config, report="bibliography", result_type=result_type)
        if config.dictionary_bibliography is False or result_type == "doc":
            results.append({
                'citation': citation,
                'citation_links': citation_hrefs,
                'philo_id': hit.philo_id,
                "metadata_fields": metadata_fields,
                "object_type": result_type
            })
        else:
            context = get_text_obj(hit, config, request, db.locals["token_regex"], images=False)
            results.append({
                'citation': citation,
                'citation_links': citation_hrefs,
                'philo_id': hit.philo_id,
                "metadata_fields": metadata_fields,
                "context": context,
                "object_type": result_type
            })
    bibliography_object["results"] = results
    bibliography_object['results_length'] = len(hits)
    bibliography_object['query_done'] = hits.done
    bibliography_object['result_type'] = result_type
    return bibliography_object, hits
Example #12
def get_frequency(environ,start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),("Access-Control-Allow-Origin","*")]
    start_response(status,headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    hits = db.query(request["q"],request["method"],request["arg"],**request.metadata)
    field, word_frequency_object = generate_word_frequency(hits,request,db,config)
    yield json.dumps(word_frequency_object, indent=2)
Example #13
def collocation(environ,start_response):
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    headers = [('Content-type', 'application/json; charset=UTF-8'),("Access-Control-Allow-Origin","*")]
    start_response('200 OK',headers)
    hits = db.query(request["q"],"cooc",request["arg"],**request.metadata)
    hits.finish()
    collocation_object = fetch_collocation(hits, request, db, config)
    yield json.dumps(collocation_object)
Example #14
def word_property_filter(environ,start_response):
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    headers = [('Content-type', 'application/json; charset=UTF-8'),("Access-Control-Allow-Origin","*")]
    start_response('200 OK',headers)
    
    hits = db.query(request["q"],request["method"],request["arg"],**request.metadata)
    filter_results = filter_words_by_property(hits, config.db_path, request, db, config)
    yield json.dumps(filter_results)
Example #15
def term_list(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + "/data/")
    q = WSGIHandler(db, environ)
    hits = db.query(q["q"], q["method"], q["arg"], **q.metadata)
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0])
Example #16
def generate_word_frequency(request, config):
    """reads through a hitlist. looks up request["field"] in each hit, and builds up a list of
       unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start:]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it
                # recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break

        table = {}
        for k, v in counts.items():
            url = make_absolute_query_link(
                config,
                request,
                start="0",
                end="0",
                report="word_property_filter",
                word_property=field,
                word_property_value=k,
            )
            table[k] = {"count": v, "url": url}

        frequency_object["results"] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True

    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False

    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])

    return frequency_object
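Several functions in this collection use the same incremental-results pattern as generate_word_frequency: process hits until a time budget runs out, report hits_done, and let the client resume with start=hits_done. The pattern in isolation (a generic sketch, not PhiloLogic code):

import timeit

def process_with_budget(items, start=0, max_seconds=5):
    """Process items[start:] until the budget elapses; report progress."""
    done = start
    t0 = timeit.default_timer()
    for item in items[start:]:
        # ... per-item work goes here ...
        done += 1
        if timeit.default_timer() - t0 > max_seconds:
            break
    return {'hits_done': done, 'more_results': done < len(items)}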
Example #17
def get_total_results(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'])
        else:
            hits = db.query(**request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"],
                        **request.metadata)
    total_results = 0
    hits.finish()
    total_results = len(hits)

    yield simplejson.dumps(total_results)
Example #18
def get_total_results(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
        else:
            hits = db.query(sort_order=request["sort_order"], **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"],
                        **request.metadata)
    total_results = 0
    hits.finish()
    total_results = len(hits)

    yield simplejson.dumps(total_results)
Example #19
def get_more_context(environ, start_response):
    status = "200 OK"
    headers = [("Content-type", "application/json; charset=UTF-8"), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + "/data/")
    request = WSGIHandler(db, environ)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    context_size = config["concordance_length"] * 3
    hit_context = r.fetch_concordance(db, hits[hit_num], config.db_path, context_size)
    yield json.dumps(hit_context)
Example #20
def generate_word_frequency(request, config):
    """reads through a hitlist. looks up request["field"] in each hit, and builds up a list of
       unique values and their frequencies."""
    db = DB(config.db_path + '/data/')
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    field = request["field"]
    counts = {}
    frequency_object = {}
    more_results = True
    start_time = timeit.default_timer()
    last_hit_done = request.start
    try:
        for n in hits[request.start:]:
            key = get_word_attrib(n, field, db)
            if not key:
                # NULL is a magic value for queries, don't change it
                # recklessly.
                key = "NULL"
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break

        table = {}
        for k, v in counts.iteritems():
            url = make_absolute_query_link(config,
                                           request,
                                           start="0",
                                           end="0",
                                           report="word_property_filter",
                                           word_property=field,
                                           word_property_value=k)
            table[k] = {'count': v, 'url': url}

        frequency_object['results'] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object['more_results'] = False
        else:
            frequency_object['more_results'] = True

    except IndexError:
        frequency_object['results'] = {}
        frequency_object['more_results'] = False

    frequency_object['results_length'] = len(hits)
    frequency_object['query'] = dict([i for i in request])

    return frequency_object
Example #21
def term_list(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield simplejson.dumps(expanded_terms[0])
Example #22
def term_list(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0])
Example #23
def term_list(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    hits.finish()
    expanded_terms = get_expanded_query(hits)
    yield json.dumps(expanded_terms[0]).encode('utf8')
Example #24
def get_more_context(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    context_size = config['concordance_length'] * 3
    hit_context = get_concordance_text(db, hits[hit_num], config.db_path,
                                       context_size)
    yield json.dumps(hit_context).encode('utf8')
Example #25
def get_frequency(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)
    setattr(request, 'frequency_field', json.dumps(
        eval('"%s"' % request.frequency_field)))
    if request.q == '' and request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'])
        else:
            hits = db.query(**request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"],
                        **request.metadata)
    hits.finish()
    results = r.generate_frequency(hits, request, db, config)
    results['results'] = sorted(results['results'].iteritems(),
                                key=lambda x: x[1]['count'],
                                reverse=True)
    yield json.dumps(results)
Example #26
def get_more_context(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    hit_num = int(request.hit_num)
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    context_size = config['concordance_length'] * 3
    hit_context = get_concordance_text(db, hits[hit_num], config.db_path,
                                       context_size)
    yield simplejson.dumps(hit_context)
Example #27
def term_group(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)
    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)
    if not request["q"]:
        dump = json.dumps({"original_query": "", "term_groups": []})
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        sort_order=request["sort_order"],
                        **request.metadata)
        parsed = parse_query(request.q)
        group = group_terms(parsed)
        all_groups = split_terms(group)
        term_groups = []
        for g in all_groups:
            term_group = ''
            not_started = False
            for kind, term in g:
                if kind == 'NOT':
                    if not_started is False:
                        not_started = True
                        term_group += ' NOT '
                elif kind == 'OR':
                    term_group += '|'
                elif kind == "TERM":
                    term_group += ' %s ' % term
                elif kind == "QUOTE":
                    term_group += ' %s ' % term
            term_group = term_group.strip()
            term_groups.append(term_group)
        dump = json.dumps({
            "term_groups": term_groups,
            "original_query": request.original_q
        })
    yield dump.encode('utf8')
Example #28
def kwic_results(request, config):
    """Fetch KWIC results"""
    db = DB(config.db_path + '/data/')
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    start, end, n = page_interval(request.results_per_page, hits, request.start, request.end)
    kwic_object = {
        "description": {"start": start,
                        "end": end,
                        "results_per_page": request.results_per_page},
        "query": dict([i for i in request])
    }
    kwic_object['results'] = []

    for hit in hits[start - 1:end]:
        kwic_result = kwic_hit_object(hit, config, db)
        kwic_object['results'].append(kwic_result)

    kwic_object['results_length'] = len(hits)
    kwic_object["query_done"] = hits.done

    return kwic_object
Example #29
def bibliography_results(request, config):
    db = DB(config.db_path + '/data/')
    if request.no_metadata:
        hits = db.get_all(db.locals['default_object_level'], request["sort_order"])
    else:
        hits = db.query(**request.metadata)
    start, end, n = page_interval(request.results_per_page, hits, request.start, request.end)
    bibliography_object = {
        "description": {
            "start": start,
            "end": end,
            "n": n,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request]),
        "default_object": db.locals['default_object_level']
    }
    results = []
    result_type = 'doc'
    for hit in hits[start - 1:end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata]
        result_type = hit.type
        if hit.type == "doc":
            citation = citations(hit, citation_hrefs, config, report="bibliography")
        else:
            citation = citations(hit, citation_hrefs, config, report="concordance")
        results.append({
            'citation': citation,
            'citation_links': citation_hrefs,
            'philo_id': hit.philo_id,
            "metadata_fields": metadata_fields
        })
    bibliography_object["results"] = results
    bibliography_object['results_length'] = len(hits)
    bibliography_object['query_done'] = hits.done
    bibliography_object['result_type'] = result_type
    return bibliography_object, hits
Example #30
def generate_time_series(request, config):
    db = DB(config.db_path + '/data/')
    time_series_object = {'query': dict([i for i in request]), 'query_done': False}

    # Invalid date range
    if request.start_date == 'invalid' or request.end_date == 'invalid':
        time_series_object['results_length'] = 0
        time_series_object['more_results'] = False
        time_series_object['new_start_date'] = 0
        time_series_object['results'] = {'absolute_count': {}, 'date_count': {}}
        return time_series_object

    start_date, end_date = get_start_end_date(db,
                                              config,
                                              start_date=request.start_date or None,
                                              end_date=request.end_date or None)

    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure last date gets included in for loop below by adding one to last step
    for start in range(start_date, end_date+1, interval):
        end = start + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}

        # Get date total count
        if interval != 1:
            end_range = start_range + (int(request['year_interval']) - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (config.time_series_year_field,
                                                                                         start_range, end_range)
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)

        cursor = db.dbh.cursor()
        cursor.execute(query)
        date_counts[start_range] = cursor.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break

    time_series_object['results_length'] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object['more_results'] = False
    else:
        time_series_object['more_results'] = True
        time_series_object['new_start_date'] = last_date_done + int(request.year_interval)
    time_series_object['results'] = {'absolute_count': absolute_count, 'date_count': date_counts}

    return time_series_object
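The date-range loop above produces inclusive labels and clamps the last one at end_date; for example, with a 10-year interval:

start_date, end_date, interval = 1700, 1725, 10
date_ranges = []
for start in range(start_date, end_date + 1, interval):
    end = min(start + interval - 1, end_date)
    date_ranges.append((start, '%d-%d' % (start, end)))
# -> [(1700, '1700-1709'), (1710, '1710-1719'), (1720, '1720-1725')]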
Example #31
def get_neighboring_words(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'), ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)

    config = WebConfig(os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)

    try:
        index = int(request.hits_done)
    except:
        index = 0

    max_time = int(request.max_time)

    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    c = db.dbh.cursor()

    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        query = 'select rowid, philo_name, parent from words where philo_id="%s" limit 1' % word_id
        c.execute(query)
        results = c.fetchone()

        parent_sentence = results['parent']

        highlighted_text = kwic_hit_object(hit, config, db)["highlighted_text"]
        highlighted_text = highlighted_text.translate(remove_punctuation_map)
        highlighted_text = highlighted_text.strip()

        result_obj = {
            "left": "",
            "right": "",
            "index": index,
            "q": highlighted_text
        }

        left_rowid = results["rowid"] - 10
        right_rowid = results["rowid"] + 10

        c.execute('select philo_name, philo_id from words where rowid between ? and ?',
                  (left_rowid, results['rowid']-1))
        result_obj["left"] = []
        for i in c.fetchall():
            result_obj["left"].append(i['philo_name'].decode('utf-8'))
        result_obj["left"].reverse()
        result_obj["left"] = ' '.join(result_obj["left"])

        c.execute('select philo_name, philo_id from words where rowid between ? and ?',
                  (results['rowid']+1, right_rowid))
        result_obj["right"] = []
        for i in c.fetchall():
            result_obj["right"].append(i['philo_name'].decode('utf-8'))
        result_obj["right"] = ' '.join(result_obj["right"])

        metadata_fields = {}
        for metadata in config.kwic_metadata_sorting_fields:
            result_obj[metadata] = hit[metadata].lower()

        kwic_words.append(result_obj)

        index += 1

        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:  # avoid timeouts by splitting the query if more than max_time seconds have been spent in the loop
            break

    yield simplejson.dumps({"results": kwic_words, "hits_done": index})
Example #32
def filter_words_by_property(request, config):
    """Filter words by property"""
    db = DB(config.db_path + '/data/')
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    concordance_object = {"query": dict([i for i in request])}

    # Do these need to be captured in wsgi_handler?
    word_property = request["word_property"]
    word_property_value = request["word_property_value"]
    word_property_total = request["word_property_total"]

    new_hitlist = []
    results = []
    position = 0
    more_pages = False

    if request.start == 0:
        start = 1
    else:
        start = request.start

    for hit in hits:
        # get my chunk of text
        hit_val = get_word_attrib(hit, word_property, db)

        if hit_val == word_property_value:
            position += 1
            if position < start:
                continue
            new_hitlist.append(hit)
            citation_hrefs = citation_links(db, config, hit)
            metadata_fields = {}
            for metadata in db.locals['metadata_fields']:
                metadata_fields[metadata] = hit[metadata]
            citation = citations(hit, citation_hrefs, config)
            context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
            result_obj = {
                "philo_id": hit.philo_id,
                "citation": citation,
                "citation_links": citation_hrefs,
                "context": context,
                "metadata_fields": metadata_fields,
                "bytes": hit.bytes,
                "collocate_count": 1
            }
            results.append(result_obj)

        if len(new_hitlist) == (request.results_per_page):
            more_pages = True
            break

    end = start + len(results) - 1
    if len(results) < request.results_per_page:
        word_property_total = end
    else:
        word_property_total = end + 1
    concordance_object['results'] = results
    concordance_object["query_done"] = hits.done
    concordance_object['results_length'] = word_property_total
    concordance_object["description"] = {
        "start": start,
        "end": end,
        "results_per_page": request.results_per_page,
        "more_pages": more_pages
    }
    return concordance_object
Example #33
def generate_time_series(request, config):
    db = DB(config.db_path + '/data/')
    time_series_object = {'query': dict([i for i in request]), 'query_done': False}

    # Invalid date range
    if request.start_date == 'invalid' or request.end_date == 'invalid':
        time_series_object['results_length'] = 0
        time_series_object['more_results'] = False
        time_series_object['new_start_date'] = 0
        time_series_object['results'] = {'absolute_count': {}, 'date_count': {}}
        return time_series_object

    start_date, end_date = get_start_end_date(db,
                                              config,
                                              start_date=request.start_date or None,
                                              end_date=request.end_date or None)

    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    # Make sure last date gets included in for loop below by adding one to last step
    for start in range(start_date, end_date+1, interval):
        end = start + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)
        hits.finish()
        hit_len = len(hits)
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": hit_len, "url": url}

        # Get date total count
        if interval != 1:
            end_range = start_range + (int(request['year_interval']) - 1)
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (config.time_series_year_field,
                                                                                         start_range, end_range)
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)

        c = db.dbh.cursor()
        c.execute(query)
        date_counts[start_range] = c.fetchone()[0] or 0
        total_hits += hit_len
        elapsed = timeit.default_timer() - start_time
        last_date_done = start_range
        # avoid timeouts by splitting the query if more than request.max_time
        # (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            break

    time_series_object['results_length'] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object['more_results'] = False
    else:
        time_series_object['more_results'] = True
        time_series_object['new_start_date'] = last_date_done + int(request.year_interval)
    time_series_object['results'] = {'absolute_count': absolute_count, 'date_count': date_counts}

    return time_series_object
Example #34
def collocation_results(request, config):
    """Fetch collocation results"""
    db = DB(config.db_path + '/data/')
    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy",
                        int(request['collocate_distance']), **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"],
                        **request.metadata)
    hits.finish()
    collocation_object = {"query": dict([i for i in request])}

    try:
        collocate_distance = int(request['collocate_distance'])
    except ValueError:  # Getting an empty string since the keyword is not specified in the URL
        collocate_distance = None

    if request.colloc_filter_choice == "nofilter":
        filter_list = []
    else:
        filter_list = build_filter_list(request, config)
    collocation_object['filter_list'] = filter_list
    filter_list = set(filter_list)

    # Build list of search terms to filter out
    query_words = []
    for group in get_expanded_query(hits):
        for word in group:
            word = word.replace('"', '')
            query_words.append(word)
    query_words = set(query_words)
    filter_list = filter_list.union(query_words)

    if request["collocate_distance"]:
        hits = db.query(request["q"],
                        "proxy",
                        int(request['collocate_distance']),
                        raw_results=True,
                        **request.metadata)
    else:
        hits = db.query(request["q"],
                        "cooc",
                        request["arg"],
                        raw_results=True,
                        **request.metadata)
    hits.finish()

    stored_sentence_id = None
    stored_sentence_counts = defaultdict(int)
    sentence_hit_count = 1
    hits_done = request.start or 0
    max_time = request.max_time or 10
    all_collocates = defaultdict(lambda: {'count': 0})
    cursor = db.dbh.cursor()
    start_time = timeit.default_timer()
    try:
        for hit in hits[hits_done:]:
            word_id = ' '.join([str(i) for i in hit[:6]]) + ' ' + str(hit[7])
            query = """select parent, rowid from words where philo_id='%s' limit 1""" % word_id
            cursor.execute(query)
            result = cursor.fetchone()
            parent = result['parent']
            if parent != stored_sentence_id:
                rowid = int(result['rowid'])
                sentence_hit_count = 1
                stored_sentence_id = parent
                stored_sentence_counts = defaultdict(int)
                if collocate_distance:
                    begin_rowid = rowid - collocate_distance
                    if begin_rowid < 0:
                        begin_rowid = 0
                    end_rowid = rowid + collocate_distance
                    row_query = """select philo_name from words where parent='%s' and rowid between %d and %d""" % (
                        parent, begin_rowid, end_rowid)
                else:
                    row_query = """select philo_name from words where parent='%s'""" % (
                        parent, )
                cursor.execute(row_query)
                for i in cursor.fetchall():
                    collocate = i["philo_name"]
                    if collocate not in filter_list:
                        stored_sentence_counts[collocate] += 1
            else:
                sentence_hit_count += 1
            for word in stored_sentence_counts:
                if stored_sentence_counts[word] < sentence_hit_count:
                    continue
                all_collocates[word]['count'] += 1
            hits_done += 1
            elapsed = timeit.default_timer() - start_time
            # avoid timeouts by splitting the query if more than request.max_time (in
            # seconds) has been spent in the loop
            if elapsed > int(max_time):
                break
    except IndexError:
        collocation_object['hits_done'] = len(hits)

    collocation_object['collocates'] = all_collocates
    collocation_object["results_length"] = len(hits)
    if hits_done < collocation_object["results_length"]:
        collocation_object['more_results'] = True
        collocation_object['hits_done'] = hits_done
    else:
        collocation_object['more_results'] = False
        collocation_object['hits_done'] = collocation_object["results_length"]

    return collocation_object
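Note the two-pass structure: the first db.query produces a hitlist whose expanded query terms are merged into the filter list, and the second query (raw_results=True) is the one actually scanned for collocates. The filter-set step on its own, with made-up values:

filter_list = {'the', 'of', 'and'}            # e.g. from build_filter_list
query_words = {'contrat', 'social'}           # from get_expanded_query, quotes stripped
filter_list = filter_list.union(query_words)  # collocates to skip while counting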
Example #35
def get_neighboring_words(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)

    config = f.WebConfig()
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(db, environ)

    try:
        index = int(request.hits_done)
    except:
        index = 0

    max_time = int(request.max_time)

    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    c = db.dbh.cursor()

    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        query = 'select rowid, philo_name, parent from words where philo_id="%s" limit 1' % word_id
        c.execute(query)
        results = c.fetchone()

        parent_sentence = results['parent']

        if request.direction == "left":
            c.execute(
                'select philo_name, philo_id from words where parent=? and rowid < ?',
                (parent_sentence, results['rowid']))
            string = []
            for i in c.fetchall():
                string.append(i['philo_name'].decode('utf-8'))
            string.reverse()
            string = ' '.join(string)
        elif request.direction == "right":
            c.execute(
                'select philo_name, philo_id from words where parent=? and rowid > ?',
                (parent_sentence, results['rowid']))
            string = []
            for i in c.fetchall():
                string.append(i['philo_name'].decode('utf-8'))
            string = ' '.join(string)
        else:
            string = ""

        metadata_fields = {}
        for metadata in config.kwic_metadata_sorting_fields:
            metadata_fields[metadata] = hit[metadata].lower()

        kwic_words.append((string, index, metadata_fields))

        index += 1

        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:  # avoid timeouts by splitting the query if more than max_time seconds have been spent in the loop
            break

    yield json.dumps({"results": kwic_words, "hits_done": index})
Example #36
def collocation_results(request, config):
    """Fetch collocation results"""
    db = DB(config.db_path + '/data/')
    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request['collocate_distance']), **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"], **request.metadata)
    hits.finish()
    collocation_object = {"query": dict([i for i in request])}

    try:
        collocate_distance = int(request['collocate_distance'])
    except ValueError:  # Getting an empty string since the keyword is not specified in the URL
        collocate_distance = None

    if request.colloc_filter_choice == "nofilter":
        filter_list = []
    else:
        filter_list = build_filter_list(request, config)
    collocation_object['filter_list'] = filter_list
    filter_list = set(filter_list)

    # Build list of search terms to filter out
    query_words = []
    for group in get_expanded_query(hits):
        for word in group:
            word = word.replace('"', '')
            query_words.append(word)
    query_words = set(query_words)
    filter_list = filter_list.union(query_words)

    if request["collocate_distance"]:
        hits = db.query(request["q"], "proxy", int(request['collocate_distance']), raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], "cooc", request["arg"], raw_results=True, **request.metadata)
    hits.finish()

    stored_sentence_id = None
    stored_sentence_counts = defaultdict(int)
    sentence_hit_count = 1
    hits_done = request.start or 0
    max_time = request.max_time or 10
    all_collocates = defaultdict(lambda: {'count': 0})
    cursor = db.dbh.cursor()
    start_time = timeit.default_timer()
    try:
        for hit in hits[hits_done:]:
            word_id = ' '.join([str(i) for i in hit[:6]]) + ' ' + str(hit[7])
            query = """select parent, rowid from words where philo_id='%s' limit 1""" % word_id
            cursor.execute(query)
            result = cursor.fetchone()
            parent = result['parent']
            if parent != stored_sentence_id:
                rowid = int(result['rowid'])
                sentence_hit_count = 1
                stored_sentence_id = parent
                stored_sentence_counts = defaultdict(int)
                if collocate_distance:
                    begin_rowid = rowid - collocate_distance
                    if begin_rowid < 0:
                        begin_rowid = 0
                    end_rowid = rowid + collocate_distance
                    row_query = """select philo_name from words where parent='%s' and rowid between %d and %d""" % (
                        parent, begin_rowid, end_rowid)
                else:
                    row_query = """select philo_name from words where parent='%s'""" % (parent, )
                cursor.execute(row_query)
                for i in cursor.fetchall():
                    collocate = i["philo_name"]
                    if collocate not in filter_list:
                        stored_sentence_counts[collocate] += 1
            else:
                sentence_hit_count += 1
            for word in stored_sentence_counts:
                if stored_sentence_counts[word] < sentence_hit_count:
                    continue
                all_collocates[word]['count'] += 1
            hits_done += 1
            elapsed = timeit.default_timer() - start_time
            # avoid timeouts by splitting the query if more than request.max_time (in
            # seconds) has been spent in the loop
            if elapsed > int(max_time):
                break
    except IndexError:
        collocation_object['hits_done'] = len(hits)

    collocation_object['collocates'] = all_collocates
    collocation_object["results_length"] = len(hits)
    if hits_done < collocation_object["results_length"]:
        collocation_object['more_results'] = True
        collocation_object['hits_done'] = hits_done
    else:
        collocation_object['more_results'] = False
        collocation_object['hits_done'] = collocation_object["results_length"]

    return collocation_object
Example #37
def filter_words_by_property(request, config):
    db = DB(config.db_path + '/data/')
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    concordance_object = {"query": dict([i for i in request])}

    # Do these need to be captured in wsgi_handler?
    word_property = request["word_property"]
    word_property_value = request["word_property_value"]
    word_property_total = request["word_property_total"]

    new_hitlist = []
    results = []
    position = 0
    more_pages = False

    if request.start == 0:
        start = 1
    else:
        start = request.start

    for hit in hits:
        ## get my chunk of text ##
        hit_val = get_word_attrib(hit, word_property, db)

        if hit_val == word_property_value:
            position += 1
            if position < start:
                continue
            new_hitlist.append(hit)
            citation_hrefs = citation_links(db, config, hit)
            metadata_fields = {}
            for metadata in db.locals['metadata_fields']:
                metadata_fields[metadata] = hit[metadata]
            citation = citations(hit, citation_hrefs, config)
            context = fetch_concordance(db, hit, config.db_path, config.concordance_length)
            result_obj = {
                "philo_id": hit.philo_id,
                "citation": citation,
                "citation_links": citation_hrefs,
                "context": context,
                "metadata_fields": metadata_fields,
                "bytes": hit.bytes,
                "collocate_count": 1
            }
            results.append(result_obj)

        if len(new_hitlist) == (request.results_per_page):
            more_pages = True
            break

    end = start + len(results) - 1
    if len(results) < request.results_per_page:
        word_property_total = end
    else:
        word_property_total = end + 1
    concordance_object['results'] = results
    concordance_object["query_done"] = hits.done
    concordance_object['results_length'] = word_property_total
    concordance_object["description"] = {
        "start": start,
        "end": end,
        "results_per_page": request.results_per_page,
        "more_pages": more_pages
    }
    return concordance_object
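The pagination in filter_words_by_property happens on the fly: every hit whose word property matches bumps a position counter, matches before the requested start are skipped, and the loop stops as soon as a page fills, which is also what signals that more pages exist. A stripped-down sketch of that bookkeeping, with a hypothetical predicate in place of get_word_attrib:

def paginate_matches(items, predicate, start, per_page):
    # Return one page of matching items plus a more_pages flag,
    # mirroring the position/start/more_pages logic above.
    page, position, more_pages = [], 0, False
    for item in items:
        if not predicate(item):
            continue
        position += 1
        if position < start:  # match belongs to an earlier page
            continue
        page.append(item)
        if len(page) == per_page:
            more_pages = True  # a full page implies there may be more
            break
    return page, more_pages

page, more = paginate_matches(range(100), lambda n: n % 3 == 0, start=5, per_page=10)
print(page, more)  # [12, 15, ..., 39] True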
Example #38
def generate_time_series(request, config):
    db = DB(config.db_path + '/data/')
    time_series_object = {'query': dict([i for i in request]), 'query_done': False}

    start_date, end_date = get_start_end_date(db, config, start_date=None, end_date=None)

    # Generate date ranges
    interval = int(request.year_interval)
    date_ranges = []
    for i in range(start_date, end_date, interval):
        start = i
        end = i + interval - 1
        if end > end_date:
            end = end_date
        date_range = "%d-%d" % (start, end)
        date_ranges.append((start, date_range))

    absolute_count = defaultdict(int)
    date_counts = {}
    total_hits = 0
    last_date_done = start_date
    start_time = timeit.default_timer()
    max_time = request.max_time or 10
    for start_range, date_range in date_ranges:
        request.metadata[config.time_series_year_field] = date_range
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        hits.finish()
        params = {"report": "concordance", "start": "0", "end": "0"}
        params[config.time_series_year_field] = date_range
        url = make_absolute_query_link(config, request, **params)
        absolute_count[start_range] = {"label": start_range, "count": len(hits), "url": url}

        # Get date total count
        if interval != 1:
            end_range = start_range + interval - 1
            query = 'select sum(word_count) from toms where %s between "%d" and "%d"' % (
                config.time_series_year_field, start_range, end_range)
        else:
            query = "select sum(word_count) from toms where %s='%s'" % (config.time_series_year_field, start_range)

        c = db.dbh.cursor()
        c.execute(query)
        date_counts[start_range] = c.fetchone()[0] or 0
        total_hits += len(hits)
        print("TOTAL", total_hits, file=sys.stderr)
        elapsed = timeit.default_timer() - start_time
        # avoid timeouts by splitting the query if more than request.max_time (in seconds) has been spent in the loop
        if elapsed > int(max_time):
            last_date_done = start_range
            break
        last_date_done = start_range

    time_series_object['results_length'] = total_hits
    if (last_date_done + int(request.year_interval)) >= end_date:
        time_series_object['more_results'] = False
    else:
        time_series_object['more_results'] = True
        time_series_object['new_start_date'] = last_date_done + int(request.year_interval)
    time_series_object['results'] = {'absolute_count': absolute_count, 'date_count': date_counts}

    return time_series_object
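The bucketing arithmetic in generate_time_series is the part worth getting right: each bucket covers interval years, its label runs from start to start + interval - 1, and the last bucket is clamped to end_date. A small sketch of just that step, assuming integer years as above:

def build_date_ranges(start_date, end_date, interval):
    # Yield (bucket_start, "start-end") pairs covering the period,
    # clamping the final bucket to end_date.
    for start in range(start_date, end_date, interval):
        end = min(start + interval - 1, end_date)
        yield start, "%d-%d" % (start, end)

print(list(build_date_ranges(1700, 1715, 10)))
# [(1700, '1700-1709'), (1710, '1710-1715')]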
Example #39
def frequency_results(request, config, sorted_results=False):
    """reads through a hitlist. looks up request.frequency_field in each hit, and builds up a list of
       unique values and their frequencies."""
    db = DB(config.db_path + "/data/")
    biblio_search = False
    if request.q == "" and request.no_q:
        biblio_search = True
        if request.no_metadata:
            hits = db.get_all(db.locals["default_object_level"],
                              sort_order=["rowid"],
                              raw_results=True)
        else:
            hits = db.query(sort_order=["rowid"],
                            raw_results=True,
                            **request.metadata)
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        raw_results=True,
                        **request.metadata)

    if sorted_results is True:
        hits.finish()

    cursor = db.dbh.cursor()

    cursor.execute("select philo_id, %s from toms where %s is not null" %
                   (request.frequency_field, request.frequency_field))
    metadata_dict = {}
    for i in cursor:
        philo_id, field = i
        philo_id = tuple(int(s) for s in philo_id.split() if int(s))
        metadata_dict[philo_id] = field

    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start

    obj_dict = {
        "doc": 1,
        "div1": 2,
        "div2": 3,
        "div3": 4,
        "para": 5,
        "sent": 6,
        "word": 7
    }
    metadata_type = db.locals["metadata_types"][request.frequency_field]
    try:
        object_level = obj_dict[metadata_type]
    except KeyError:
        # metadata_type == "div"
        pass

    try:
        for philo_id in hits[request.start:]:
            if not biblio_search:
                philo_id = tuple(list(philo_id[:6]) + [philo_id[7]])
            if metadata_type == "div":
                key = ""
                for div in ["div1", "div2", "div3"]:
                    if philo_id[:obj_dict[div]] in metadata_dict:
                        key = metadata_dict[philo_id[:obj_dict[div]]]
                if not key:
                    if philo_id[:4] in metadata_dict:
                        key = metadata_dict[philo_id[:4]]
                    elif philo_id[:5] in metadata_dict:
                        key = metadata_dict[philo_id[:5]]
                if not key:
                    last_hit_done += 1
                    continue
            else:
                try:
                    key = metadata_dict[philo_id[:object_level]]
                except KeyError:
                    last_hit_done += 1
                    continue
            if key not in counts:
                counts[key] = {
                    "count": 0,
                    "metadata": {
                        request.frequency_field: key
                    }
                }
                counts[key]["url"] = make_absolute_query_link(
                    config,
                    request,
                    frequency_field="",
                    start="0",
                    end="0",
                    report=request.report,
                    script="",
                    **{request.frequency_field: '"%s"' % key})
                if not biblio_search:
                    query_metadata = {k: v for k, v in request.metadata.items() if v}
                    query_metadata[request.frequency_field] = '"%s"' % key
                    local_hits = db.query(**query_metadata)
                    counts[key]["total_word_count"] = local_hits.get_total_word_count()
            counts[key]["count"] += 1

            # avoid timeouts by splitting the query if more than
            # 5 seconds have been spent in the loop
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5 and sorted_results is False:
                break

        frequency_object["results"] = counts
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            new_metadata = {k: v for k, v in request.metadata.items() if v}
            new_metadata[request.frequency_field] = '"NULL"'
            if request.q == "" and request.no_q:
                new_hits = db.query(sort_order=["rowid"],
                                    raw_results=True,
                                    **new_metadata)
            else:
                new_hits = db.query(request["q"],
                                    request["method"],
                                    request["arg"],
                                    raw_results=True,
                                    **new_metadata)
            new_hits.finish()
            if len(new_hits):
                null_url = make_absolute_query_link(
                    config,
                    request,
                    frequency_field="",
                    start="0",
                    end="0",
                    report=request.report,
                    script="",
                    **{request.frequency_field: '"NULL"'})
                local_hits = db.query(**new_metadata)
                if not biblio_search:
                    frequency_object["results"]["NULL"] = {
                        "count": len(new_hits),
                        "url": null_url,
                        "metadata": {
                            request.frequency_field: '"NULL"'
                        },
                        "total_word_count": local_hits.get_total_word_count(),
                    }
                else:
                    frequency_object["results"]["NULL"] = {
                        "count": len(new_hits),
                        "url": null_url,
                        "metadata": {
                            request.frequency_field: '"NULL"'
                        },
                    }
            frequency_object["more_results"] = False
        else:
            frequency_object["more_results"] = True
    except IndexError:
        frequency_object["results"] = {}
        frequency_object["more_results"] = False
    frequency_object["results_length"] = len(hits)
    frequency_object["query"] = dict([i for i in request])

    if sorted_results is True:
        frequency_object["results"] = sorted(
            frequency_object["results"].items(),
            key=lambda x: x[1]["count"],
            reverse=True)

    return frequency_object
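All of these reports share one resumption pattern: iterate from request.start, stop once a time budget is spent, and report hits_done together with a more_results flag so the client can issue a follow-up query from where processing stopped. A generic sketch of the pattern, with a hypothetical work callback and the same five-second budget hard-coded above:

import timeit

def process_in_slices(hits, work, start=0, max_time=5):
    # Process hits[start:] until max_time seconds elapse; return
    # (hits_done, more_results) so the caller can resume later.
    hits_done = start
    start_time = timeit.default_timer()
    for hit in hits[start:]:
        work(hit)
        hits_done += 1
        if timeit.default_timer() - start_time > max_time:
            break
    return hits_done, hits_done < len(hits)

counts = {}
done, more = process_in_slices(list(range(1000)),
                               lambda h: counts.__setitem__(h % 7, counts.get(h % 7, 0) + 1))
print(done, more)  # 1000 False -- finished well inside the time budget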
Example #40
def frequency_results(request, config, sorted_results=False):
    """reads through a hitlist. looks up request.frequency_field in each hit, and builds up a list of
       unique values and their frequencies."""
    db = DB(config.db_path + '/data/')
    biblio_search = False
    if request.q == '' and request.no_q:
        biblio_search = True
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'], sort_order=["rowid"], raw_results=True)
        else:
            hits = db.query(sort_order=["rowid"], raw_results=True, **request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **request.metadata)

    if sorted_results:
        hits.finish()

    c = db.dbh.cursor()

    c.execute('select philo_id, %s from toms where %s is not null' % (request.frequency_field, request.frequency_field))
    metadata_dict = {}
    for i in c.fetchall():
        philo_id, field = i
        philo_id = tuple(int(s) for s in philo_id.split() if int(s))
        metadata_dict[philo_id] = field

    counts = {}
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start

    obj_dict = {'doc': 1, 'div1': 2, 'div2': 3, 'div3': 4, 'para': 5, 'sent': 6, 'word': 7}
    metadata_type = db.locals["metadata_types"][request.frequency_field]
    try:
        object_level = obj_dict[metadata_type]
    except KeyError:
        # metadata_type == "div"
        pass

    try:
        for philo_id in hits[request.start:]:
            if not biblio_search:
                philo_id = tuple(list(philo_id[:6]) + [philo_id[7]])
            if metadata_type == "div":
                key = ""
                for div in ["div1", "div2", "div3"]:
                    if philo_id[:obj_dict[div]] in metadata_dict:
                        key = metadata_dict[philo_id[:obj_dict[div]]]
                if not key:
                    if philo_id[:4] in metadata_dict:
                        key = metadata_dict[philo_id[:4]]
                    elif philo_id[:5] in metadata_dict:
                        key = metadata_dict[philo_id[:5]]
                if not key:
                    last_hit_done += 1
                    continue
            else:
                try:
                    key = metadata_dict[philo_id[:object_level]]
                except KeyError:
                    last_hit_done += 1
                    continue
            if key not in counts:
                counts[key] = {"count": 0, 'metadata': {request.frequency_field: key}}
                counts[key]["url"] = make_absolute_query_link(config,
                                                              request,
                                                              frequency_field="",
                                                              start="0",
                                                              end="0",
                                                              report=request.report,
                                                              script='',
                                                              **{request.frequency_field: '"%s"' % key})
                if not biblio_search:
                    query_metadata = dict([(k, v) for k, v in request.metadata.items() if v])
                    query_metadata[request.frequency_field] = '"%s"' % key
                    local_hits = db.query(**query_metadata)
                    counts[key]["total_word_count"] = local_hits.get_total_word_count()
            counts[key]["count"] += 1

            # avoid timeouts by splitting the query if more than
            # 5 seconds have been spent in the loop
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break

        frequency_object['results'] = counts
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            new_metadata = dict([(k, v) for k, v in request.metadata.items() if v])
            new_metadata[request.frequency_field] = '"NULL"'
            if request.q == '' and request.no_q:
                new_hits = db.query(sort_order=["rowid"], raw_results=True, **new_metadata)
            else:
                new_hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **new_metadata)
            new_hits.finish()
            if len(new_hits):
                null_url = make_absolute_query_link(config,
                                                    request,
                                                    frequency_field="",
                                                    start="0",
                                                    end="0",
                                                    report=request.report,
                                                    script='',
                                                    **{request.frequency_field: '"NULL"'})
                local_hits = db.query(**new_metadata)
                if not biblio_search:
                    frequency_object["results"]["NULL"] = {"count": len(new_hits),
                                                           "url": null_url,
                                                           "metadata": {request.frequency_field: '"NULL"'},
                                                           "total_word_count": local_hits.get_total_word_count()}
                else:
                    frequency_object["results"]["NULL"] = {"count": len(new_hits),
                                                           "url": null_url,
                                                           "metadata": {request.frequency_field: '"NULL"'}}
            frequency_object['more_results'] = False
        else:
            frequency_object['more_results'] = True
    except IndexError:
        frequency_object['results'] = {}
        frequency_object['more_results'] = False
    frequency_object['results_length'] = len(hits)
    frequency_object['query'] = dict([i for i in request])

    if sorted_results:
        frequency_object["results"] = sorted(frequency_object['results'].items(),
                                             key=lambda x: x[1]['count'],
                                             reverse=True)

    return frequency_object
Example #41
def frequency_results(request, config, sorted_results=False):
    """reads through a hitlist. looks up request.frequency_field in each hit, and builds up a list of
       unique values and their frequencies."""
    db = DB(config.db_path + '/data/')
    if request.q == '' and request.no_q:
        if request.no_metadata:
            hits = db.get_all(db.locals['default_object_level'])
        else:
            hits = db.query(**request.metadata)
    else:
        hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)

    if sorted_results:
        hits.finish()

    field_list = eval(simplejson.loads(request.frequency_field))

    counts = defaultdict(int)
    frequency_object = {}
    start_time = timeit.default_timer()
    last_hit_done = request.start

    try:
        for hit in hits[request.start:]:
            key = tuple((field, hit[field]) for field in field_list)
            counts[key] += 1

            # avoid timeouts by splitting the query if more than 5 seconds have been spent in the loop
            elapsed = timeit.default_timer() - start_time
            last_hit_done += 1
            if elapsed > 5:
                break

        table = {}
        for key, count in counts.items():
            # for each item in the table, we modify the query params to
            # generate a link url.
            # Make a distinct copy for each key in case we modify it below
            metadata = dict(request.metadata)

            # Build a label starting with the first value as the main value
            first_metadata_key, first_metadata_value = key[0]
            label = first_metadata_value or "NULL"
            metadata[first_metadata_key] = first_metadata_value or "NULL"
            append_to_label = []
            for metadata_key, metadata_value in key[1:]:
                metadata_value = metadata_value.strip()
                if not metadata_value:
                    # replace NULL with '[None]', 'N.A.', 'Untitled', etc.
                    metadata[metadata_key] = "NULL"
                else:
                    # we want to run exact queries on defined values.
                    metadata[metadata_key] = metadata_value
                    append_to_label.append(metadata_value)
            # Add parentheses to other value, as they are secondary
            if append_to_label:
                label = label + ' (' + ', '.join(append_to_label) + ')'

            # Quote metadata to force exact matches on metadata
            for m in metadata:
                if m not in request.metadata:  # skip metadata already in original query: this could be a glob search
                    if metadata[m] and m != "date" and metadata[m] != "NULL":
                        if not metadata[m].startswith('"'):
                            metadata[m] = '"%s"' % metadata[m]
            # Now build the url from request.
            url = make_absolute_query_link(config,
                                           request,
                                           frequency_field="",
                                           start="0",
                                           end="0",
                                           report=request.report,
                                           script='',
                                           **metadata)
            table[label] = {'count': count, 'url': url, 'metadata': metadata}
        frequency_object['results'] = table
        frequency_object["hits_done"] = last_hit_done
        if last_hit_done == len(hits):
            frequency_object['more_results'] = False
        else:
            frequency_object['more_results'] = True
    except IndexError:
        frequency_object['results'] = {}
        frequency_object['more_results'] = False
    frequency_object['results_length'] = len(hits)
    frequency_object['query'] = dict([i for i in request])

    if sorted_results:
        frequency_object["results"] = sorted(frequency_object['results'].items(),
                                             key=lambda x: x[1]['count'],
                                             reverse=True)

    return frequency_object
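This last variant groups hits by a tuple of metadata fields and builds a display label from the first field's value, appending secondary values in parentheses and substituting NULL for empty ones. A minimal sketch of that label construction over hypothetical hits represented as plain dicts:

from collections import defaultdict

def multi_field_frequencies(hits, fields):
    # Group hits by a tuple of field values and label each group,
    # mirroring the label-building loop above.
    counts = defaultdict(int)
    for hit in hits:
        counts[tuple((f, hit.get(f, "")) for f in fields)] += 1
    table = {}
    for key, count in counts.items():
        (_, first_value), rest = key[0], key[1:]
        label = first_value or "NULL"
        extras = [value for _, value in rest if value.strip()]
        if extras:
            label += " (" + ", ".join(extras) + ")"
        table[label] = count
    return table

hits = [{"author": "Voltaire", "title": "Candide"},
        {"author": "Voltaire", "title": "Candide"},
        {"author": "", "title": "Zadig"}]
print(multi_field_frequencies(hits, ["author", "title"]))
# {'Voltaire (Candide)': 2, 'NULL (Zadig)': 1}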
Example #42
def get_neighboring_words(environ, start_response):
    status = '200 OK'
    headers = [('Content-type', 'application/json; charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*")]
    start_response(status, headers)

    config = WebConfig(
        os.path.abspath(os.path.dirname(__file__)).replace('scripts', ''))
    db = DB(config.db_path + '/data/')
    request = WSGIHandler(environ, config)

    try:
        index = int(request.hits_done)
    except (ValueError, TypeError):
        index = 0

    max_time = int(request.max_time)

    kwic_words = []
    start_time = timeit.default_timer()
    hits = db.query(request["q"], request["method"], request["arg"],
                    **request.metadata)
    c = db.dbh.cursor()

    for hit in hits[index:]:
        word_id = ' '.join([str(i) for i in hit.philo_id])
        query = 'select rowid, philo_name, parent from words where philo_id="%s" limit 1' % word_id
        c.execute(query)
        results = c.fetchone()

        parent_sentence = results['parent']

        highlighted_text = kwic_hit_object(hit, config, db)["highlighted_text"]
        highlighted_text = highlighted_text.translate(remove_punctuation_map)
        highlighted_text = highlighted_text.strip()

        result_obj = {
            "left": "",
            "right": "",
            "index": index,
            "q": highlighted_text
        }

        left_rowid = results["rowid"] - 10
        right_rowid = results["rowid"] + 10

        c.execute(
            'select philo_name, philo_id from words where rowid between ? and ?',
            (left_rowid, results['rowid'] - 1))
        result_obj["left"] = []
        for i in c.fetchall():
            result_obj["left"].append(i['philo_name'])
        result_obj["left"].reverse()
        result_obj["left"] = ' '.join(result_obj["left"])

        c.execute(
            'select philo_name, philo_id from words where rowid between ? and ?',
            (results['rowid'] + 1, right_rowid))
        result_obj["right"] = []
        for i in c.fetchall():
            result_obj["right"].append(i['philo_name'])
        result_obj["right"] = ' '.join(result_obj["right"])

        for metadata in config.kwic_metadata_sorting_fields:
            result_obj[metadata] = hit[metadata].lower()

        kwic_words.append(result_obj)

        index += 1

        elapsed = timeit.default_timer() - start_time
        if elapsed > max_time:  # avoid timeouts by splitting the query if more than max_time seconds has been spent in the loop
            break

    yield json.dumps({
        "results": kwic_words,
        "hits_done": index
    }).encode('utf8')