Example #1
def generate_text_object(request, config, note=False):
    """Return text object given an philo_id"""
    # verify this isn't a page ID or if this is a note
    if len(request.philo_id.split()) == 9 and note is not True:
        width = 9
    else:
        width = 7
    db = DB(config.db_path + '/data/', width=width)
    if note:
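        # notes: resolve the target anchor to its philo_id within the same document via the toms table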
        target = request.target.replace('#', '')
        doc_id = request.philo_id.split()[0] + ' %'
        c = db.dbh.cursor()
        c.execute(
            'select philo_id from toms where id=? and philo_id like ? limit 1',
            (target, doc_id))
        philo_id = c.fetchall()[0]['philo_id'].split()[:7]
        obj = db[philo_id]
    else:
        try:
            obj = db[request.philo_id]
        except ValueError:
            obj = db[' '.join(request.path_components)]
        philo_id = obj.philo_id
    if width != 9:
        while obj['philo_name'] == '__philo_virtual' and obj["philo_type"] != "div1":
            philo_id.pop()
            obj = db[philo_id]
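    # strip trailing zeros so the philo_id only keeps the components that identify this object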
    philo_id = list(obj.philo_id)
    while int(philo_id[-1]) == 0:
        philo_id.pop()
    text_object = {
        "query": dict([i for i in request]),
        "philo_id": ' '.join([str(i) for i in philo_id])
    }
    text_object['prev'] = neighboring_object_id(db, obj.prev, width)
    text_object['next'] = neighboring_object_id(db, obj.__next__, width)
    metadata_fields = {}
    for metadata in db.locals['metadata_fields']:
        metadata_fields[metadata] = obj[metadata]
    text_object['metadata_fields'] = metadata_fields
    if width != 9:
        citation_hrefs = citation_links(db, config, obj)
    else:
        doc_obj = db[obj.philo_id[0]]
        citation_hrefs = citation_links(db, config, doc_obj)
    citation = citations(obj, citation_hrefs, config, report="navigation")
    text_object['citation'] = citation
    text, imgs = get_text_obj(obj,
                              config,
                              request,
                              db.locals["token_regex"],
                              note=note)
    if config.navigation_formatting_regex:
        for pattern, replacement in config.navigation_formatting_regex:
            text = re.sub(r'%s' % pattern, '%s' % replacement, text)
    text_object['text'] = text
    text_object['imgs'] = imgs
    return text_object
Example #2
def generate_text_object(request, config, note=False):
    """Return text object given an philo_id"""
    # verify this isn't a page ID or if this is a note
    if len(request.philo_id.split()) == 9 and note is not True:
        width = 9
    else:
        width = 7
    db = DB(config.db_path + '/data/', width=width)
    if note:
        target = request.target.replace('#', '')
        doc_id = request.philo_id.split()[0] + ' %'
        cursor = db.dbh.cursor()
        cursor.execute('select philo_id from toms where id=? and philo_id like ? limit 1', (target, doc_id))
        philo_id = cursor.fetchall()[0]['philo_id'].split()[:7]
        obj = db[philo_id]
    else:
        try:
            obj = db[request.philo_id]
        except ValueError:
            obj = db[' '.join(request.path_components)]
        philo_id = obj.philo_id
    if width != 9:
        while obj['philo_name'] == '__philo_virtual' and obj["philo_type"] != "div1":
            philo_id.pop()
            obj = db[philo_id]
    philo_id = list(obj.philo_id)
    while int(philo_id[-1]) == 0:
        philo_id.pop()
    text_object = {"query": dict([i for i in request]), "philo_id": ' '.join([str(i) for i in philo_id])}
    text_object['prev'] = neighboring_object_id(db, obj.prev, width)
    text_object['next'] = neighboring_object_id(db, obj.__next__, width)
    metadata_fields = {}
    for metadata in db.locals['metadata_fields']:
        metadata_fields[metadata] = obj[metadata]
    text_object['metadata_fields'] = metadata_fields
    if width != 9:
        citation_hrefs = citation_links(db, config, obj)
        citation = citations(obj, citation_hrefs, config, report="navigation")
    else:
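        # width 9 means a page object: build the citation from the parent document instead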
        db = DB(config.db_path + '/data/', width=7)
        doc_obj = db[obj.philo_id[0]]
        citation_hrefs = citation_links(db, config, doc_obj)
        citation = citations(doc_obj, citation_hrefs, config, report="navigation")
    text_object['citation'] = citation
    text, imgs = get_text_obj(obj, config, request, db.locals["token_regex"], note=note)
    if config.navigation_formatting_regex:
        for pattern, replacement in config.navigation_formatting_regex:
            text = re.sub(r'%s' % pattern, '%s' % replacement, text)
    text_object['text'] = text
    text_object['imgs'] = imgs
    return text_object
Example #3
def group_by_metadata(request, config):
    citation_types = json.loads(request.citation)
    db = DB(config.db_path + "/data/")
    cursor = db.dbh.cursor()
    query = """select * from toms where philo_type="doc" and %s=?""" % request.group_by_field
    cursor.execute(query, (request.query, ))
    result_group = []
    for doc in cursor:
        obj = db[doc["philo_id"]]
        links = citation_links(db, config, obj)
        citation = citations(obj,
                             links,
                             config,
                             report="landing_page",
                             citation_type=citation_types)
        result_group.append({
            "metadata": get_all_metadata(db, doc),
            "citation": citation
        })
    return json.dumps({
        "display_count": request.display_count,
        "content_type": request.group_by_field,
        "content": [{"prefix": request.query, "results": result_group}],
    })
Example #4
def bibliography_results(request, config):
    """Fetch bibliography results"""
    db = DB(config.db_path + "/data/")
    if request.no_metadata:
        hits = db.get_all(db.locals["default_object_level"], request["sort_order"])
    else:
        hits = db.query(sort_order=request["sort_order"], **request.metadata)
    # the simple landing page report requests all bibliography entries in load order
    if request.simple_bibliography == "all":
        hits.finish()
        start = 1
        end = len(hits)
        page_num = end
    else:
        start, end, page_num = page_interval(request.results_per_page, hits, request.start, request.end)
    bibliography_object = {
        "description": {"start": start, "end": end, "n": page_num, "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals["default_object_level"],
    }
    results = []
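    # result_type defaults to "doc" and is overwritten with each hit's actual object type below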
    result_type = "doc"
    for hit in hits[start - 1 : end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals["metadata_fields"]:
            metadata_fields[metadata] = hit[metadata]
        result_type = hit.object_type
        if request.simple_bibliography == "all":
            citation = citations(hit, citation_hrefs, config, report="simple_landing")
        else:
            citation = citations(hit, citation_hrefs, config, report="bibliography", result_type=result_type)
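        # dictionary bibliographies also return the hit's text as context for non-doc objects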
        if config.dictionary_bibliography is False or result_type == "doc":
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "object_type": result_type,
                }
            )
        else:
            context = get_text_obj(hit, config, request, db.locals["token_regex"], images=False)
            results.append(
                {
                    "citation": citation,
                    "citation_links": citation_hrefs,
                    "philo_id": hit.philo_id,
                    "metadata_fields": metadata_fields,
                    "context": context,
                    "object_type": result_type,
                }
            )
    bibliography_object["results"] = results
    bibliography_object["results_length"] = len(hits)
    bibliography_object["query_done"] = hits.done
    bibliography_object["result_type"] = result_type
    return bibliography_object, hits
Example #5
def kwic_hit_object(hit, config, db):
    """Build an individual kwic concordance"""
    # Get all metadata
    metadata_fields = {}
    for metadata in db.locals["metadata_fields"]:
        metadata_fields[metadata] = hit[metadata].strip()

    # Get all links and citations
    citation_hrefs = citation_links(db, config, hit)
    citation = citations(hit, citation_hrefs, config)

    # Determine length of text needed
    byte_distance = hit.bytes[-1] - hit.bytes[0]
    length = config.concordance_length + byte_distance + config.concordance_length

    # Get concordance and align it
    byte_offsets, start_byte = adjust_bytes(hit.bytes, config.concordance_length)
    conc_text = get_text(hit, start_byte, length, config.db_path)
    conc_text = format_strip(conc_text, db.locals["token_regex"], byte_offsets)
    conc_text = conc_text.replace("\n", " ")
    conc_text = conc_text.replace("\r", "")
    conc_text = conc_text.replace("\t", " ")
    highlighted_text = ""  # fallback so the key below is always defined, even if no highlight span is found
    try:
        start_hit = conc_text.index('<span class="highlight">')
        start_output = (
            '<span class="kwic-before"><span class="inner-before">' + conc_text[:start_hit] + "</span></span>"
        )
        end_hit = conc_text.rindex("</span>") + 7
        highlighted_text = conc_text[start_hit + 23 : end_hit - 7].lower()  # for use in KWIC sorting
        end_output = '<span class="kwic-after">' + conc_text[end_hit:] + "</span>"
        conc_text = (
            '<span class="kwic-text">'
            + start_output
            + '&nbsp;<span class="kwic-highlight">'
            + conc_text[start_hit:end_hit]
            + "</span>&nbsp;"
            + end_output
            + "</span>"
        )
    except ValueError as v:
        import sys

        print("KWIC ERROR:", v, file=sys.stderr)

    if config.kwic_formatting_regex:
        for pattern, replacement in config.kwic_formatting_regex:
            conc_text = re.sub(r"%s" % pattern, "%s" % replacement, conc_text)
    kwic_result = {
        "philo_id": hit.philo_id,
        "context": conc_text,
        "highlighted_text": highlighted_text,
        "metadata_fields": metadata_fields,
        "citation_links": citation_hrefs,
        "citation": citation,
        "bytes": hit.bytes,
    }

    return kwic_result
Example #6
def concordance_results(request, config):
    """Fetch concordances results."""
    db = DB(config.db_path + '/data/')
    if request.collocation_type:
        first_hits = db.query(request["q"], request["method"], request["arg"],
                              **request.metadata)
        second_hits = db.query(request["left"], request["method"],
                               request["arg"], **request.metadata)
        hits = CombinedHitlist(first_hits, second_hits)
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        sort_order=request["sort_order"],
                        **request.metadata)
    start, end, page_num = page_interval(request['results_per_page'], hits,
                                         request.start, request.end)

    concordance_object = {
        "description": {
            "start": start,
            "end": end,
            "results_per_page": request.results_per_page
        },
        "query": dict([i for i in request]),
        "default_object": db.locals['default_object_level']
    }

    formatting_regexes = []
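    # pre-compile the configured patterns once; they are applied to every hit's context below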
    if config.concordance_formatting_regex:
        for pattern, replacement in config.concordance_formatting_regex:
            compiled_regex = re.compile(r'%s' % pattern)
            formatting_regexes.append((compiled_regex, replacement))
    results = []
    for hit in hits[start - 1:end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata]
        citation = citations(hit, citation_hrefs, config, report="concordance")
        context = get_concordance_text(db, hit, config.db_path,
                                       config.concordance_length)
        if formatting_regexes:
            for formatting_regex, replacement in formatting_regexes:
                context = formatting_regex.sub(r'%s' % replacement, context)
        result_obj = {
            "philo_id": hit.philo_id,
            "citation": citation,
            "citation_links": citation_hrefs,
            "context": context,
            "metadata_fields": metadata_fields,
            "bytes": hit.bytes
        }
        results.append(result_obj)
    concordance_object["results"] = results
    concordance_object['results_length'] = len(hits)
    concordance_object["query_done"] = hits.done
    return concordance_object
Example #8
def concordance_results(request, config):
    """Fetch concordances results."""
    db = DB(config.db_path + '/data/')
    if request.collocation_type:
        first_hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
        second_hits = db.query(request["left"], request["method"], request["arg"], **request.metadata)
        hits = CombinedHitlist(first_hits, second_hits)
    else:
        hits = db.query(request["q"],
                        request["method"],
                        request["arg"],
                        sort_order=request["sort_order"],
                        **request.metadata)
    start, end, page_num = page_interval(request['results_per_page'], hits, request.start, request.end)

    concordance_object = {
        "description": {"start": start,
                        "end": end,
                        "results_per_page": request.results_per_page},
        "query": dict([i for i in request]),
        "default_object": db.locals['default_object_level']
    }

    formatting_regexes = []
    if config.concordance_formatting_regex:
        for pattern, replacement in config.concordance_formatting_regex:
            compiled_regex = re.compile(r'%s' % pattern)
            formatting_regexes.append((compiled_regex, replacement))
    results = []
    for hit in hits[start - 1:end]:
        citation_hrefs = citation_links(db, config, hit)
        metadata_fields = {}
        for metadata in db.locals['metadata_fields']:
            metadata_fields[metadata] = hit[metadata]
        citation = citations(hit, citation_hrefs, config, report="concordance")
        context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
        if formatting_regexes:
            for formatting_regex, replacement in formatting_regexes:
                context = formatting_regex.sub(r'%s' % replacement, context)
        result_obj = {
            "philo_id": hit.philo_id,
            "citation": citation,
            "citation_links": citation_hrefs,
            "context": context,
            "metadata_fields": metadata_fields,
            "bytes": hit.bytes
        }
        results.append(result_obj)
    concordance_object["results"] = results
    concordance_object['results_length'] = len(hits)
    concordance_object["query_done"] = hits.done
    return concordance_object
Example #9
def group_by_metadata(request, config):
    citation_types = json.loads(request.citation)
    db = DB(config.db_path + "/data/")
    cursor = db.dbh.cursor()
    query = """select * from toms where philo_type="doc" and %s=?""" % request.group_by_field
    cursor.execute(query, (request.query,))
    result_group = []
    for doc in cursor:
        obj = db[doc["philo_id"]]
        links = citation_links(db, config, obj)
        citation = citations(obj, links, config, report="landing_page", citation_type=citation_types)
        result_group.append({"metadata": get_all_metadata(db, doc), "citation": citation})
    return json.dumps(
        {
            "display_count": request.display_count,
            "content_type": request.group_by_field,
            "content": [{"prefix": request.query, "results": result_group}],
        }
    )
Example #10
def group_by_range(request_range, request, config):
    db = DB(config.db_path + "/data/")
    metadata_queried = request.group_by_field
    citation_types = json.loads(request.citation)
    is_date = False
    try:
        int(request_range[0])
        int(request_range[1])
        is_date = True
    except ValueError:
        pass

    cursor = db.dbh.cursor()
    if is_date:
        content_type = "date"
        query_range = set(range(int(request_range[0]), int(request_range[1])))
        cursor.execute('select * from toms where philo_type="doc"')
    else:
        content_type = metadata_queried
        # Ordinal avoids unicode issues...
        query_range = set(range(ord(request_range[0]), ord(request_range[1]) + 1))
        cursor.execute(
            'select *, count(*) as count from toms where philo_type="doc" group by %s'
            % metadata_queried)
    try:
        cursor.execute(
            'select *, count(*) as count from toms where philo_type="doc" group by %s'
            % metadata_queried)
    except sqlite3.OperationalError:
        return json.dumps({
            "display_count": request.display_count,
            "content_type": content_type,
            "content": []
        })
    content = {}
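    # date_count tallies how many documents share each date value so a per-date count can be reported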
    date_count = defaultdict(int)
    for doc in cursor:
        normalized_test_value = ""
        if doc[metadata_queried] is None:
            continue
        if is_date:
            try:
                initial = int(doc[metadata_queried])
                test_value = initial
                date_count[initial] += 1
            except:
                continue
        else:
            try:
                initial_letter = doc[metadata_queried][0].lower()
            except IndexError:
                # we have an empty string
                continue
            try:
                test_value = ord(initial_letter)
                normalized_test_value = ord("".join([
                    i for i in unicodedata.normalize("NFKD", initial_letter)
                    if not unicodedata.combining(i)
                ]))
            except TypeError:
                continue
            initial = initial_letter.upper()
        # Are we within the range?
        if test_value in query_range or normalized_test_value in query_range:
            if normalized_test_value in query_range:
                initial = "".join([
                    i for i in unicodedata.normalize("NFKD", initial_letter)
                    if not unicodedata.combining(i)
                ]).upper()
            obj = db[doc["philo_id"]]
            links = citation_links(db, config, obj)
            citation = citations(obj,
                                 links,
                                 config,
                                 report="landing_page",
                                 citation_type=citation_types)
            if initial not in content:
                content[initial] = []
            if is_date:
                try:
                    normalized_field = unaccent.smash_accents(
                        doc["title"]).lower()
                except:
                    normalized_field = None
                content[initial].append({
                    "metadata": get_all_metadata(db, doc),
                    "citation": citation,
                    "count": date_count[initial],
                    "normalized": normalized_field,
                })
            else:
                content[initial].append({
                    "metadata": get_all_metadata(db, doc),
                    "citation": citation,
                    "count": doc["count"],
                    "normalized": unaccent.smash_accents(doc[metadata_queried]).lower(),
                })
    results = []
    for prefix, result_set in sorted(content.items(), key=itemgetter(0)):
        results.append({
            "prefix": prefix,
            "results": sorted(result_set, key=lambda x: x["normalized"]),
        })
    return json.dumps({
        "display_count": request.display_count,
        "content_type": content_type,
        "content": results
    })
Example #11
def filter_words_by_property(request, config):
    """Filter words by property"""
    db = DB(config.db_path + '/data/')
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    concordance_object = {"query": dict([i for i in request])}

    # Do these need to be captured in wsgi_handler?
    word_property = request["word_property"]
    word_property_value = request["word_property_value"]
    word_property_total = request["word_property_total"]

    new_hitlist = []
    results = []
    position = 0
    more_pages = False
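    # hits are scanned in order: matches before the requested start are skipped, and the scan stops once a full page is collected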

    if request.start == 0:
        start = 1
    else:
        start = request.start

    for hit in hits:
        # get my chunk of text
        hit_val = get_word_attrib(hit, word_property, db)

        if hit_val == word_property_value:
            position += 1
            if position < start:
                continue
            new_hitlist.append(hit)
            citation_hrefs = citation_links(db, config, hit)
            metadata_fields = {}
            for metadata in db.locals['metadata_fields']:
                metadata_fields[metadata] = hit[metadata]
            citation = citations(hit, citation_hrefs, config)
            context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
            result_obj = {
                "philo_id": hit.philo_id,
                "citation": citation,
                "citation_links": citation_hrefs,
                "context": context,
                "metadata_fields": metadata_fields,
                "bytes": hit.bytes,
                "collocate_count": 1
            }
            results.append(result_obj)

        if len(new_hitlist) == (request.results_per_page):
            more_pages = True
            break

    end = start + len(results) - 1
    if len(results) < request.results_per_page:
        word_property_total = end
    else:
        word_property_total = end + 1
    concordance_object['results'] = results
    concordance_object["query_done"] = hits.done
    concordance_object['results_length'] = word_property_total
    concordance_object["description"] = {
        "start": start,
        "end": end,
        "results_per_page": request.results_per_page,
        "more_pages": more_pages
    }
    return concordance_object
Example #12
def group_by_range(request_range, request, config):
    db = DB(config.db_path + '/data/')
    metadata_queried = request.group_by_field
    citation_types = json.loads(request.citation)
    is_date = False
    try:
        int(request_range[0])
        int(request_range[1])
        is_date = True
    except ValueError:
        pass
    if is_date:
        content_type = "date"
        query_range = set(range(int(request_range[0]), int(request_range[1])))
    else:
        content_type = metadata_queried
        query_range = set(range(
            ord(request_range[0]),
            ord(request_range[1]) + 1))  # Ordinal avoids unicode issues...
    c = db.dbh.cursor()
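    # an invalid group-by field makes the query fail with OperationalError; return an empty content list in that case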
    try:
        c.execute('select *, count(*) as count from toms where philo_type="doc" group by %s' % metadata_queried)
    except sqlite3.OperationalError:
        return json.dumps({
            "display_count": request.display_count,
            "content_type": content_type,
            "content": []
        })
    content = {}
    for doc in c.fetchall():
        normalized_test_value = ''
        if doc[metadata_queried] is None:
            continue
        if is_date:
            try:
                initial = int(doc[metadata_queried])
                test_value = initial
            except:
                continue
        else:
            try:
                initial_letter = doc[metadata_queried][0].lower()
            except IndexError:
                # we have an empty string
                continue
            test_value = ord(initial_letter)
            normalized_test_value = ord(''.join(
                [i for i in unicodedata.normalize("NFKD", initial_letter)
                 if not unicodedata.combining(i)]))
            initial = initial_letter.upper()
        # Are we within the range?
        if test_value in query_range or normalized_test_value in query_range:
            if normalized_test_value in query_range:
                initial = ''.join(
                    [i for i in unicodedata.normalize("NFKD", initial_letter)
                     if not unicodedata.combining(i)]).upper()
            obj = db[doc["philo_id"]]
            links = citation_links(db, config, obj)
            citation = citations(obj, links, config, report="landing_page", citation_type=citation_types)
            if initial not in content:
                content[initial] = []
            content[initial].append({
                "metadata": get_all_metadata(db, doc),
                "citation": citation,
                "count": doc['count']
            })
    results = []
    for result_set in sorted(content.items(), key=itemgetter(0)):
        results.append({"prefix": result_set[0], "results": result_set[1]})
    return json.dumps({"display_count": request.display_count,
                       "content_type": content_type,
                       "content": results})
Example #13
def group_by_range(request_range, request, config):
    db = DB(config.db_path + '/data/')
    metadata_queried = request.group_by_field
    citation_types = json.loads(request.citation)
    is_date = False
    try:
        int(request_range[0])
        int(request_range[1])
        is_date = True
    except ValueError:
        pass
    if is_date:
        content_type = "date"
        query_range = set(range(int(request_range[0]), int(request_range[1])))
    else:
        content_type = metadata_queried
        query_range = set(range(
            ord(request_range[0]),
            ord(request_range[1]) + 1))  # Ordinal avoids unicode issues...
    cursor = db.dbh.cursor()
    try:
        cursor.execute('select *, count(*) as count from toms where philo_type="doc" group by %s' % metadata_queried)
    except sqlite3.OperationalError:
        return json.dumps({
            "display_count": request.display_count,
            "content_type": content_type,
            "content": []
        })
    content = {}
    for doc in cursor:
        normalized_test_value = ''
        if doc[metadata_queried] is None:
            continue
        if is_date:
            try:
                initial = int(doc[metadata_queried])
                test_value = initial
            except:
                continue
        else:
            try:
                initial_letter = doc[metadata_queried][0].lower()
            except IndexError:
                # we have an empty string
                continue
            test_value = ord(initial_letter)
            normalized_test_value = ord(''.join(
                [i for i in unicodedata.normalize("NFKD", initial_letter)
                 if not unicodedata.combining(i)]))
            initial = initial_letter.upper()
        # Are we within the range?
        if test_value in query_range or normalized_test_value in query_range:
            if normalized_test_value in query_range:
                initial = ''.join(
                    [i for i in unicodedata.normalize("NFKD", initial_letter)
                     if not unicodedata.combining(i)]).upper()
            obj = db[doc["philo_id"]]
            links = citation_links(db, config, obj)
            citation = citations(obj, links, config, report="landing_page", citation_type=citation_types)
            if initial not in content:
                content[initial] = []
            content[initial].append({
                "metadata": get_all_metadata(db, doc),
                "citation": citation,
                "count": doc['count']
            })
    results = []
    for result_set in sorted(content.items(), key=itemgetter(0)):
        results.append({"prefix": result_set[0], "results": result_set[1]})
    return json.dumps({"display_count": request.display_count,
                       "content_type": content_type,
                       "content": results})
Example #14
def generate_toc_object(request, config):
    """This function fetches all philo_ids for div elements within a doc"""
    db = DB(config.db_path + '/data/')
    conn = db.dbh
    cursor = conn.cursor()
    try:
        obj = db[request.philo_id]
    except ValueError:
        philo_id = ' '.join(request.path_components[:-1])
        obj = db[philo_id]
    doc_id = int(obj.philo_id[0])
    next_doc_id = doc_id + 1
    # find the starting rowid for this doc
    cursor.execute('select rowid from toms where philo_id="%d 0 0 0 0 0 0"' % doc_id)
    start_rowid = cursor.fetchone()[0]
    # find the starting rowid for the next doc
    cursor.execute('select rowid from toms where philo_id="%d 0 0 0 0 0 0"' % next_doc_id)
    try:
        end_rowid = cursor.fetchone()[0]
    except TypeError:  # if this is the last doc, just get the last rowid in the table.
        cursor.execute('select max(rowid) from toms;')
        end_rowid = cursor.fetchone()[0]

    # use start_rowid and end_rowid to fetch every div in the document.
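    # philo_slices: number of leading philo_id components that identify each object type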
    philo_slices = {"doc": 1, "div1": 2, "div2": 3, "div3": 4, "para": 5}
    text_hierarchy = []
    cursor.execute("select * from toms where rowid >= ? and rowid <=? and philo_type>='div' and philo_type<='div3'",
                   (start_rowid, end_rowid))
    for row in cursor:
        philo_id = [int(n) for n in row["philo_id"].split(" ")]
        text = HitWrapper.ObjectWrapper(philo_id, db, row=row)
        if text['philo_name'] == '__philo_virtual' and text["philo_type"] != "div1":
            continue
        elif text['word_count'] == 0:
            continue
        else:
            philo_id = text['philo_id']
            philo_type = text['philo_type']
            display_name = ""
            if text['philo_name'] == "front":
                display_name = "Front Matter"
            elif text['philo_name'] == "note":
                continue
            else:
                display_name = text['head']
                if display_name:
                    display_name = display_name.strip()
                if not display_name:
                    if text["type"] and text["n"]:
                        display_name = text['type'] + " " + text["n"]
                    else:
                        display_name = text["head"] or text['type'] or text['philo_name'] or text['philo_type']
                        if display_name == "__philo_virtual":
                            display_name = text['philo_type']
            display_name = display_name[0].upper() + display_name[1:]
            link = make_absolute_object_link(config, philo_id.split()[:philo_slices[philo_type]])
            philo_id = ' '.join(philo_id.split()[:philo_slices[philo_type]])
            toc_element = {"philo_id": philo_id, "philo_type": philo_type, "label": display_name, "href": link}
            text_hierarchy.append(toc_element)
    metadata_fields = {}
    for metadata in db.locals['metadata_fields']:
        if db.locals['metadata_types'][metadata] == "doc":
            metadata_fields[metadata] = obj[metadata]
    citation_hrefs = citation_links(db, config, obj)
    citation = citations(obj, citation_hrefs, config, report="navigation")
    toc_object = {"query": dict([i for i in request]),
                  "philo_id": obj.philo_id,
                  "toc": text_hierarchy,
                  "metadata_fields": metadata_fields,
                  "citation": citation}
    return toc_object
Example #15
def filter_words_by_property(request, config):
    """Filter words by property"""
    db = DB(config.db_path + "/data/")
    hits = db.query(request["q"], request["method"], request["arg"], **request.metadata)
    concordance_object = {"query": dict([i for i in request])}

    # Do these need to be captured in wsgi_handler?
    word_property = request["word_property"]
    word_property_value = request["word_property_value"]
    word_property_total = request["word_property_total"]

    new_hitlist = []
    results = []
    position = 0
    more_pages = False

    if request.start == 0:
        start = 1
    else:
        start = request.start

    for hit in hits:
        # get my chunk of text
        hit_val = get_word_attrib(hit, word_property, db)

        if hit_val == word_property_value:
            position += 1
            if position < start:
                continue
            new_hitlist.append(hit)
            citation_hrefs = citation_links(db, config, hit)
            metadata_fields = {}
            for metadata in db.locals["metadata_fields"]:
                metadata_fields[metadata] = hit[metadata]
            citation = citations(hit, citation_hrefs, config)
            context = get_concordance_text(db, hit, config.db_path, config.concordance_length)
            result_obj = {
                "philo_id": hit.philo_id,
                "citation": citation,
                "citation_links": citation_hrefs,
                "context": context,
                "metadata_fields": metadata_fields,
                "bytes": hit.bytes,
                "collocate_count": 1,
            }
            results.append(result_obj)

        if len(new_hitlist) == (request.results_per_page):
            more_pages = True
            break

    end = start + len(results) - 1
    if len(results) < request.results_per_page:
        word_property_total = end
    else:
        word_property_total = end + 1
    concordance_object["results"] = results
    concordance_object["query_done"] = hits.done
    concordance_object["results_length"] = word_property_total
    concordance_object["description"] = {
        "start": start,
        "end": end,
        "results_per_page": request.results_per_page,
        "more_pages": more_pages,
    }
    return concordance_object
Example #16
def generate_toc_object(request, config):
    """This function fetches all philo_ids for div elements within a doc"""
    db = DB(config.db_path + "/data/")
    conn = db.dbh
    cursor = conn.cursor()
    try:
        obj = db[request.philo_id]
    except ValueError:
        philo_id = " ".join(request.path_components[:-1])
        obj = db[philo_id]
    doc_id = int(obj.philo_id[0])
    next_doc_id = doc_id + 1
    # find the starting rowid for this doc
    cursor.execute('select rowid from toms where philo_id="%d 0 0 0 0 0 0"' %
                   doc_id)
    start_rowid = cursor.fetchone()[0]
    # find the starting rowid for the next doc
    cursor.execute('select rowid from toms where philo_id="%d 0 0 0 0 0 0"' %
                   next_doc_id)
    try:
        end_rowid = cursor.fetchone()[0]
    except TypeError:  # if this is the last doc, just get the last rowid in the table.
        cursor.execute("select max(rowid) from toms;")
        end_rowid = cursor.fetchone()[0]

    # use start_rowid and end_rowid to fetch every div in the document.
    philo_slices = {"doc": 1, "div1": 2, "div2": 3, "div3": 4, "para": 5}
    text_hierarchy = []
    cursor.execute(
        "select * from toms where rowid >= ? and rowid <=? and philo_type>='div' and philo_type<='div3'",
        (start_rowid, end_rowid),
    )
    for row in cursor:
        philo_id = [int(n) for n in row["philo_id"].split(" ")]
        text = HitWrapper.ObjectWrapper(philo_id, db, row=row)
        if text["philo_name"] == "__philo_virtual" and text[
                "philo_type"] != "div1":
            continue
        elif text["word_count"] == 0:
            continue
        else:
            philo_id = text["philo_id"]
            philo_type = text["philo_type"]
            display_name = ""
            if text["philo_name"] == "front":
                display_name = "Front Matter"
            elif text["philo_name"] == "note":
                continue
            else:
                display_name = text["head"]
                if display_name:
                    display_name = display_name.strip()
                if not display_name:
                    if text["type"] and text["n"]:
                        display_name = text["type"] + " " + text["n"]
                    else:
                        display_name = text["head"] or text["type"] or text[
                            "philo_name"] or text["philo_type"]
                        if display_name == "__philo_virtual":
                            display_name = text["philo_type"]
            display_name = display_name[0].upper() + display_name[1:]
            link = make_absolute_object_link(
                config,
                philo_id.split()[:philo_slices[philo_type]])
            philo_id = " ".join(philo_id.split()[:philo_slices[philo_type]])
            toc_element = {
                "philo_id": philo_id,
                "philo_type": philo_type,
                "label": display_name,
                "href": link
            }
            text_hierarchy.append(toc_element)
    metadata_fields = {}
    for metadata in db.locals["metadata_fields"]:
        if db.locals["metadata_types"][metadata] == "doc":
            metadata_fields[metadata] = obj[metadata]
    citation_hrefs = citation_links(db, config, obj)
    citation = citations(obj,
                         citation_hrefs,
                         config,
                         report="table_of_contents")
    toc_object = {
        "query": dict([i for i in request]),
        "philo_id": obj.philo_id,
        "toc": text_hierarchy,
        "metadata_fields": metadata_fields,
        "citation": citation,
    }
    return toc_object