# NOTE: these WSGI service examples assume the imports below.  The
# PhiloLogic-specific names used throughout (PhiloDB, shlaxtree, make_link,
# hit_to_link, format_stream, transform, format_object, get_raw_context,
# DirtyFormatter) come from the surrounding PhiloLogic codebase; their exact
# module paths are not shown in this excerpt.
import json
import re
import sys
import time
import urllib
import urlparse
from wsgiref.util import shift_path_info

def freq_service(environ,start_response):
    status = '200 OK' # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')] # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath,7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    
    qs = environ["parsed_params"]["query"][0]
    q_start = int(environ["parsed_params"].get('q_start',[0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end',[0])[0]) or q_start + 2000 
    width = int(environ["parsed_params"].get("width",[0])[0]) or 100
    q_title = environ["parsed_params"].get("title",[""])[0] or ""
    q_author = environ["parsed_params"].get("author",[""])[0] or ""
    field = environ["parsed_params"].get("field",[""])[0] or "author"
    #need a field param.  author default?
    status = "running query for %s @ %s: " % (qs,dbname)

    metadata = {}
    if q_author:
        metadata["author"] = q_author
    if q_title:
        metadata["title"] = q_title

    content = ""
    q = db.query(qs,**metadata)
    
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. aggregating." % (l)

    last_doc = -1
    authors = {}
    titles = {}
        
    yield("<html>")
    yield("<head>")
    yield("<title>%s: frequency table for \"%s\"</title>" % (dbname, qs))
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    yield("</head>\n")
    yield("<body>\n")
    yield("<p class='description'>%s</p>\n" % status)
    if l > 0:
        for hit in q: #need to page q
            byte_offset = hit[6]
            offsets = list(hit[6:])
            offsets.reverse()
            
            doc_id = hit[0]
            if doc_id > last_doc:
                metadata = db.toms[doc_id]
                last_doc = doc_id  # remember the doc so metadata is only fetched once per document
            author = metadata["author"]
            title = metadata["title"]
    
            if author in authors:
                authors[author] += 1
            else:
                authors[author] = 1
            if title in titles:
                titles[title] += 1
            else:
                titles[title] = 1

        yield("<table class='philologic_frequency tablesorter' title='author'>\n")
        yield("<thead><tr class='philologic_frequency_header_row'><th>Author</th><th>Frequency</th></tr></thead>\n<tbody>")
        for n,f in sorted(authors.items(),key=lambda x:x[1], reverse=True):
            url = "./?query=%s&author=%s&title=%s" % (qs,n,q_title)
            yield("  <tr class='philologic_frequency_row'>\n")
            yield("    <td class='philologic_frequency_key'><a href='%s'>%s</a></td>\n" % (urllib.quote_plus(url,"/?&="),n)) #UGLY ENCODING HACK
            yield("    <td class='philologic_frequency_value'>%s</td>\n" % f)
            yield("  </tr>\n")
        yield("</tbody></table>\n")
        # shouldn't dump more than necessary.  go by field.  
        # should also have links to more pages of table.
#        yield("<table class='philologic_frequency' title='title'>\n")
#         for n,f in sorted(titles.items(),key=lambda x:x[1], reverse=True):
#             #url = "./?query=%s&author=%s&title=%s" % (qs,n,q_title)
#             yield("  <tr class='philologic_frequency_row'>\n")
#             yield("    <td class='philologic_frequency_key'><a href='%s'>%s</a></td>\n" % (url,n))
#             yield("    <td class='philologic_frequency_value'>%s</td>\n" % f)
#             yield("  </tr>\n")
#         yield("</table>\n")
    yield("</body>")
    yield("</html>")
def freq_json_service(environ,start_response):
    status = '200 OK' # HTTP Status
    headers = [('Content-type', 'application/json; charset=UTF-8')] # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath,7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    metadata_fields = ["author","title","date"]
    query_metadata = {}
    
    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method",[0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg",[0])[0] or 0

    q_start = int(environ["parsed_params"].get('q_start',[0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end',[0])[0]) or q_start + 1999 
    width = int(environ["parsed_params"].get("width",[0])[0]) or 100
    
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]
    
    field = environ["parsed_params"].get("field",[""])[0] or "author"
    #need a field param.  author default?
    status = "running query for %s @ %s: " % (qs,dbname)

    content = ""
    q = db.query(qs,query_method,query_arg,**query_metadata)
    
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. aggregating." % (l)

    last_doc = -1
    authors = {}
    titles = {}
    counts = {}
    totalcounts = {}
    decades = {}
    result = []  # initialize here so the JSON wrapper below is valid even with zero hits
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    if l > 0:
        for hit in q[q_start - 1:q_end]: #need to page q
            byte_offset = hit[6]
            offsets = list(hit[6:])
            offsets.reverse()
            doc_id = hit[0]
            if doc_id > last_doc:
                metadata = db.toms[doc_id]
            
            label = metadata[field]
            if field == "date":
                date = label
                label = "%s0 - %s9" % (date[:-1], date[:-1])
                decades[label] = date[:-1] + "%"
            counts[label] = counts.get(label,0) + 1
            if doc_id != last_doc:
                count = metadata["word_count"]
            last_doc = doc_id  # remember the doc so per-document lookups are not repeated
        
        result = []

        filter_metadata = dict(query_metadata.items()) 
        for n,f in sorted(counts.items(),key=lambda x:x[1], reverse=True):
            if field == "date":
                total = sum(int(ob["word_count"]) for ob in db.toms.dbh.execute("select word_count from toms where date like '%s';" % decades[n]))
                filter_metadata[field] = decades[n]
            else:
                total = sum(int(ob["word_count"]) for ob in db.toms.dbh.execute("select word_count from toms where %s=?;" % field,(n,)))                        
                filter_metadata[field] = n
            rate = float(f) / total
            url = make_link(qs,query_method,query_arg,**filter_metadata)
            result.append( {"label":n,"count":f,"total_count":total,"rate":rate,"url":url})

    pages = []
    page_width = q_end - q_start + 1
    page_metadata = dict(query_metadata.items())
    page_metadata["report"] = "frequency"
    page_metadata["field"] = field
    for p_start in range(q_end + 1,l,page_width):
        p_end = min(l,p_start + page_width - 1)
        p_url = make_link(qs,query_method,query_arg,start=p_start,end=p_end,**page_metadata)
        pages.append(p_url)

    wrapper = {"length":l,"remaining_pages":pages,"result":result,"q_start":q_start,"q_end":q_end,"field":field}
    yield(json.dumps(wrapper,indent=1))
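
# freq_json_service returns a single JSON document describing the frequency
# table plus links to the remaining pages.  A minimal client-side sketch using
# only the Python 2 standard library; the base URL is an assumption for
# illustration.
def _example_fetch_freq_json(base_url, query):
    import json
    import urllib
    import urllib2
    params = urllib.urlencode({"query": query, "field": "author"})
    wrapper = json.loads(urllib2.urlopen(base_url + "?" + params).read())
    # wrapper["result"] is a list of dicts with "label", "count",
    # "total_count", "rate" and "url" keys, sorted by descending count.
    return wrapper["result"]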
def colloc_filter_service(environ, start_response):
    status = '200 OK' # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')] # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath,7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    
    qs = environ["parsed_params"]["query"][0]
    q_start = int(environ["parsed_params"].get('q_start',[0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end',[0])[0]) or q_start + 49  
    width = int(environ["parsed_params"].get("width",[0])[0]) or 100
    query_method = environ["parsed_params"].get("query_method",[0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg",[0])[0] or 0
    
    q_title = environ["parsed_params"].get("title",[""])[0] or ""
    q_author = environ["parsed_params"].get("author",[""])[0] or ""
    filter_word = environ["parsed_params"]["colloc_filter"][0]
    status = "running query for %s @ %s: " % (qs,dbname)

    metadata = {}
    if q_author:
        metadata["author"] = q_author
    if q_title:
        metadata["title"] = q_title

    metadata_fields = ["author","title","date"]
    query_metadata = {}
    
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]    

    content = ""
    q = db.query(qs,query_method,query_arg,**query_metadata)
    
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. filtering for '%s'." % (l,filter_word)

    last_doc = -1
    collocates = {}
    text_file = None
    
    yield("<html>\n")
    yield("<head>")
    yield("<title>%s: collocation results %d to %d for \"%s\" filtered by \"%s\"</title>" % (dbname, q_start,q_end,qs, filter_word))
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    yield("</head>\n")
    yield("<body>\n")
    yield "<div class='philologic_concordance'>"
    yield("<p class='description'>%s</p>\n" % status)
    if l > 0:
        f_count = 0
        yield "<ol start='%d'>\n" % q_start
        for hit in q:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.sort()
            first_offset = offsets[0]
            filename = db.toms[doc_id]["filename"]
            author = db.toms[doc_id]["author"]
            title = db.toms[doc_id]["title"]                        
            date = db.toms[doc_id]["date"]
            
            url_comps = [str(doc_id)]
            for id_comp in hit[1:]:
                id_comp = str(id_comp)
                if url_comps + [id_comp] in db.toms:
                    url_comps += [id_comp]
            url = "/".join(["."] + url_comps )

            conc_start = first_offset - 100
            if conc_start < 0: conc_start = 0
                        
            if doc_id > last_doc:
                filename = db.toms[doc_id]["filename"]
                last_doc = doc_id
                text_path = dbpath + "/TEXT/" + filename
                text_file = open(text_path)

            text_file.seek(conc_start)
            text = text_file.read(width * 2)

            
            #trim the text
            need_l_trim = re.search("^[^<]*>",text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0
                
            need_r_trim = re.search("<[^>]*$",text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0
            
            conc_start += l_trim_off
            if filter_word in [t.group(2) for t in list(re.finditer(r"(<[^>]+>)|(\w+)",text))[1:-1]]:
                # then we have a virtual "hit":
                f_count += 1
                if f_count < q_start:
                    pass
                elif f_count > q_end:
                    break
                else:
                    yield "<li class='philologic_occurence'>"
    
                    yield "<a href='%s' class='philologic_cite'>" % url # should be a link
                    yield "<span class='philologic_property' title='author'>%s</span>, " % author
                    yield "<span class='philologic_property' title='title'>%s</span>: " % title
                    yield "<span class='philologic_property' title='date'>(%s)</span>" % date
                    yield "</a>\n"
    
                    p_start = conc_start - len("<div class='philologic_context'>...") + l_trim_off
                    parser = shlaxtree.TokenizingParser(p_start,offsets)
                    parser.feed("<div class='philologic_context'>..." + text + "...</div>\n")
                    tree = parser.close()
                    transform(tree)
                    context = shlaxtree.ElementTree.tostring(tree,encoding='utf-8')
                    yield context
                    yield "</li>\n"

        yield "</ol>"
        query_metadata["colloc_filter"] = filter_word
        next_url = make_link(qs,query_method,query_arg,q_end + 1,q_end + 50, **query_metadata)
        yield "<a href='%s'>more</a>" % next_url
    yield "</div>"
    yield("</body>")
    yield("</html>")
def colloc_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],
                                                 keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0

    qs = environ["parsed_params"]["query"][0]
    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 0
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 50
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    q_title = environ["parsed_params"].get("title", [""])[0] or ""
    q_author = environ["parsed_params"].get("author", [""])[0] or ""
    status = "running query for %s @ %s: " % (qs, dbname)

    metadata = {}
    if q_author:
        metadata["author"] = q_author
    if q_title:
        metadata["title"] = q_title

    content = ""
    q = db.query(qs, **metadata)

    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. aggregating." % (l)

    last_doc = -1
    collocates = {}
    text_file = None

    yield ("<html>\n")
    yield ("<head>")
    yield ("<title>%s: collocation table for \"%s\"</title>" % (dbname, qs))
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    yield ("</head>\n")
    yield ("<body>\n")
    yield ("<p class='description'>%s</p>\n" % status)
    if l > 0:
        for hit in q:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            first_offset = offsets[0]

            conc_start = first_offset - 100
            if conc_start < 0: conc_start = 0

            if doc_id > last_doc:
                filename = db.toms[doc_id]["filename"]
                last_doc = doc_id
                text_path = dbpath + "/TEXT/" + filename
                text_file = open(text_path)

            text_file.seek(conc_start)
            text = text_file.read(width * 2)

            #trim the text
            need_l_trim = re.search("^[^<]*>", text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0

            need_r_trim = re.search("<[^>]*$", text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0

            conc_start += l_trim_off
            for token in list(re.finditer(r"(<[^>]+>)|(\w+)", text))[1:-1]:
                if token.group(2):
                    t_off = token.start(2) + conc_start
                    if t_off not in offsets:
                        t_type = token.group(2)
                        if t_type in collocates:
                            collocates[t_type] += 1
                        else:
                            collocates[t_type] = 1

        yield ("<table class='philologic_collocation'>\n")
        for n, f in sorted(collocates.items(),
                           key=lambda x: x[1],
                           reverse=True):
            url = "./?query=%s&author=%s&title=%s&colloc_filter=%s" % (
                qs, q_author, q_title, n)
            yield ("  <tr class='philologic_collocation_row'>\n")
            yield (
                "    <td class='philologic_collocation_key'><a href='%s'>%s</a></td>\n"
                % (url, n))
            yield ("    <td class='philologic_collocation_value'>%s</td>\n" %
                   f)
            yield ("  </tr>\n")
        yield ("</table>\n")

    yield ("</body>")
    yield ("</html>")
def conc_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],
                                                 keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0

    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method", [0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg", [0])[0] or 0
    q_start = int(
        environ["parsed_params"].get('q_start', [0])[0]
    ) or 1  # better checking.  this doesn't catch 0...which helps for now.
    q_end = int(environ["parsed_params"].get('q_end', [
        0
    ])[0]) or q_start + 49  # fine.  python lists are exclusive at the end.
    width = int(environ["parsed_params"].get("width", [0])[0]) or 400
    status = "running query for %s @ %s with arg %s" % (qs, dbname, query_arg)

    metadata_fields = db.locals["metadata_fields"]
    query_metadata = {}

    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]

    content = ""
    q = db.query(qs, query_method, query_arg, **query_metadata)

    while not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += ".  displaying %d - %d of %d hits." % (q_start, q_end, l)

    yield ("<html>")
    yield ("<head>")
    yield ("<title>%s: search for %s</title>" % (dbname, qs))
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    yield ("</head>")
    yield ("<body>")
    yield ("<div class='philologic_concordance'>")
    yield ("<p class='description'>%s</p>" % status)
    if l > 0:
        yield "<ol start='%d'>" % q_start
        for hit in q[q_start - 1:q_end]:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.sort()
            first_offset = offsets[0]
            doc = db.toms[doc_id]
            div1 = db.toms[hit[:2]]
            div2 = db.toms[hit[:3]]
            filename = doc["filename"]

            url = hit_to_link(db, hit)

            yield "<li class='philologic_occurence'>\n"

            yield db.locals["make_cite"](db, hit, url)

            yield ("</a>\n")

            conc_start = first_offset - width
            if conc_start < 0: conc_start = 0
            text_path = dbpath + "/TEXT/" + filename
            text_file = open(text_path)
            text_file.seek(conc_start)
            text = text_file.read(width * 2)
            context = format_stream(text, conc_start, offsets)
            yield (context)

            yield ("</li>\n")
        yield ("</ol>")
        pages = l / 50 + 1
        more = ""

        yield ("<div class='more'>")
        prev_off = 1
        next_off = min(prev_off + 49, l)
        p_count = 0

        while True:
            new_uri = make_link(qs,
                                query_method,
                                query_arg,
                                start=prev_off,
                                end=next_off,
                                **query_metadata)
            yield "<a href='%s'>%d-%d</a> " % (new_uri, prev_off, next_off)
            if prev_off < q_start:
                yield "... "
                prev_off = q_start + 50
            else:
                prev_off += 50
            next_off = min(prev_off + 49, l)
            p_count += 1
            if p_count > 10: break
            if next_off == l: break
        last_page = 50 * (l // 50) + 1
        if prev_off <= last_page:
            if prev_off < last_page:
                yield "... "
            prev_off = last_page
            next_off = l
            new_uri = make_link(qs,
                                query_method,
                                query_arg,
                                start=prev_off,
                                end=next_off,
                                **query_metadata)
            yield "<a href='%s'>%d-%d</a> " % (new_uri, prev_off, next_off)
        yield ("</div>")
    yield ("</div>")
    yield ("</body>")
    yield ("</html>")
def simple_dispatch(environ, start_response):
    status = '200 OK' # HTTP Status
    headers = [('Content-type', 'text/html')] # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    # yield repr(environ)
    dbname = shift_path_info(environ)
    myname = environ["SCRIPT_NAME"]
    if not dbname: return
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath,7)
    toms =  db.toms
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    if "meta1field" in environ["parsed_params"] and environ["parsed_params"]["meta1field"][0] and "meta1" in environ["parsed_params"] and environ["parsed_params"]["meta1"][0]:
        c_field = environ["parsed_params"]["meta1field"][0]
        c_value = environ["parsed_params"]["meta1"][0]
        meta_dict = {c_field:c_value}
    else:
        meta_dict = {}
    if "query" in environ["parsed_params"]:
        qs = environ["parsed_params"]["query"][0]
        q_start = int(environ["parsed_params"].get('q_start',[0])[0]) or 0
        q_end = int(environ["parsed_params"].get('q_end',[0])[0]) or q_start + 50        
        f = DirtyFormatter.Formatter({})

        yield "<html><body><div class='conc_response'>running query for '%s'<br/>" % qs 
        print >> environ["wsgi.errors"], str(corpus_file)

        q = db.query(qs,**meta_dict)

        while len(q) <= q_end and not q.done:
            time.sleep(.05)
            q.update()
        l = len(q)
        yield "<div class='status' n=%d%s>%d hits</div>" % (l," done='true'" if q.done else "", l)
        
        yield "<div class='results'>"
        for hit in q[q_start:q_end]:
            context_file = ""
            context_file_end = 0
            rstr = ""
            yield "<div class='hit' id='%s' offset=%d>" % ("/".join(str(x) for x in hit[:6]),hit[6])

            #get the metadata for all unique parents of the hit.
            for i in range(1,len(hit)):
                hit_parent = hit[:i]
                if hit[i-1] == 0:
                    continue
                if hit_parent in toms:
                    parent = toms[hit_parent]
                    rstr += format_object(parent,"")
                    if parent["filename"]:
                        context_file = parent["filename"]
                        context_file_end = parent["end"]

            yield "<span class='cite'>" + rstr + "</span>"
            path = dbpath + "/TEXT/" + context_file
            offset = hit[6] #dangerous...
            (left,word,right) = get_raw_context(path,offset,context_file_end,250)
            (left,word,right) = [f.format(x) for x in (left,word,right)]
            content = "<span class='left'>%s</span> <span class='word'>%s</span> <span class='right'>%s</right>" % (left,word,right)
            yield "<span class='content'>%s</span>" % content
            yield "</div>"
            count += 1

        yield "</div>%d hits" % count 
        yield "</div></body></html>"
def colloc_filter_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],
                                                 keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0

    qs = environ["parsed_params"]["query"][0]
    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 49
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    query_method = environ["parsed_params"].get("query_method", [0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg", [0])[0] or 0

    q_title = environ["parsed_params"].get("title", [""])[0] or ""
    q_author = environ["parsed_params"].get("author", [""])[0] or ""
    filter_word = environ["parsed_params"]["colloc_filter"][0]
    status = "running query for %s @ %s: " % (qs, dbname)

    metadata = {}
    if q_author:
        metadata["author"] = q_author
    if q_title:
        metadata["title"] = q_title

    metadata_fields = ["author", "title", "date"]
    query_metadata = {}

    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]

    content = ""
    q = db.query(qs, query_method, query_arg, **query_metadata)

    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. filtering for '%s'." % (l, filter_word)

    last_doc = -1
    collocates = {}
    text_file = None

    yield ("<html>\n")
    yield ("<head>")
    yield (
        "<title>%s: collocation results %d to %d for \"%s\" filtered by \"%s\"</title>"
        % (dbname, q_start, q_end, qs, filter_word))
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    yield ("</head>\n")
    yield ("<body>\n")
    yield "<div class='philologic_concordance'>"
    yield ("<p class='description'>%s</p>\n" % status)
    if l > 0:
        f_count = 0
        yield "<ol start='%d'>\n" % q_start
        for hit in q:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.sort()
            first_offset = offsets[0]
            filename = db.toms[doc_id]["filename"]
            author = db.toms[doc_id]["author"]
            title = db.toms[doc_id]["title"]
            date = db.toms[doc_id]["date"]

            url_comps = [str(doc_id)]
            for id_comp in hit[1:]:
                id_comp = str(id_comp)
                if url_comps + [id_comp] in db.toms:
                    url_comps += [id_comp]
            url = "/".join(["."] + url_comps)

            conc_start = first_offset - 100
            if conc_start < 0: conc_start = 0

            if doc_id > last_doc:
                filename = db.toms[doc_id]["filename"]
                last_doc = doc_id
                text_path = dbpath + "/TEXT/" + filename
                text_file = open(text_path)

            text_file.seek(conc_start)
            text = text_file.read(width * 2)

            #trim the text
            need_l_trim = re.search("^[^<]*>", text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0

            need_r_trim = re.search("<[^>]*$", text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0

            conc_start += l_trim_off
            if filter_word in [
                    t.group(2)
                    for t in list(re.finditer(r"(<[^>]+>)|(\w+)", text))[1:-1]
            ]:
                # then we have a virtual "hit":
                f_count += 1
                if f_count < q_start:
                    pass
                elif f_count > q_end:
                    break
                else:
                    yield "<li class='philologic_occurence'>"

                    yield "<a href='%s' class='philologic_cite'>" % url  # should be a link
                    yield "<span class='philologic_property' title='author'>%s</span>, " % author
                    yield "<span class='philologic_property' title='title'>%s</span>: " % title
                    yield "<span class='philologic_property' title='date'>(%s)</span>" % date
                    yield "</a>\n"

                    p_start = conc_start - len(
                        "<div class='philologic_context'>...") + l_trim_off
                    parser = shlaxtree.TokenizingParser(p_start, offsets)
                    parser.feed("<div class='philologic_context'>..." + text +
                                "...</div>\n")
                    tree = parser.close()
                    transform(tree)
                    context = shlaxtree.ElementTree.tostring(tree,
                                                             encoding='utf-8')
                    yield context
                    yield "</li>\n"

        yield "</ol>"
        query_metadata["colloc_filter"] = filter_word
        next_url = make_link(qs, query_method, query_arg, q_end + 1,
                             q_end + 50, **query_metadata)
        yield "<a href='%s'>more</a>" % next_url
    yield "</div>"
    yield ("</body>")
    yield ("</html>")
def colloc_service(environ,start_response):
    status = '200 OK' # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')] # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath,7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    
    qs = environ["parsed_params"]["query"][0]
    q_start = int(environ["parsed_params"].get('q_start',[0])[0]) or 0
    q_end = int(environ["parsed_params"].get('q_end',[0])[0]) or q_start + 50  
    width = int(environ["parsed_params"].get("width",[0])[0]) or 100
    q_title = environ["parsed_params"].get("title",[""])[0] or ""
    q_author = environ["parsed_params"].get("author",[""])[0] or ""
    status = "running query for %s @ %s: " % (qs,dbname)

    metadata = {}
    if q_author:
        metadata["author"] = q_author
    if q_title:
        metadata["title"] = q_title

    content = ""
    q = db.query(qs,**metadata)
    
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. aggregating." % (l)

    last_doc = -1
    collocates = {}
    text_file = None
    
    yield("<html>\n")
    yield("<head>")
    yield("<title>%s: collocation table for \"%s\"</title>" % (dbname, qs))
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    yield("</head>\n")
    yield("<body>\n")
    yield("<p class='description'>%s</p>\n" % status)
    if l > 0:
        for hit in q:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            first_offset = offsets[0]

            conc_start = first_offset - 100
            if conc_start < 0: conc_start = 0
                        
            if doc_id > last_doc:
                filename = db.toms[doc_id]["filename"]
                last_doc = doc_id
                text_path = dbpath + "/TEXT/" + filename
                text_file = open(text_path)                       

            text_file.seek(conc_start)
            text = text_file.read(width * 2)

            
            #trim the text
            need_l_trim = re.search("^[^<]*>",text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0
                
            need_r_trim = re.search("<[^>]*$",text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0
            
            conc_start += l_trim_off
            for token in list(re.finditer(r"(<[^>]+>)|(\w+)",text))[1:-1]:
                if token.group(2):
                    t_off = token.start(2) + conc_start
                    if t_off not in offsets:
                        t_type = token.group(2)
                        if t_type in collocates:
                            collocates[t_type] += 1
                        else:
                            collocates[t_type] = 1

        yield("<table class='philologic_collocation'>\n")
        for n,f in sorted(collocates.items(),key=lambda x:x[1],reverse=True):
            url = "./?query=%s&author=%s&title=%s&colloc_filter=%s" % (qs,q_author,q_title,n)
            yield("  <tr class='philologic_collocation_row'>\n")
            yield("    <td class='philologic_collocation_key'><a href='%s'>%s</a></td>\n" % (url,n))
            yield("    <td class='philologic_collocation_value'>%s</td>\n" % f)
            yield("  </tr>\n")
        yield("</table>\n")


    yield("</body>")
    yield("</html>")
def conc_service(environ, start_response):
    status = '200 OK' # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')] # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath,7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    
    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method",[0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg",[0])[0] or 0
    q_start = int(environ["parsed_params"].get('q_start',[0])[0]) or 1 # better checking.  this doesn't catch 0...which helps for now.
    q_end = int(environ["parsed_params"].get('q_end',[0])[0]) or q_start + 49 # fine.  python lists are exclusive at the end.
    width = int(environ["parsed_params"].get("width",[0])[0]) or 100
    status = "running query for %s @ %s with arg %s" % (qs,dbname,query_arg)
    
    metadata_fields = ["author","title","date"]
    query_metadata = {}
    
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]    

    content = ""
    q = db.query(qs,query_method,query_arg,**query_metadata)
    
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += ".  displaying %d - %d of %d hits." % (q_start,q_end,l)
        
    yield("<html>")
    yield("<head>")
    yield("<title>%s: search for %s</title>" % (dbname, qs))
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    yield("</head>")
    yield("<body>")
    yield("<div class='philologic_concordance'>")
    yield("<p class='description'>%s</p>" % status)
    if l > 0:
        yield "<ol start='%d'>" % q_start
        for hit in q[q_start-1:q_end]:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.sort()
            first_offset = offsets[0]
            filename = db.toms[doc_id]["filename"]
            author = db.toms[doc_id]["author"]
            title = db.toms[doc_id]["title"]            
            date = db.toms[doc_id]["date"]

            url = hit_to_link(db,hit)
            yield "<li class='philologic_occurence'>\n"

            yield "<a class='philologic_cite' href='%s'>" % url 
            yield("<span class='philologic_property' title='author'>%s</span>, " % author)
            yield("<span class='philologic_property' title='title'>%s</span> " % title)
            yield("<span class='philologic_property' title='date'>(%s)</span>" % date)
            yield("</a>\n")
            conc_start = first_offset - width
            if conc_start < 0: conc_start = 0
            text_path = dbpath + "/TEXT/" + filename
            text_file = open(text_path)
            text_file.seek(conc_start)
            text = text_file.read(width * 2)
            
            #trim the text
            need_l_trim = re.search("^[^<]*>",text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0
                
            need_r_trim = re.search("<[^>]*$",text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0
                
            p_start = conc_start - len("<div class='philologic_context'>...") + l_trim_off
            parser = shlaxtree.TokenizingParser(p_start,offsets)
            parser.feed("<div class='philologic_context'>..." + text + "...</div>")
            tree = parser.close()
            transform(tree)
            context = shlaxtree.ElementTree.tostring(tree,encoding='utf-8')
            yield(context)

            yield("</li>\n")
        yield("</ol>")
        pages = l / 50 + 1
        more = ""

        yield("<div class='more'>")
        prev_off = 1
        next_off = min(prev_off + 49,l)
        p_count = 0

        while True:
            new_uri = make_link(qs,query_method,query_arg,start=prev_off,end=next_off,**query_metadata)
            yield "<a href='%s'>%d-%d</a> " % (new_uri,prev_off,next_off)
            if prev_off < q_start:
                yield "... "
                prev_off = q_start + 50
            else:
                prev_off += 50
            next_off = min(prev_off + 49,l)
            p_count += 1
            if p_count > 10: break
            if next_off == l: break
        last_page = 50 * (l // 50) + 1
        if prev_off <= last_page:
            if prev_off < last_page:
                yield "... "
            prev_off = last_page
            next_off = l
            new_uri = make_link(qs,query_method,query_arg,start=prev_off,end=next_off,**query_metadata)
            yield "<a href='%s'>%d-%d</a> " % (new_uri,prev_off,next_off)
        yield("</div>")
    yield("</div>")
    yield("</body>")
    yield("</html>")
def conc_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],
                                                 keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0

    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method", [0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg", [0])[0] or 0
    q_start = int(
        environ["parsed_params"].get('q_start', [0])[0]
    ) or 1  # better checking.  this doesn't catch 0...which helps for now.
    q_end = int(environ["parsed_params"].get('q_end', [
        0
    ])[0]) or q_start + 49  # fine.  python lists are exclusive at the end.
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    status = "running query for %s @ %s with arg %s" % (qs, dbname, query_arg)

    metadata_fields = ["author", "title", "date"]
    query_metadata = {}

    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]

    content = ""
    q = db.query(qs, query_method, query_arg, **query_metadata)

    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += ".  displaying %d - %d of %d hits." % (q_start, q_end, l)

    yield ("<html>")
    yield ("<head>")
    yield ("<title>%s: search for %s</title>" % (dbname, qs))
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    yield ("</head>")
    yield ("<body>")
    yield ("<div class='philologic_concordance'>")
    yield ("<p class='description'>%s</p>" % status)
    if l > 0:
        yield "<ol start='%d'>" % q_start
        for hit in q[q_start - 1:q_end]:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.sort()
            first_offset = offsets[0]
            filename = db.toms[doc_id]["filename"]
            author = db.toms[doc_id]["author"]
            title = db.toms[doc_id]["title"]
            date = db.toms[doc_id]["date"]

            url = hit_to_link(db, hit)
            yield "<li class='philologic_occurence'>\n"

            yield "<a class='philologic_cite' href='%s'>" % url
            yield (
                "<span class='philologic_property' title='author'>%s</span>, "
                % author)
            yield (
                "<span class='philologic_property' title='title'>%s</span> " %
                title)
            yield (
                "<span class='philologic_property' title='date'>(%s)</span>" %
                date)
            yield ("</a>\n")
            conc_start = first_offset - width
            if conc_start < 0: conc_start = 0
            text_path = dbpath + "/TEXT/" + filename
            text_file = open(text_path)
            text_file.seek(conc_start)
            text = text_file.read(width * 2)

            #trim the text
            need_l_trim = re.search("^[^<]*>", text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0

            need_r_trim = re.search("<[^>]*$", text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0

            p_start = conc_start - len(
                "<div class='philologic_context'>...") + l_trim_off
            parser = shlaxtree.TokenizingParser(p_start, offsets)
            parser.feed("<div class='philologic_context'>..." + text +
                        "...</div>")
            tree = parser.close()
            transform(tree)
            context = shlaxtree.ElementTree.tostring(tree, encoding='utf-8')
            yield (context)

            yield ("</li>\n")
        yield ("</ol>")
        pages = l / 50 + 1
        more = ""

        yield ("<div class='more'>")
        prev_off = 1
        next_off = min(prev_off + 49, l)
        p_count = 0

        while True:
            new_uri = make_link(qs,
                                query_method,
                                query_arg,
                                start=prev_off,
                                end=next_off,
                                **query_metadata)
            yield "<a href='%s'>%d-%d</a> " % (new_uri, prev_off, next_off)
            if prev_off < q_start:
                yield "... "
                prev_off = q_start + 50
            else:
                prev_off += 50
            next_off = min(prev_off + 49, l)
            p_count += 1
            if p_count > 10: break
            if next_off == l: break
        last_page = 50 * (l // 50) + 1
        if prev_off <= last_page:
            if prev_off < last_page:
                yield "... "
            prev_off = last_page
            next_off = l
            new_uri = make_link(qs,
                                query_method,
                                query_arg,
                                start=prev_off,
                                end=next_off,
                                **query_metadata)
            yield "<a href='%s'>%d-%d</a> " % (new_uri, prev_off, next_off)
        yield ("</div>")
    yield ("</div>")
    yield ("</body>")
    yield ("</html>")
def colloc_json_service(environ,start_response):
    status = '200 OK' # HTTP Status
    headers = [('Content-type', 'application/json; charset=UTF-8')] # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath,7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    
    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method",[0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg",[0])[0] or 0

    q_start = int(environ["parsed_params"].get('q_start',[0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end',[0])[0]) or q_start + 499  
    width = int(environ["parsed_params"].get("width",[0])[0]) or 100
    status = "running query for %s @ %s: " % (qs,dbname)

    metadata_fields = ["author","title","date"]
    query_metadata = {}
    
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]    

    print >> sys.stderr, query_metadata
    content = ""
    q = db.query(qs,query_method,query_arg,**query_metadata)
    
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    print >> sys.stderr, "%d total hits. aggregating." % (l)

    last_doc = -1
    collocates = {}
    text_file = None
    results = []
    total_words = 0  # initialized here so the JSON wrapper below is valid even with zero hits
    if l > 0:
        for hit in q[q_start - 1:q_end]:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.reverse()
            first_offset = offsets[0]

            conc_start = first_offset - 100
            if conc_start < 0: conc_start = 0
                        
            if doc_id > last_doc:
                filename = db.toms[doc_id]["filename"]
                last_doc = doc_id
                text_path = dbpath + "/TEXT/" + filename
                text_file = open(text_path)                       

            text_file.seek(conc_start)
            text = text_file.read(width * 2)

            
            #trim the text
            need_l_trim = re.search("^[^<]*>",text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0
                
            need_r_trim = re.search("<[^>]*$",text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0
            
            conc_start += l_trim_off
            for token in list(re.finditer(r"(<[^>]+>)|(\w+)",text))[1:-1]:
                if token.group(2):
                    t_off = token.start(2) + conc_start
                    if t_off not in offsets:
                        t_type = token.group(2)
                        if t_type in collocates:
                            collocates[t_type] += 1
                        else:
                            collocates[t_type] = 1

        results = []
        total_words = 0
        page_width = q_end - q_start + 1
        for n,f in sorted(collocates.items(),key=lambda x:x[1],reverse=True):
            if f > 5: # UGLY!!!
                filter_metadata = dict(query_metadata.items())
                filter_metadata["colloc_filter"] = n
                url = make_link(qs,query_method,query_arg,**filter_metadata)
                results.append({"label":n,"count":f,"url":url,"rate":float(f)/l})
                total_words += f
    pages = []
    page_width = q_end - q_start + 1
    for p_start in range(q_end + 1,l,page_width):
        page_metadata = dict(query_metadata.items())
        page_metadata["field"] = "collocates"
        page_metadata["format"] = "json"
        page_metadata["report"] = "frequency"
        
        p_end = min(l,p_start + page_width - 1)
        p_url = make_link(qs,query_method,query_arg,start=p_start,end=p_end,**page_metadata)
        pages.append(p_url)
    wrapper = {"result":results,"remaining_pages":pages,"length":l,"q_start":q_start,"q_end":q_end,"total_words":total_words,"field":"word"}
    yield json.dumps(wrapper,indent=1)
def conc_service(environ, start_response):
    status = '200 OK' # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')] # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath,7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    
    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method",[0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg",[0])[0] or 0
    q_start = int(environ["parsed_params"].get('q_start',[0])[0]) or 1 # better checking.  this doesn't catch 0...which helps for now.
    q_end = int(environ["parsed_params"].get('q_end',[0])[0]) or q_start + 49 # fine.  python lists are exclusive at the end.
    width = int(environ["parsed_params"].get("width",[0])[0]) or 400
    status = "running query for %s @ %s with arg %s" % (qs,dbname,query_arg)
    
    metadata_fields = db.locals["metadata_fields"]
    query_metadata = {}
    
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]    

    content = ""
    q = db.query(qs,query_method,query_arg,**query_metadata)
    
    while not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    status += ".  displaying %d - %d of %d hits." % (q_start,q_end,l)
        
    yield("<html>")
    yield("<head>")
    yield("<title>%s: search for %s</title>" % (dbname, qs))
    #opensearch metadata in meta tags here.
    #total length, start index, hits per page.
    yield("</head>")
    yield("<body>")
    yield("<div class='philologic_concordance'>")
    yield("<p class='description'>%s</p>" % status)
    if l > 0:
        yield "<ol start='%d'>" % q_start
        for hit in q[q_start-1:q_end]:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.sort()
            first_offset = offsets[0]
            doc = db.toms[doc_id]
            div1 = db.toms[hit[:2]]
            div2 = db.toms[hit[:3]]
            filename = doc["filename"]

            url = hit_to_link(db,hit)

            yield "<li class='philologic_occurence'>\n"

            yield db.locals["make_cite"](db,hit,url)

            yield("</a>\n")

            conc_start = first_offset - width
            if conc_start < 0: conc_start = 0
            text_path = dbpath + "/TEXT/" + filename
            text_file = open(text_path)
            text_file.seek(conc_start)
            text = text_file.read(width * 2)
            context = format_stream(text,conc_start,offsets)
            yield(context)

            yield("</li>\n")
        yield("</ol>")
        pages = l / 50 + 1
        more = ""

        yield("<div class='more'>")
        prev_off = 1
        next_off = min(prev_off + 49,l)
        p_count = 0

        while True:
            new_uri = make_link(qs,query_method,query_arg,start=prev_off,end=next_off,**query_metadata)
            yield "<a href='%s'>%d-%d</a> " % (new_uri,prev_off,next_off)
            if prev_off < q_start:
                yield "... "
                prev_off = q_start + 50
            else:
                prev_off += 50
            next_off = min(prev_off + 49,l)
            p_count += 1
            if p_count > 10: break
            if next_off == l: break
        last_page = 50 * (l // 50) + 1
        if prev_off <= last_page:
            if prev_off < last_page:
                yield "... "
            prev_off = last_page
            next_off = l
            new_uri = make_link(qs,query_method,query_arg,start=prev_off,end=next_off,**query_metadata)
            yield "<a href='%s'>%d-%d</a> " % (new_uri,prev_off,next_off)
        yield("</div>")
    yield("</div>")
    yield("</body>")
    yield("</html>")
def colloc_json_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'application/json; charset=UTF-8')
               ]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"],
                                                 keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0

    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method", [0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg", [0])[0] or 0

    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 499
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    status = "running query for %s @ %s: " % (qs, dbname)

    metadata_fields = ["author", "title", "date"]
    query_metadata = {}

    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]

    print >> sys.stderr, query_metadata
    content = ""
    q = db.query(qs, query_method, query_arg, **query_metadata)

    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()

    l = len(q)
    if q_end > l:
        q_end = l
    print >> sys.stderr, "%d total hits. aggregating." % (l)

    last_doc = -1
    collocates = {}
    text_file = None
    results = []
    total_words = 0  # initialized here so the JSON wrapper below is valid even with zero hits
    if l > 0:
        for hit in q[q_start - 1:q_end]:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.reverse()
            first_offset = offsets[0]

            conc_start = first_offset - 100
            if conc_start < 0: conc_start = 0

            if doc_id > last_doc:
                filename = db.toms[doc_id]["filename"]
                last_doc = doc_id
                text_path = dbpath + "/TEXT/" + filename
                text_file = open(text_path)

            text_file.seek(conc_start)
            text = text_file.read(width * 2)

            #trim the text
            need_l_trim = re.search("^[^<]*>", text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0

            need_r_trim = re.search("<[^>]*$", text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0

            conc_start += l_trim_off
            for token in list(re.finditer(r"(<[^>]+>)|(\w+)", text))[1:-1]:
                if token.group(2):
                    t_off = token.start(2) + conc_start
                    if t_off not in offsets:
                        t_type = token.group(2)
                        if t_type in collocates:
                            collocates[t_type] += 1
                        else:
                            collocates[t_type] = 1

        results = []
        total_words = 0
        page_width = q_end - q_start + 1
        for n, f in sorted(collocates.items(),
                           key=lambda x: x[1],
                           reverse=True):
            if f > 5:  # UGLY!!!
                filter_metadata = dict(query_metadata.items())
                filter_metadata["colloc_filter"] = n
                url = make_link(qs, query_method, query_arg, **filter_metadata)
                results.append({
                    "label": n,
                    "count": f,
                    "url": url,
                    "rate": float(f) / l
                })
                total_words += f
    pages = []
    page_width = q_end - q_start + 1
    for p_start in range(q_end + 1, l, page_width):
        page_metadata = dict(query_metadata.items())
        page_metadata["field"] = "collocates"
        page_metadata["format"] = "json"
        page_metadata["report"] = "frequency"

        p_end = min(l, p_start + page_width - 1)
        p_url = make_link(qs,
                          query_method,
                          query_arg,
                          start=p_start,
                          end=p_end,
                          **page_metadata)
        pages.append(p_url)
    wrapper = {
        "result": results,
        "remaining_pages": pages,
        "length": l,
        "q_start": q_start,
        "q_end": q_end,
        "total_words": total_words,
        "field": "word"
    }
    yield json.dumps(wrapper, indent=1)
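
# make_link is called throughout these services but is not defined in this
# excerpt.  Judging only from the call sites, it builds a relative URL from
# the query string, query method/argument, optional start/end positions, and
# any metadata filters.  A purely hypothetical sketch for orientation, not the
# actual helper:
def _example_make_link(query, method=None, arg=0, start=0, end=0, **metadata):
    import urllib
    params = [("query", query)]
    if method:
        params.append(("query_method", method))
    if arg:
        params.append(("query_arg", arg))
    if start:
        params.append(("q_start", start))
    if end:
        params.append(("q_end", end))
    params.extend(sorted(metadata.items()))
    return "./?" + urllib.urlencode(params)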