# Assumed module-level imports for the WSGI services below (this excerpt starts
# at the function definitions): the standard-library modules are certain from
# the calls made here, while PhiloDB, shlaxtree, DirtyFormatter and the helper
# functions (make_link, hit_to_link, format_stream, format_object,
# get_raw_context, transform) are expected to come from the PhiloLogic package
# or from earlier in this module.
import json
import re
import sys
import time
import urllib
import urlparse
from wsgiref.util import shift_path_info

def freq_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"], keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    qs = environ["parsed_params"]["query"][0]
    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 2000
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    q_title = environ["parsed_params"].get("title", [""])[0] or ""
    q_author = environ["parsed_params"].get("author", [""])[0] or ""
    field = environ["parsed_params"].get("field", [""])[0] or "author"  # need a field param. author default?
    status = "running query for %s @ %s: " % (qs, dbname)
    metadata = {}
    if q_author:
        metadata["author"] = q_author
    if q_title:
        metadata["title"] = q_title
    content = ""
    q = db.query(qs, **metadata)
    # poll the query until enough hits have accumulated or it finishes
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()
    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. aggregating." % (l)
    last_doc = -1
    authors = {}
    titles = {}
    yield "<html>"
    yield "<head>"
    yield "<title>%s: frequency table for \"%s\"</title>" % (dbname, qs)
    # opensearch metadata in meta tags here.
    # total length, start index, hits per page.
    yield "</head>\n"
    yield "<body>\n"
    yield "<p class='description'>%s</p>\n" % status
    if l > 0:
        for hit in q:  # need to page q
            byte_offset = hit[6]
            offsets = list(hit[6:])
            offsets.reverse()
            doc_id = hit[0]
            if doc_id > last_doc:
                # only re-read document metadata when we move to a new document
                metadata = db.toms[doc_id]
                author = metadata["author"]
                title = metadata["title"]
                last_doc = doc_id
            if author in authors:
                authors[author] += 1
            else:
                authors[author] = 1
            if title in titles:
                titles[title] += 1
            else:
                titles[title] = 1
        yield "<table class='philologic_frequency tablesorter' title='author'>\n"
        yield "<thead><tr class='philologic_frequency_header_row'><th>Author</th><th>Frequency</th></tr></thead>\n<tbody>"
        for n, f in sorted(authors.items(), key=lambda x: x[1], reverse=True):
            url = "./?query=%s&author=%s&title=%s" % (qs, n, q_title)
            yield " <tr class='philologic_frequency_row'>\n"
            yield " <td class='philologic_frequency_key'><a href='%s'>%s</a></td>\n" % (urllib.quote_plus(url, "/?&="), n)  # UGLY ENCODING HACK
            yield " <td class='philologic_frequency_value'>%s</td>\n" % f
            yield " </tr>\n"
        yield "</tbody></table>\n"
        # shouldn't dump more than necessary. go by field.
        # should also have links to more pages of table.
        # yield("<table class='philologic_frequency' title='title'>\n")
        # for n,f in sorted(titles.items(),key=lambda x:x[1], reverse=True):
        #     #url = "./?query=%s&author=%s&title=%s" % (qs,n,q_title)
        #     yield(" <tr class='philologic_frequency_row'>\n")
        #     yield(" <td class='philologic_frequency_key'><a href='%s'>%s</a></td>\n" % (url,n))
        #     yield(" <td class='philologic_frequency_value'>%s</td>\n" % f)
        #     yield(" </tr>\n")
        # yield("</table>\n")
    yield "</body>"
    yield "</html>"

def freq_json_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'application/json; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"], keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    metadata_fields = ["author", "title", "date"]
    query_metadata = {}
    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method", [0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg", [0])[0] or 0
    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 1999
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]
    field = environ["parsed_params"].get("field", [""])[0] or "author"  # need a field param. author default?
    status = "running query for %s @ %s: " % (qs, dbname)
    content = ""
    q = db.query(qs, query_method, query_arg, **query_metadata)
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()
    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. aggregating." % (l)
    last_doc = -1
    authors = {}
    titles = {}
    counts = {}
    totalcounts = {}
    decades = {}
    # opensearch metadata in meta tags here.
    # total length, start index, hits per page.
    if l > 0:
        for hit in q[q_start - 1:q_end]:  # need to page q
            byte_offset = hit[6]
            offsets = list(hit[6:])
            offsets.reverse()
            doc_id = hit[0]
            if doc_id > last_doc:
                metadata = db.toms[doc_id]
                label = metadata[field]
                if field == "date":
                    date = label
                    label = "%s0 - %s9" % (date[:-1], date[:-1])
                    decades[label] = date[:-1] + "%"
            counts[label] = counts.get(label, 0) + 1
            if doc_id != last_doc:
                count = metadata["word_count"]
    # build the result list even when there are no hits, so the JSON envelope
    # below is always well-formed.
    result = []
    filter_metadata = dict(query_metadata.items())
    for n, f in sorted(counts.items(), key=lambda x: x[1], reverse=True):
        if field == "date":
            total = sum(int(ob["word_count"]) for ob in db.toms.dbh.execute("select word_count from toms where date like '%s';" % decades[n]))
            filter_metadata[field] = decades[n]
        else:
            total = sum(int(ob["word_count"]) for ob in db.toms.dbh.execute("select word_count from toms where %s=?;" % field, (n,)))
            filter_metadata[field] = n
        rate = float(f) / total
        url = make_link(qs, query_method, query_arg, **filter_metadata)
        result.append({"label": n, "count": f, "total_count": total, "rate": rate, "url": url})
    pages = []
    page_width = q_end - q_start + 1
    page_metadata = dict(query_metadata.items())
    page_metadata["report"] = "frequency"
    page_metadata["field"] = field
    for p_start in range(q_end + 1, l, page_width):
        p_end = min(l, p_start + page_width - 1)
        p_url = make_link(qs, query_method, query_arg, start=p_start, end=p_end, **page_metadata)
        pages.append(p_url)
    wrapper = {"length": l, "remaining_pages": pages, "result": result, "q_start": q_start, "q_end": q_end, "field": field}
    yield json.dumps(wrapper, indent=1)

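# A minimal client-side sketch (an assumption, not part of the original module)
# showing how the JSON envelope produced by freq_json_service could be consumed.
# The base URL, database name, and search term are hypothetical; the keys read
# here ("length", "result", "label", "count", "rate", "remaining_pages") match
# the wrapper built above.
def print_frequency_table(base_url="http://localhost/philologic/freq"):
    import urllib
    import urllib2
    import json
    params = urllib.urlencode({"philologic_dbname": "mydb",  # hypothetical database name
                               "query": "liberte",           # hypothetical search term
                               "field": "author"})
    response = urllib2.urlopen(base_url + "?" + params)
    wrapper = json.load(response)
    print "%d hits total" % wrapper["length"]
    for row in wrapper["result"]:
        print "%s\t%d\t%f" % (row["label"], row["count"], row["rate"])
    # further pages, if any, come back as pre-built links:
    for page_url in wrapper["remaining_pages"]:
        print "more:", page_url
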
def colloc_filter_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"], keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    qs = environ["parsed_params"]["query"][0]
    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 49
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    query_method = environ["parsed_params"].get("query_method", [0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg", [0])[0] or 0
    q_title = environ["parsed_params"].get("title", [""])[0] or ""
    q_author = environ["parsed_params"].get("author", [""])[0] or ""
    filter_word = environ["parsed_params"]["colloc_filter"][0]
    status = "running query for %s @ %s: " % (qs, dbname)
    metadata = {}
    if q_author:
        metadata["author"] = q_author
    if q_title:
        metadata["title"] = q_title
    metadata_fields = ["author", "title", "date"]
    query_metadata = {}
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]
    content = ""
    q = db.query(qs, query_method, query_arg, **query_metadata)
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()
    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. filtering for '%s'." % (l, filter_word)
    last_doc = -1
    collocates = {}
    text_file = None
    yield "<html>\n"
    yield "<head>"
    yield "<title>%s: collocation results %d to %d for \"%s\" filtered by \"%s\"</title>" % (dbname, q_start, q_end, qs, filter_word)
    # opensearch metadata in meta tags here.
    # total length, start index, hits per page.
    yield "</head>\n"
    yield "<body>\n"
    yield "<div class='philologic_concordance'>"
    yield "<p class='description'>%s</p>\n" % status
    if l > 0:
        f_count = 0
        yield "<ol start='%d'>\n" % q_start
        for hit in q:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.sort()
            first_offset = offsets[0]
            filename = db.toms[doc_id]["filename"]
            author = db.toms[doc_id]["author"]
            title = db.toms[doc_id]["title"]
            date = db.toms[doc_id]["date"]
            url_comps = [str(doc_id)]
            for id_comp in hit[1:]:
                id_comp = str(id_comp)
                if url_comps + [id_comp] in db.toms:
                    url_comps += [id_comp]
            url = "/".join(["."] + url_comps)
            conc_start = first_offset - 100
            if conc_start < 0:
                conc_start = 0
            if doc_id > last_doc:
                filename = db.toms[doc_id]["filename"]
                last_doc = doc_id
                text_path = dbpath + "/TEXT/" + filename
                text_file = open(text_path)
            text_file.seek(conc_start)
            text = text_file.read(width * 2)
            # trim the text
            need_l_trim = re.search("^[^<]*>", text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0
            need_r_trim = re.search("<[^>]*$", text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0
            conc_start += l_trim_off
            if filter_word in [t.group(2) for t in list(re.finditer(r"(<[^>]+>)|(\w+)", text))[1:-1]]:
                # then we have a virtual "hit":
                f_count += 1
                if f_count < q_start:
                    pass
                elif f_count > q_end:
                    break
                else:
                    yield "<li class='philologic_occurence'>"
                    yield "<a href='%s' class='philologic_cite'>" % url  # should be a link
                    yield "<span class='philologic_property' title='author'>%s</span>, " % author
                    yield "<span class='philologic_property' title='title'>%s</span>: " % title
                    yield "<span class='philologic_property' title='date'>(%s)</span>" % date
                    yield "</a>\n"
                    p_start = conc_start - len("<div class='philologic_context'>...") + l_trim_off
                    parser = shlaxtree.TokenizingParser(p_start, offsets)
                    parser.feed("<div class='philologic_context'>..." + text + "...</div>\n")
                    tree = parser.close()
                    transform(tree)
                    context = shlaxtree.ElementTree.tostring(tree, encoding='utf-8')
                    yield context
                    yield "</li>\n"
        yield "</ol>"
        query_metadata["colloc_filter"] = filter_word
        next_url = make_link(qs, query_method, query_arg, q_end + 1, q_end + 50, **query_metadata)
        yield "<a href='%s'>more</a>" % next_url
    yield "</div>"
    yield "</body>"
    yield "</html>"

def colloc_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"], keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    qs = environ["parsed_params"]["query"][0]
    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 0
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 50
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    q_title = environ["parsed_params"].get("title", [""])[0] or ""
    q_author = environ["parsed_params"].get("author", [""])[0] or ""
    status = "running query for %s @ %s: " % (qs, dbname)
    metadata = {}
    if q_author:
        metadata["author"] = q_author
    if q_title:
        metadata["title"] = q_title
    content = ""
    q = db.query(qs, **metadata)
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()
    l = len(q)
    if q_end > l:
        q_end = l
    status += "%d total hits. aggregating." % (l)
    last_doc = -1
    collocates = {}
    text_file = None
    yield "<html>\n"
    yield "<head>"
    yield "<title>%s: collocation table for \"%s\"</title>" % (dbname, qs)
    # opensearch metadata in meta tags here.
    # total length, start index, hits per page.
    yield "</head>\n"
    yield "<body>\n"
    yield "<p class='description'>%s</p>\n" % status
    if l > 0:
        for hit in q:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            first_offset = offsets[0]
            conc_start = first_offset - 100
            if conc_start < 0:
                conc_start = 0
            if doc_id > last_doc:
                filename = db.toms[doc_id]["filename"]
                last_doc = doc_id
                text_path = dbpath + "/TEXT/" + filename
                text_file = open(text_path)
            text_file.seek(conc_start)
            text = text_file.read(width * 2)
            # trim the text
            need_l_trim = re.search("^[^<]*>", text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0
            need_r_trim = re.search("<[^>]*$", text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0
            conc_start += l_trim_off
            for token in list(re.finditer(r"(<[^>]+>)|(\w+)", text))[1:-1]:
                if token.group(2):
                    t_off = token.start(2) + conc_start
                    if t_off not in offsets:
                        t_type = token.group(2)
                        if t_type in collocates:
                            collocates[t_type] += 1
                        else:
                            collocates[t_type] = 1
        yield "<table class='philologic_collocation'>\n"
        for n, f in sorted(collocates.items(), key=lambda x: x[1], reverse=True):
            url = "./?query=%s&author=%s&title=%s&colloc_filter=%s" % (qs, q_author, q_title, n)
            yield " <tr class='philologic_collocation_row'>\n"
            yield " <td class='philologic_collocation_key'><a href='%s'>%s</a></td>\n" % (url, n)
            yield " <td class='philologic_collocation_value'>%s</td>\n" % f
            yield " </tr>\n"
        yield "</table>\n"
    yield "</body>"
    yield "</html>"

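# Small illustration (an addition, not part of the original code) of the
# tokenizing regex the collocation services rely on: group 1 matches markup
# tags, group 2 matches word tokens, and the [1:-1] slice used above drops the
# first and last matches, which may be cut off at the edges of the window read
# from disk.  The sample string is hypothetical.
def _demo_collocate_tokenizer():
    sample = "onheur <div type='chapter'> le bonheur des uns fait le malheur des autr"
    tokens = list(re.finditer(r"(<[^>]+>)|(\w+)", sample))
    # keep only word tokens, skipping the possibly-truncated edge matches
    words = [t.group(2) for t in tokens[1:-1] if t.group(2)]
    # -> ['le', 'bonheur', 'des', 'uns', 'fait', 'le', 'malheur', 'des']
    return words
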
def conc_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"], keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method", [0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg", [0])[0] or 0
    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 1  # better checking. this doesn't catch 0...which helps for now.
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 49  # fine. python lists are exclusive at the end.
    width = int(environ["parsed_params"].get("width", [0])[0]) or 400
    status = "running query for %s @ %s with arg %s" % (qs, dbname, query_arg)
    metadata_fields = db.locals["metadata_fields"]
    query_metadata = {}
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]
    content = ""
    q = db.query(qs, query_method, query_arg, **query_metadata)
    while not q.done:
        time.sleep(.05)
        q.update()
    l = len(q)
    if q_end > l:
        q_end = l
    status += ". displaying %d - %d of %d hits." % (q_start, q_end, l)
    yield "<html>"
    yield "<head>"
    yield "<title>%s: search for %s</title>" % (dbname, qs)
    # opensearch metadata in meta tags here.
    # total length, start index, hits per page.
    yield "</head>"
    yield "<body>"
    yield "<div class='philologic_concordance'>"
    yield "<p class='description'>%s</p>" % status
    if l > 0:
        yield "<ol start='%d'>" % q_start
        for hit in q[q_start - 1:q_end]:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.sort()
            first_offset = offsets[0]
            doc = db.toms[doc_id]
            div1 = db.toms[hit[:2]]
            div2 = db.toms[hit[:3]]
            filename = doc["filename"]
            url = hit_to_link(db, hit)
            yield "<li class='philologic_occurence'>\n"
            yield db.locals["make_cite"](db, hit, url)
            yield "</a>\n"
            conc_start = first_offset - width
            if conc_start < 0:
                conc_start = 0
            text_path = dbpath + "/TEXT/" + filename
            text_file = open(text_path)
            text_file.seek(conc_start)
            text = text_file.read(width * 2)
            context = format_stream(text, conc_start, offsets)
            yield context
            yield "</li>\n"
        yield "</ol>"
    pages = l / 50 + 1
    more = ""
    yield "<div class='more'>"
    prev_off = 1
    next_off = min(prev_off + 49, l)
    p_count = 0
    while True:
        new_uri = make_link(qs, query_method, query_arg, start=prev_off, end=next_off, **query_metadata)
        yield "<a href='%s'>%d-%d</a> " % (new_uri, prev_off, next_off)
        if prev_off < q_start:
            yield "... "
            prev_off = q_start + 50
        else:
            prev_off += 50
        next_off = min(prev_off + 49, l)
        p_count += 1
        if p_count > 10:
            break
        if next_off == l:
            break
    last_page = 50 * (l // 50) + 1
    if prev_off <= last_page:
        if prev_off < last_page:
            yield "... "
        prev_off = last_page
        next_off = l
        new_uri = make_link(qs, query_method, query_arg, start=prev_off, end=next_off, **query_metadata)
        yield "<a href='%s'>%d-%d</a> " % (new_uri, prev_off, next_off)
    yield "</div>"
    yield "</div>"
    yield "</body>"
    yield "</html>"

def simple_dispatch(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"], keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    # yield repr(environ)
    dbname = shift_path_info(environ)
    myname = environ["SCRIPT_NAME"]
    if not dbname:
        return
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    toms = db.toms
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    if ("meta1field" in environ["parsed_params"] and environ["parsed_params"]["meta1field"][0]
            and "meta1" in environ["parsed_params"] and environ["parsed_params"]["meta1"][0]):
        c_field = environ["parsed_params"]["meta1field"][0]
        c_value = environ["parsed_params"]["meta1"][0]
        meta_dict = {c_field: c_value}
    else:
        meta_dict = {}
    if "query" in environ["parsed_params"]:
        qs = environ["parsed_params"]["query"][0]
        q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 0
        q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 50
        f = DirtyFormatter.Formatter({})
        yield "<html><body><div class='conc_response'>running query for '%s'<br/>" % qs
        print >> environ["wsgi.errors"], str(corpus_file)
        q = db.query(qs, **meta_dict)
        while len(q) <= q_end and not q.done:
            time.sleep(.05)
            q.update()
        l = len(q)
        yield "<div class='status' n=%d%s>%d hits</div>" % (l, " done='true'" if q.done else "", l)
        yield "<div class='results'>"
        for hit in q[q_start:q_end]:
            context_file = ""
            context_file_end = 0
            rstr = ""
            yield "<div class='hit' id='%s' offset=%d>" % ("/".join(str(x) for x in hit[:6]), hit[6])
            # get the metadata for all unique parents of the hit.
            for i in range(1, len(hit)):
                hit_parent = hit[:i]
                if hit[i - 1] == 0:
                    continue
                if hit_parent in toms:
                    parent = toms[hit_parent]
                    rstr += format_object(parent, "")
                    if parent["filename"]:
                        context_file = parent["filename"]
                        context_file_end = parent["end"]
            yield "<span class='cite'>" + rstr + "</span>"
            path = dbpath + "/TEXT/" + context_file
            offset = hit[6]  # dangerous...
            (left, word, right) = get_raw_context(path, offset, context_file_end, 250)
            (left, word, right) = [f.format(x) for x in (left, word, right)]
            content = "<span class='left'>%s</span> <span class='word'>%s</span> <span class='right'>%s</span>" % (left, word, right)
            yield "<span class='content'>%s</span>" % content
            yield "</div>"
            count += 1
        yield "</div>%d hits" % count
        yield "</div></body></html>"

def conc_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'text/html; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"], keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method", [0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg", [0])[0] or 0
    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 1  # better checking. this doesn't catch 0...which helps for now.
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 49  # fine. python lists are exclusive at the end.
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    status = "running query for %s @ %s with arg %s" % (qs, dbname, query_arg)
    metadata_fields = ["author", "title", "date"]
    query_metadata = {}
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]
    content = ""
    q = db.query(qs, query_method, query_arg, **query_metadata)
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()
    l = len(q)
    if q_end > l:
        q_end = l
    status += ". displaying %d - %d of %d hits." % (q_start, q_end, l)
    yield "<html>"
    yield "<head>"
    yield "<title>%s: search for %s</title>" % (dbname, qs)
    # opensearch metadata in meta tags here.
    # total length, start index, hits per page.
    yield "</head>"
    yield "<body>"
    yield "<div class='philologic_concordance'>"
    yield "<p class='description'>%s</p>" % status
    if l > 0:
        yield "<ol start='%d'>" % q_start
        for hit in q[q_start - 1:q_end]:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.sort()
            first_offset = offsets[0]
            filename = db.toms[doc_id]["filename"]
            author = db.toms[doc_id]["author"]
            title = db.toms[doc_id]["title"]
            date = db.toms[doc_id]["date"]
            url = hit_to_link(db, hit)
            yield "<li class='philologic_occurence'>\n"
            yield "<a class='philologic_cite' href='%s'>" % url
            yield "<span class='philologic_property' title='author'>%s</span>, " % author
            yield "<span class='philologic_property' title='title'>%s</span> " % title
            yield "<span class='philologic_property' title='date'>(%s)</span>" % date
            yield "</a>\n"
            conc_start = first_offset - width
            if conc_start < 0:
                conc_start = 0
            text_path = dbpath + "/TEXT/" + filename
            text_file = open(text_path)
            text_file.seek(conc_start)
            text = text_file.read(width * 2)
            # trim the text
            need_l_trim = re.search("^[^<]*>", text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0
            need_r_trim = re.search("<[^>]*$", text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0
            p_start = conc_start - len("<div class='philologic_context'>...") + l_trim_off
            parser = shlaxtree.TokenizingParser(p_start, offsets)
            parser.feed("<div class='philologic_context'>..." + text + "...</div>")
            tree = parser.close()
            transform(tree)
            context = shlaxtree.ElementTree.tostring(tree, encoding='utf-8')
            yield context
            yield "</li>\n"
        yield "</ol>"
    pages = l / 50 + 1
    more = ""
    yield "<div class='more'>"
    prev_off = 1
    next_off = min(prev_off + 49, l)
    p_count = 0
    while True:
        new_uri = make_link(qs, query_method, query_arg, start=prev_off, end=next_off, **query_metadata)
        yield "<a href='%s'>%d-%d</a> " % (new_uri, prev_off, next_off)
        if prev_off < q_start:
            yield "... "
            prev_off = q_start + 50
        else:
            prev_off += 50
        next_off = min(prev_off + 49, l)
        p_count += 1
        if p_count > 10:
            break
        if next_off == l:
            break
    last_page = 50 * (l // 50) + 1
    if prev_off <= last_page:
        if prev_off < last_page:
            yield "... "
        prev_off = last_page
        next_off = l
        new_uri = make_link(qs, query_method, query_arg, start=prev_off, end=next_off, **query_metadata)
        yield "<a href='%s'>%d-%d</a> " % (new_uri, prev_off, next_off)
    yield "</div>"
    yield "</div>"
    yield "</body>"
    yield "</html>"

def colloc_json_service(environ, start_response):
    status = '200 OK'  # HTTP Status
    headers = [('Content-type', 'application/json; charset=UTF-8')]  # HTTP Headers
    start_response(status, headers)
    environ["parsed_params"] = urlparse.parse_qs(environ["QUERY_STRING"], keep_blank_values=True)
    # a wsgi app is supposed to return an iterable;
    # yielding lets you stream, rather than generate everything at once.
    if "philologic_dbname" in environ:
        dbname = environ["philologic_dbname"]
    else:
        dbname = environ["parsed_params"]["philologic_dbname"][0]
    myname = environ["SCRIPT_NAME"]
    dbpath = "/var/lib/philologic/databases/" + dbname
    db = PhiloDB(dbpath, 7)
    obj = []
    count = 0
    corpus_file = None
    corpus_size = 7
    corpus_count = 0
    qs = environ["parsed_params"]["query"][0]
    query_method = environ["parsed_params"].get("query_method", [0])[0] or None
    query_arg = environ["parsed_params"].get("query_arg", [0])[0] or 0
    q_start = int(environ["parsed_params"].get('q_start', [0])[0]) or 1
    q_end = int(environ["parsed_params"].get('q_end', [0])[0]) or q_start + 499
    width = int(environ["parsed_params"].get("width", [0])[0]) or 100
    status = "running query for %s @ %s: " % (qs, dbname)
    metadata_fields = ["author", "title", "date"]
    query_metadata = {}
    for meta_f in metadata_fields:
        if meta_f in environ["parsed_params"]:
            query_metadata[meta_f] = environ["parsed_params"][meta_f][0]
    print >> sys.stderr, query_metadata
    content = ""
    q = db.query(qs, query_method, query_arg, **query_metadata)
    while len(q) <= q_end and not q.done:
        time.sleep(.05)
        q.update()
    l = len(q)
    if q_end > l:
        q_end = l
    print >> sys.stderr, "%d total hits. aggregating." % (l)
    last_doc = -1
    collocates = {}
    text_file = None
    if l > 0:
        for hit in q[q_start - 1:q_end]:
            doc_id = q.get_doc(hit)
            offsets = q.get_bytes(hit)
            offsets.reverse()
            first_offset = offsets[0]
            conc_start = first_offset - 100
            if conc_start < 0:
                conc_start = 0
            if doc_id > last_doc:
                filename = db.toms[doc_id]["filename"]
                last_doc = doc_id
                text_path = dbpath + "/TEXT/" + filename
                text_file = open(text_path)
            text_file.seek(conc_start)
            text = text_file.read(width * 2)
            # trim the text
            need_l_trim = re.search("^[^<]*>", text)
            if need_l_trim:
                l_trim_off = need_l_trim.end(0)
                text = text[l_trim_off:]
            else:
                l_trim_off = 0
            need_r_trim = re.search("<[^>]*$", text)
            if need_r_trim:
                r_trim_off = need_r_trim.start(0)
                text = text[:r_trim_off]
            else:
                r_trim_off = 0
            conc_start += l_trim_off
            for token in list(re.finditer(r"(<[^>]+>)|(\w+)", text))[1:-1]:
                if token.group(2):
                    t_off = token.start(2) + conc_start
                    if t_off not in offsets:
                        t_type = token.group(2)
                        if t_type in collocates:
                            collocates[t_type] += 1
                        else:
                            collocates[t_type] = 1
    # the result list and totals must exist even when there are no hits, so the
    # JSON envelope below is always well-formed.
    results = []
    total_words = 0
    page_width = q_end - q_start + 1
    for n, f in sorted(collocates.items(), key=lambda x: x[1], reverse=True):
        if f > 5:  # UGLY!!!
            filter_metadata = dict(query_metadata.items())
            filter_metadata["colloc_filter"] = n
            url = make_link(qs, query_method, query_arg, **filter_metadata)
            results.append({"label": n, "count": f, "url": url, "rate": float(f) / l})
        total_words += f
    pages = []
    page_width = q_end - q_start + 1
    for p_start in range(q_end + 1, l, page_width):
        page_metadata = dict(query_metadata.items())
        page_metadata["field"] = "collocates"
        page_metadata["format"] = "json"
        page_metadata["report"] = "frequency"
        p_end = min(l, p_start + page_width - 1)
        p_url = make_link(qs, query_method, query_arg, start=p_start, end=p_end, **page_metadata)
        pages.append(p_url)
    wrapper = {"result": results, "remaining_pages": pages, "length": l, "q_start": q_start, "q_end": q_end, "total_words": total_words, "field": "word"}
    yield json.dumps(wrapper, indent=1)

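# A minimal sketch (an assumption, not part of the original module) of how one
# of these WSGI callables could be served for local testing with the standard
# library's wsgiref server.  The port and the hard-wired database name are
# hypothetical; in production these apps would normally be mounted behind a
# full WSGI host such as Apache/mod_wsgi.
if __name__ == "__main__":
    from wsgiref.simple_server import make_server

    def _test_app(environ, start_response):
        # pin the database name so conc_service doesn't need it in the query string
        environ["philologic_dbname"] = "mydb"  # hypothetical database
        return conc_service(environ, start_response)

    httpd = make_server("127.0.0.1", 8000, _test_app)
    print "serving on http://127.0.0.1:8000/?query=..."
    httpd.serve_forever()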