import array
import cStringIO
import md5
import re
import sys
import time
import traceback
import zlib

# Project-local modules: ainodex/ainopy are the index bindings, erlay is
# the Erlang bridge, make_query parses query strings. The netstring
# helpers frame messages on the wire; the module name used here is an
# assumption -- the original imports decode_netstring_fd and
# encode_netstring_fd from a project-local module.
import ainodex
import ainopy
import erlay
import make_query
from netstring import decode_netstring_fd, encode_netstring_fd

# Constants (LAYERS, MIN_SCORE, SNIP_MAX_LEN, NORMTABLE_UPDATE, NAME,
# IBLOCK), the compiled regexes (cand_re, cue_re, nice_re, site_re) and
# the remaining globals and helpers used below (normtable, normtables,
# doc_idx, doc_db, filter_hits, url2site, read_sites, InvalidHit, ...)
# are defined elsewhere in this module.


def render(msg):
    msg = cStringIO.StringIO(msg)
    # The message carries three concatenated netstring dicts: the parsed
    # query, the score layers and the ranked hits.
    query_msg = decode_netstring_fd(msg)
    score_msg = decode_netstring_fd(msg)
    rank_msg = decode_netstring_fd(msg)
    site_check = True
    show_score = False
    try:
        query = make_query.parse_qs(query_msg['query'])
        if 'offs' in query:
            offs = int(query['offs'])
        else:
            offs = 0
        query_q = query['q']
        if query_msg['mods']:
            if query_msg['mods'].startswith("site"):
                site_check = False
            elif query_msg['mods'].startswith("score"):
                show_score = True
    except Exception, x:
        erlay.report("Invalid or missing query string: %s" %
                     query_msg['query'])
        traceback.print_exc()
        offs = 0
        query_q = ""
def update_normtable():
    global normtable, last_normtable_update, normtables_need_update
    normtables_need_update = False
    erlay.report("Updating normtable")
    # Concatenate the per-iblock normtables and convert the result into
    # a judy-based lookup structure.
    normtable = ainodex.normtable_to_judy(
        "".join(normtables.itervalues()))
    last_normtable_update = time.time()
def score(msg):
    ret = {}
    msg = decode_netstring_fd(cStringIO.StringIO(msg))
    #cueset_size, cues = ainodex.expand_cueset(
    #        map(int, msg['cues'].split()))
    cueset_size, cues = ainodex.hits(map(int, msg['cues'].split()), 0)
    cueset_size, cues = filter_hits(cueset_size, cues,
                                    prior_check=True, site_check=True)
    ret['cueset_size'] = str(cueset_size)
    # Score only the layers whose frequency cap is of a comparable
    # magnitude to the cueset size (min/max ratio above MIN_SCORE).
    ok_layers = [i for i, maxf in enumerate(LAYERS)
                 if min(maxf, cueset_size) /
                    float(max(maxf, cueset_size)) > MIN_SCORE]
    # The extra layer index one past LAYERS is scored whenever the
    # largest listed layer qualifies.
    if len(LAYERS) - 1 in ok_layers:
        ok_layers.append(len(LAYERS))
    print "OK", ok_layers, "CUES", cueset_size
    t = time.time()
    for i in ok_layers:
        layer = ainodex.new_layer(i, cues)
        ret[str(i)] = ainodex.serialize_layer(layer)
    erlay.report("Scoring <%s> took %dms" %
                 (msg['cues'], (time.time() - t) * 1000.0))
    return encode_netstring_fd(ret)
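# The layer-selection test in score() compares each layer's frequency cap
# to the cueset size as a min/max ratio in (0, 1]. A minimal, runnable
# illustration with hypothetical LAYERS and MIN_SCORE values (the real
# constants are defined elsewhere in this module):
def _example_layer_selection():
    demo_layers = [100, 1000, 10000]    # hypothetical per-layer caps
    demo_min_score = 0.1                # hypothetical threshold
    cueset_size = 500
    ok = [i for i, maxf in enumerate(demo_layers)
          if min(maxf, cueset_size) /
             float(max(maxf, cueset_size)) > demo_min_score]
    # Ratios are 0.2, 0.5 and 0.05, so only layers 0 and 1 qualify.
    assert ok == [0, 1]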
def make_snippet(dockey, query, layers, seen_sites, seen_md5s):
    #erlay.report("QUERY (%s) <%s> <%s>" % (type(query), query, dockey))
    # Look up the document's offset and size, then read and decompress it.
    offs, size = map(int, doc_idx[dockey].split())
    doc_db.seek(offs)
    url, title, doc = zlib.decompress(doc_db.read(size)).split('\0')
    title = title.decode("iso-8859-1")
    doc = doc.decode("iso-8859-1")
    # Show at most one hit per site.
    site = url2site(url)
    if site in seen_sites:
        raise InvalidHit
    seen_sites[site] = True
    # Drop exact duplicates by content digest.
    dig = md5.md5(doc.encode("iso-8859-1", "ignore")).digest()
    if dig in seen_md5s:
        raise InvalidHit
    seen_md5s[dig] = True
    title = title.strip()
    if len(title) > 50 or len(title) == 0:
        title = title[:50] + "..."
    keys = filter(lambda x: x,
                  [re.sub("(?u)\W", "", key) for key in query.split()])
    erlay.report("KEYS %s" % keys)
    # Strip markup remnants before picking snippet candidates.
    unic = re.sub("\\.*?\]", "", doc)
    unic = re.sub("[\[\]\|]", "", unic)
    candidates = re.findall(cand_re, unic)
    # Score each candidate sentence: one point per query key it
    # contains, plus the ixeme scores from the active layers.
    scores = []
    for sentence in candidates:
        txt = re.sub("(?u)\W", " ", sentence)
        txt = txt.encode("iso-8859-1")
        xidbag = filter(lambda x: x,
                        map(ainodex.token2ixeme, txt.split()))
        s = sum(1.0 for key in keys
                if re.search("(?ui)\W%s\W" % key, txt))
        s += sum(ainodex.score_list(xidbag, layer)
                 for layer in layers if layer)
        scores.append((s, sentence))
    scores.sort(reverse=True)
    # Concatenate the best sentences, bolding the query keys, until the
    # snippet budget is used up.
    snip = ""
    for score, sentence in scores:
        if len(snip) + len(sentence) > SNIP_MAX_LEN:
            break
        txt = sentence[1:].strip()
        for key in keys:
            txt = re.sub("(?iu)(\W)(%s)(\W)" % key,
                         r"\1<b>\2</b>\3", txt)
        snip += txt + ".. "
    if len(snip) > SNIP_MAX_LEN:
        snip = snip[:SNIP_MAX_LEN] + "..."
    return title, url, snip
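# The key-bolding substitution in make_snippet() only fires when the key
# is delimited by non-word characters on both sides, so keys at the very
# start or end of a sentence are left unmarked. A small, self-contained
# illustration (hypothetical input, not project data):
def _example_bold_keys():
    txt = " the quick brown fox jumps "
    for key in ["fox", "quick"]:
        txt = re.sub("(?iu)(\W)(%s)(\W)" % key,
                     r"\1<b>\2</b>\3", txt)
    assert txt == " the <b>quick</b> brown <b>fox</b> jumps "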
def merge_ranked(msg):
    msg = cStringIO.StringIO(msg)
    r = cStringIO.StringIO()
    num_hits = 0
    # Concatenate the per-iblock rankings until the stream runs out.
    while True:
        try:
            iblock_ranked = decode_netstring_fd(msg)
            r.write(iblock_ranked['ranked'])
            num_hits += int(iblock_ranked['num_hits'])
        except EOFError:
            break
    top = ainodex.merge_ranked(r.getvalue())
    # Every other uint32 of the first 20 values = the ten best keys.
    erlay.report("Top keys: %s" % array.array("I", top)[:20:2])
    return encode_netstring_fd({'merged': top, 'num_hits': str(num_hits)})
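# decode_netstring_fd/encode_netstring_fd are project-local helpers; the
# loop in merge_ranked() above relies on the decoder raising EOFError at
# the end of the stream. As a rough sketch of the assumed wire primitive
# -- djb netstrings, "<length>:<payload>," -- and NOT the project's
# actual implementation:
def _example_read_netstring(fd):
    head = ""
    ch = fd.read(1)
    if not ch:
        # end of stream, mirroring what the decode loops expect
        raise EOFError
    while ch != ":":
        head += ch
        ch = fd.read(1)
    data = fd.read(int(head))
    fd.read(1)  # consume the trailing comma
    return data

def _example_write_netstring(fd, data):
    # frame a single string as "<length>:<payload>,"
    fd.write("%d:%s," % (len(data), data))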
def parse_query(msg):
    if normtables_need_update and \
       time.time() - last_normtable_update > NORMTABLE_UPDATE:
        update_normtable()
    query = None
    try:
        query = make_query.parse_qs(msg)
        keys, cues, mods = make_query.make_query(query['q'])
        keys = " ".join(map(str, keys))
        cues = " ".join(map(str, cues))
        mods = " ".join(mods)
    except Exception, x:
        erlay.report("Query parsing failed: %s" % x)
        if query:
            erlay.report("Failed query: %s" % query)
        keys = ""
        cues = ""
        mods = ""
        query = ""
    erlay.report("Query string: <%s> Keys: <%s> Cues: <%s>" %
                 (query, keys, cues))
    return encode_netstring_fd({'keys': keys,
                                'cues': cues,
                                'mods': mods,
                                'query': msg})
def add_normtable(msg):
    global normtables_need_update
    c = time.time()
    msg = cStringIO.StringIO(msg)
    while True:
        try:
            iblock_normtable = decode_netstring_fd(msg)
        except EOFError:
            break
        normtables[iblock_normtable['iblock']] = \
            iblock_normtable['normtable']
    erlay.report("Decoding normtables took %dms" %
                 ((time.time() - c) * 1000.0))
    normtables_need_update = True
    if time.time() - last_normtable_update > NORMTABLE_UPDATE:
        update_normtable()
    erlay.report("Got normtables from iblocks <%s>" %
                 " ".join(normtables.keys()))
    return "ok"
def rank(msg):
    t = time.time()
    ret = {}
    msg = cStringIO.StringIO(msg)
    query_msg = decode_netstring_fd(msg)
    layer_msg = decode_netstring_fd(msg)
    erlay.report("Rank init took %dms" % ((time.time() - t) * 1000.0))
    print >> sys.stderr, "QUERY", query_msg
    # A "site:" modifier restricts results to a single site hash.
    if query_msg['mods'] and query_msg['mods'].startswith("site:"):
        ok_site = hash(query_msg['mods'][5:])
        print >> sys.stderr, "SHOW SITE", query_msg['mods'], ok_site
    else:
        ok_site = 0
    t = time.time()
    hits_len, hits = ainodex.hits(map(int, query_msg['keys'].split()), 0)
    ret['num_hits'] = str(hits_len)
    hits_len, hits = filter_hits(hits_len, hits,
                                 site_check=True, prior_check=True,
                                 show_site=ok_site)
    erlay.report("Hits took %dms" % ((time.time() - t) * 1000.0))
    print "HITS_LEN", hits_len
    t = time.time()
    layers = [None] * 10
    for layer_str in layer_msg.itervalues():
        ainodex.deserialize_layer(layer_str, layers)
    erlay.report("Deser took %dms" % ((time.time() - t) * 1000.0))
    #okkeys = map(lambda x: ainopy.did2key(ainopy.sid2doc(x)[0]),
    #             ainodex.hit_contents(hits))
    t = time.time()
    ret['ranked'] = ainodex.rank(hits, layers)
    print >> sys.stderr, "RANKED", array.array("I", ret["ranked"])[:20:2]
    #for key in array.array("I", ret["ranked"])[:20:2]:
    #    if key not in okkeys:
    #        print >> sys.stderr, "NOT IN OK", key
    print "LL", len(ret['ranked'])
    erlay.report("Ranking <%s><%s> took %dms" %
                 (query_msg['keys'], query_msg['cues'],
                  (time.time() - t) * 1000.0))
    return encode_netstring_fd(ret)
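# The 'ranked' buffer is assumed to pack interleaved (document key,
# score) pairs as uint32s, which is why the debug prints above take
# every other element of the first twenty values to get the top-ten
# keys. Illustration with made-up numbers:
def _example_ranked_slice():
    pairs = array.array("I", [7, 90, 3, 85, 9, 60])  # key, score, ...
    assert list(pairs[:20:2]) == [7, 3, 9]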
def merge_scores(msg):
    msg = cStringIO.StringIO(msg)
    layers = [None] * 10
    cueset_size = 0
    while True:
        try:
            iblock_layers = decode_netstring_fd(msg)
        except EOFError:
            break
        cueset_size += int(iblock_layers['cueset_size'])
        del iblock_layers['cueset_size']
        for layer_data in iblock_layers.itervalues():
            offs, layer_id, layer = \
                ainodex.deserialize_layer(layer_data, layers)
    # XXX: Since ixemes may be allocated to different layers on each
    # iblock, we must make sure that the ixeme counts match on every
    # layer. This could be avoided if ixemes were on the same layers on
    # all iblocks. This should be easy to fix.
    t = time.time()
    ainodex.sync_layers(layers)
    erlay.report("Syncing layers took %dms" %
                 ((time.time() - t) * 1000.0))
    print "CUE", type(cueset_size), cueset_size
    for layer in layers:
        if layer:
            ainodex.normalize_layer(layer, normtable, cueset_size)
    layers = [(str(i), ainodex.serialize_layer(layer))
              for i, layer in enumerate(layers) if layer]
    return encode_netstring_fd(dict(layers))
# render() continues inside the per-hit snippet loop: assemble up to
# ten results, skipping duplicate sites and documents.
            if not site_check:
                seen_sites = {}
            if valid_hits >= offs:
                site = url2site(url)
                results.append(tuple(map(
                    lambda x: x.encode("iso-8859-1"),
                    (title, url, site, snip))))
            valid_hits += 1
            if len(results) == 10:
                break
        except InvalidHit:
            continue
        except Exception, x:
            erlay.report("Make snippet failed ("
                         "query:<%s> dockey:<%d>)" % (query_q, dockey))
            traceback.print_exc()
    # Split the query into cue and key parts to build the base query.
    cues = cue_re.findall(query_q)
    keys = map(lambda x: nice_re.sub("", x).strip(),
               cue_re.split(site_re.sub("", query_q)))
    #erlay.report("CUES <%s>" % cues)
    #erlay.report("KEYS <%s>" % keys)
    if site_check:
        base_q = " ".join(keys) + " ".join(cues)
    else:
        base_q = ""
    #erlay.report("BASE <%s>" % base_q)
erlay.report("--- dex [%s/%d] starts ---" % (NAME, IBLOCK))
ainodex.open()
ainopy.open()
try:
    seg_hashes = map(int,
                     file("%s.%d.seghash" % (NAME, IBLOCK)).readlines())
    erlay.report("Segment hashes read. Duplicate checking enabled.")
except:
    erlay.report("Could not open segment hashes. "
                 "Duplicate checking disabled.")
    seg_hashes = None
try:
    sites = read_sites()
    erlay.report("Site hashes read. Site checking enabled.")
except:
    # Assumed fallback, mirroring the seg_hashes case above; the
    # original handler body is truncated in this excerpt.
    erlay.report("Could not read site hashes. Site checking disabled.")
    sites = None