def score(msg): ret = {} msg = decode_netstring_fd(cStringIO.StringIO(msg)) #cueset_size, cues = ainodex.expand_cueset( # map(int, msg['cues'].split())) cueset_size, cues = ainodex.hits(map(int, msg['cues'].split()), 0) cueset_size, cues = filter_hits(cueset_size, cues,\ prior_check = True, site_check = True) ret['cueset_size'] = str(cueset_size) ok_layers = [i for i, maxf in enumerate(LAYERS) if min(maxf, cueset_size) / float(max(maxf, cueset_size)) > MIN_SCORE] if len(LAYERS) - 1 in ok_layers: ok_layers.append(len(LAYERS)) print "OK", ok_layers, "CUES", cueset_size t = time.time() for i in ok_layers: layer = ainodex.new_layer(i, cues) ret[str(i)] = ainodex.serialize_layer(layer) erlay.report("Scoring <%s> took %dms" % (msg['cues'], (time.time() - t) * 1000.0)) return encode_netstring_fd(ret)
def score(msg): ret = {} msg = decode_netstring_fd(cStringIO.StringIO(msg)) #cueset_size, cues = ainodex.expand_cueset( # map(int, msg['cues'].split())) cueset_size, cues = ainodex.hits(map(int, msg['cues'].split()), 0) cueset_size, cues = filter_hits(cueset_size, cues,\ prior_check = True, site_check = True) ret['cueset_size'] = str(cueset_size) ok_layers = [ i for i, maxf in enumerate(LAYERS) if min(maxf, cueset_size) / float(max(maxf, cueset_size)) > MIN_SCORE ] if len(LAYERS) - 1 in ok_layers: ok_layers.append(len(LAYERS)) print "OK", ok_layers, "CUES", cueset_size t = time.time() for i in ok_layers: layer = ainodex.new_layer(i, cues) ret[str(i)] = ainodex.serialize_layer(layer) erlay.report("Scoring <%s> took %dms" % (msg['cues'], (time.time() - t) * 1000.0)) return encode_netstring_fd(ret)
def rank(msg): t = time.time() ret = {} msg = cStringIO.StringIO(msg) query_msg = decode_netstring_fd(msg) layer_msg = decode_netstring_fd(msg) erlay.report("Rank init took %dms" %\ ((time.time() - t) * 1000.0)) print >> sys.stderr, "QUERY", query_msg if query_msg['mods'] and query_msg['mods'].startswith("site:"): ok_site = hash(query_msg['mods'][5:]) print >> sys.stderr, "SHOW SITE", query_msg['mods'], ok_site else: ok_site = 0 t = time.time() hits_len, hits = ainodex.hits(map(int, query_msg['keys'].split()), 0) ret['num_hits'] = str(hits_len) hits_len, hits = filter_hits(hits_len, hits,\ site_check = True, prior_check=True, show_site=ok_site) erlay.report("Hits took %dms" %\ ((time.time() - t) * 1000.0)) print "HITS_LEN", hits_len t = time.time() layers = [None] * 10 for layer_str in layer_msg.itervalues(): ainodex.deserialize_layer(layer_str, layers) erlay.report("Deser took %dms" %\ ((time.time() - t) * 1000.0)) #kkeys = map(lambda x: ainopy.did2key(ainopy.sid2doc(x)[0]), ainodex.hit_contents(hits)) t = time.time() ret['ranked'] = ainodex.rank(hits, layers) print >> sys.stderr, "RANKED", array.array("I", ret["ranked"])[:20:2] #for key in array.array("I", ret["ranked"])[:20:2]: #if key not in okkeys: # print >> sys.stderr, "NOT IN OK", key print "LL", len(ret['ranked']) erlay.report("Ranking <%s><%s> took %dms" % (query_msg['keys'], query_msg['cues'], (time.time() - t) * 1000.0)) return encode_netstring_fd(ret)
import sys, array import ainodex ainodex.open() if len(sys.argv) < 2: print "Usage: simple [key] [cue]" sys.exit(1) keys = ainodex.token2ixeme(sys.argv[1]) cues = ainodex.token2ixeme(sys.argv[2]) print "KEYS", keys, "CUES", cues hits_len, hitset = ainodex.hits([keys], 0) cues_len, cueset = ainodex.hits([cues], 0) print "%s occurs in %d segments" % (sys.argv[1], hits_len) print "%s occurs in %d segments" % (sys.argv[2], cues_len) # Word frequencies normtable = ainodex.normtable_to_judy(ainodex.normtable()) # Compute how many times tokens co-occur with the cueset layers = [ainodex.new_layer(i, cueset) for i in range(10)] for layer in layers: # Compute token scores ainodex.normalize_layer(layer, normtable, cues_len) ranked = ainodex.rank(hitset, layers) doc_keys = array.array("I", ranked)[:20:2] doc_scores = array.array("f", ranked)[1:20:2]