def merge_scores(msg):
    """Merge per-iblock layer score data into a single serialized result.

    *msg* is a byte string containing a concatenation of netstring-encoded
    dicts (one per iblock).  Each decoded dict carries a 'cueset_size' entry
    plus one serialized layer per remaining key.  Layers are deserialized
    into a fixed bank of 10 slots, synced, normalized against the
    module-level ``normtable`` (assumed to be in scope -- set by the caller
    at module level), and re-serialized as a netstring-encoded dict keyed by
    layer index.

    NOTE(review): the exact semantics of ainodex.deserialize_layer /
    sync_layers / normalize_layer are defined outside this file -- the
    descriptions above are inferred from call shape; confirm against ainodex.
    """
    # Wrap the raw bytes in a file-like object so decode_netstring_fd can
    # consume one netstring record at a time.
    msg = cStringIO.StringIO(msg)
    # Fixed bank of 10 layer slots; deserialize_layer fills them in place.
    layers = [None] * 10
    cueset_size = 0
    while True:
        try:
            iblock_layers = decode_netstring_fd(msg)
        except EOFError:
            # End of the concatenated netstring stream -- all iblocks read.
            break
        # Accumulate the cueset size across iblocks, then drop the entry so
        # only layer payloads remain in the dict.
        cueset_size += int(iblock_layers['cueset_size'])
        del iblock_layers['cueset_size']
        for layer_data in iblock_layers.itervalues():
            # deserialize_layer mutates `layers`; offs/layer_id/layer are
            # returned but unused here.
            offs, layer_id, layer =\
                ainodex.deserialize_layer(
                    layer_data, layers)
    #XXX: Since ixemes are allocated on different layers on each layer,
    # we must make sure that the ixeme counts match on every layer. This
    # could be easily avoided if ixemes were on the same layers on all
    # iblocks. This should be easy to fix.
    # Time the sync step and report the duration in milliseconds.
    t = time.time()
    ainodex.sync_layers(layers)
    erlay.report("Syncing layers took %dms" %\
        ((time.time() - t) * 1000.0))
    print "CUE", type(cueset_size), cueset_size
    # Normalize only the slots that were actually populated.
    for layer in layers:
        if layer:
            ainodex.normalize_layer(layer, normtable, cueset_size)
    # Re-serialize populated layers keyed by their slot index (as strings,
    # since netstring dict keys are strings).
    layers = [(str(i), ainodex.serialize_layer(layer))
              for i, layer in enumerate(layers) if layer]
    return encode_netstring_fd(dict(layers))
ainodex.open() if len(sys.argv) < 2: print "Usage: simple [key] [cue]" sys.exit(1) keys = ainodex.token2ixeme(sys.argv[1]) cues = ainodex.token2ixeme(sys.argv[2]) print "KEYS", keys, "CUES", cues hits_len, hitset = ainodex.hits([keys], 0) cues_len, cueset = ainodex.hits([cues], 0) print "%s occurs in %d segments" % (sys.argv[1], hits_len) print "%s occurs in %d segments" % (sys.argv[2], cues_len) # Word frequencies normtable = ainodex.normtable_to_judy(ainodex.normtable()) # Compute how many times tokens co-occur with the cueset layers = [ainodex.new_layer(i, cueset) for i in range(10)] for layer in layers: # Compute token scores ainodex.normalize_layer(layer, normtable, cues_len) ranked = ainodex.rank(hitset, layers) doc_keys = array.array("I", ranked)[:20:2] doc_scores = array.array("f", ranked)[1:20:2] print zip(doc_keys, doc_scores)
ainodex.open() if len(sys.argv) < 2: print "Usage: simple [key] [cue]" sys.exit(1) keys = ainodex.token2ixeme(sys.argv[1]) cues = ainodex.token2ixeme(sys.argv[2]) print "KEYS", keys, "CUES", cues hits_len, hitset = ainodex.hits([keys], 0) cues_len, cueset = ainodex.hits([cues], 0) print "%s occurs in %d segments" % (sys.argv[1], hits_len) print "%s occurs in %d segments" % (sys.argv[2], cues_len) # Word frequencies normtable = ainodex.normtable_to_judy(ainodex.normtable()) # Compute how many times tokens co-occur with the cueset layers = [ainodex.new_layer(i, cueset) for i in range(10)] for layer in layers: # Compute token scores ainodex.normalize_layer(layer, normtable, cues_len) ranked = ainodex.rank(hitset, layers) doc_keys = array.array("I", ranked)[:20:2] doc_scores = array.array("f", ranked)[1:20:2] print zip(doc_keys, doc_scores)