Exemple #1
0
def render(msg):
    # Render search results for one query.
    # `msg` carries three back-to-back netstring-encoded dicts:
    # the parsed query, the scoring output and the ranking output.
    # NOTE(review): this definition looks truncated in this view --
    # score_msg, rank_msg and show_score are bound but never used below.
    msg = cStringIO.StringIO(msg)
    query_msg = decode_netstring_fd(msg)
    score_msg = decode_netstring_fd(msg)
    rank_msg = decode_netstring_fd(msg)

    # Defaults: collapse duplicate sites, do not show scores.
    site_check = True
    show_score = False

    try:
        query = make_query.parse_qs(query_msg['query'])
        # 'offs' is an optional paging offset into the result list.
        if 'offs' in query:
            offs = int(query['offs'])
        else:
            offs = 0
        query_q = query['q']

        # Query modifiers: a "site..." prefix disables site dedup,
        # a "score..." prefix enables score display.
        if query_msg['mods']:
            if query_msg['mods'].startswith("site"):
                site_check = False
            elif query_msg['mods'].startswith("score"):
                show_score = True

    except Exception, x:
        # Fall back to an empty query instead of failing the request.
        erlay.report("Invalid or missing query string: %s" %
                     query_msg['query'])
        traceback.print_exc()
        offs = 0
        query_q = ""
Exemple #2
0
def update_normtable():
    """Rebuild the global normalization table from the collected
    per-iblock normtables and record the rebuild timestamp.

    Reads the module-level `normtables` dict and updates the globals
    `normtable`, `last_normtable_update` and `normtables_need_update`.
    """
    global normtable, last_normtable_update, normtables_need_update
    # Clear the dirty flag before rebuilding; add_normtable() may set
    # it again for data arriving after this point.
    normtables_need_update = False
    erlay.report("Updating normtable")
    # Concatenate every iblock's serialized normtable and convert the
    # result to the judy form used by the scorer.
    normtable = ainodex.normtable_to_judy(
        "".join(normtables.itervalues()))
    last_normtable_update = time.time()
Exemple #3
0
def score(msg):
        ret = {}
        msg = decode_netstring_fd(cStringIO.StringIO(msg))
        #cueset_size, cues = ainodex.expand_cueset(
        #        map(int, msg['cues'].split()))

	cueset_size, cues = ainodex.hits(map(int, msg['cues'].split()), 0)
        cueset_size, cues = filter_hits(cueset_size, cues,\
		prior_check = True, site_check = True)
        ret['cueset_size'] = str(cueset_size)

        ok_layers = [i for i, maxf in enumerate(LAYERS)
                if min(maxf, cueset_size) / float(max(maxf, cueset_size)) 
                                > MIN_SCORE]

        if len(LAYERS) - 1 in ok_layers:
                ok_layers.append(len(LAYERS))
        print "OK", ok_layers, "CUES", cueset_size

        t = time.time()
        
        for i in ok_layers:
                layer = ainodex.new_layer(i, cues)
                ret[str(i)] = ainodex.serialize_layer(layer)
        
        erlay.report("Scoring <%s> took %dms" % 
                (msg['cues'], (time.time() - t) * 1000.0))

        return encode_netstring_fd(ret)
Exemple #4
0
def score(msg):
    ret = {}
    msg = decode_netstring_fd(cStringIO.StringIO(msg))
    #cueset_size, cues = ainodex.expand_cueset(
    #        map(int, msg['cues'].split()))

    cueset_size, cues = ainodex.hits(map(int, msg['cues'].split()), 0)
    cueset_size, cues = filter_hits(cueset_size, cues,\
prior_check = True, site_check = True)
    ret['cueset_size'] = str(cueset_size)

    ok_layers = [
        i for i, maxf in enumerate(LAYERS)
        if min(maxf, cueset_size) / float(max(maxf, cueset_size)) > MIN_SCORE
    ]

    if len(LAYERS) - 1 in ok_layers:
        ok_layers.append(len(LAYERS))
    print "OK", ok_layers, "CUES", cueset_size

    t = time.time()

    for i in ok_layers:
        layer = ainodex.new_layer(i, cues)
        ret[str(i)] = ainodex.serialize_layer(layer)

    erlay.report("Scoring <%s> took %dms" % (msg['cues'],
                                             (time.time() - t) * 1000.0))

    return encode_netstring_fd(ret)
Exemple #5
0
def render(msg):
        msg = cStringIO.StringIO(msg)
        query_msg = decode_netstring_fd(msg)
        score_msg = decode_netstring_fd(msg)
        rank_msg = decode_netstring_fd(msg)
		
	site_check = True
	show_score = False
        
        try:
                query = make_query.parse_qs(query_msg['query'])
                if 'offs' in query:
                        offs = int(query['offs'])
                else:
                        offs = 0
                query_q = query['q']
		
		if query_msg['mods']:
			if query_msg['mods'].startswith("site"):
				site_check = False
			elif query_msg['mods'].startswith("score"):
				show_score = True

        except Exception, x:
                erlay.report("Invalid or missing query string: %s" %
                        query_msg['query'])
                traceback.print_exc()
                offs = 0
                query_q = ""
Exemple #6
0
def make_snippet(dockey, query, layers, seen_sites, seen_md5s):
    """Build a (title, url, snippet) triple for document `dockey`.

    Raises InvalidHit when the document's site or content digest was
    already seen (duplicate suppression).  `seen_sites` and
    `seen_md5s` are updated in place.
    """
    # Locate and decompress the document record: '\0'-separated
    # url, title and body.
    offs, size = map(int, doc_idx[dockey].split())
    doc_db.seek(offs)
    url, title, doc = zlib.decompress(doc_db.read(size)).split('\0')
    title = title.decode("iso-8859-1")
    doc = doc.decode("iso-8859-1")
    site = url2site(url)

    # Show at most one hit per site.
    if site in seen_sites:
        raise InvalidHit
    seen_sites[site] = True

    # Suppress documents with byte-identical content.
    dig = md5.md5(doc.encode("iso-8859-1", "ignore")).digest()
    if dig in seen_md5s:
        raise InvalidHit
    seen_md5s[dig] = True

    title = title.strip()
    # Truncate long titles; an empty title becomes a bare "...".
    if len(title) > 50 or len(title) == 0:
        title = title[:50] + "..."
    # Strip non-word characters from each query term; drop empty terms.
    keys = filter(lambda x: x,
                  [re.sub("(?u)\W", "", key) for key in query.split()])
    erlay.report("KEYS %s" % keys)

    # Remove markup-like artifacts before sentence extraction.
    unic = re.sub("\\.*?\]", "", doc)
    unic = re.sub("[\[\]\|]", "", unic)
    candidates = re.findall(cand_re, unic)
    scores = []

    # Score each candidate sentence: +1 per query term present, plus
    # the layer-based ixeme scores.
    for sentence in candidates:
        txt = re.sub("(?u)\W", " ", sentence)
        txt = txt.encode("iso-8859-1")
        xidbag = filter(lambda x: x,
                        map(ainodex.token2ixeme, txt.split()))
        s = sum(1.0 for key in keys
                if re.search("(?ui)\W%s\W" % key, txt))
        s += sum(ainodex.score_list(xidbag, layer)
                 for layer in layers if layer)
        scores.append((s, sentence))
    scores.sort(reverse=True)

    # Concatenate the best sentences up to SNIP_MAX_LEN, bolding the
    # query terms.
    snip = ""
    for score, sentence in scores:
        if len(snip) + len(sentence) > SNIP_MAX_LEN:
            break
        txt = sentence[1:].strip()
        for key in keys:
            txt = re.sub("(?iu)(\W)(%s)(\W)" % key,
                         r"\1<b>\2</b>\3", txt)
        snip += txt + ".. "

    if len(snip) > SNIP_MAX_LEN:
        snip = snip[:SNIP_MAX_LEN] + "..."

    return title, url, snip
Exemple #7
0
def make_snippet(dockey, query, layers, seen_sites, seen_md5s):
    """Produce (title, url, snippet) for the document at `dockey`.

    Raises InvalidHit for a repeated site or repeated content digest;
    both caches are mutated in place.
    """
    # Fetch and decompress the stored record: url NUL title NUL body.
    start, nbytes = map(int, doc_idx[dockey].split())
    doc_db.seek(start)
    raw = zlib.decompress(doc_db.read(nbytes))
    url, title, body = raw.split('\0')
    title = title.decode("iso-8859-1")
    body = body.decode("iso-8859-1")

    # One hit per site only.
    site = url2site(url)
    if site in seen_sites:
        raise InvalidHit
    seen_sites[site] = True

    # Skip documents whose content was already emitted.
    digest = md5.md5(body.encode("iso-8859-1", "ignore")).digest()
    if digest in seen_md5s:
        raise InvalidHit
    seen_md5s[digest] = True

    # Clamp over-long titles; an empty title turns into "...".
    title = title.strip()
    if not 0 < len(title) <= 50:
        title = title[:50] + "..."

    # Query terms with non-word characters stripped, empties dropped.
    terms = [re.sub("(?u)\W", "", word) for word in query.split()]
    terms = filter(lambda t: t, terms)
    erlay.report("KEYS %s" % terms)

    # Scrub bracket/pipe artifacts before extracting sentences.
    cleaned = re.sub("\\.*?\]", "", body)
    cleaned = re.sub("[\[\]\|]", "", cleaned)

    # Rank candidate sentences: one point per matching term plus the
    # ixeme scores contributed by the active layers.
    ranked = []
    for cand in re.findall(cand_re, cleaned):
        flat = re.sub("(?u)\W", " ", cand)
        flat = flat.encode("iso-8859-1")
        bag = filter(lambda x: x, map(ainodex.token2ixeme, flat.split()))
        pts = sum(1.0 for t in terms
                  if re.search("(?ui)\W%s\W" % t, flat))
        pts += sum(ainodex.score_list(bag, lay) for lay in layers if lay)
        ranked.append((pts, cand))
    ranked.sort(reverse=True)

    # Assemble the snippet from the top sentences, bolding the terms.
    snip = ""
    for pts, cand in ranked:
        if len(snip) + len(cand) > SNIP_MAX_LEN:
            break
        piece = cand[1:].strip()
        for t in terms:
            piece = re.sub("(?iu)(\W)(%s)(\W)" % t,
                           r"\1<b>\2</b>\3", piece)
        snip += piece + ".. "

    if len(snip) > SNIP_MAX_LEN:
        snip = snip[:SNIP_MAX_LEN] + "..."

    return title, url, snip
Exemple #8
0
def merge_ranked(msg):
    """Merge per-iblock ranked result blocks into a single ranking.

    `msg` holds consecutive netstring-encoded dicts, one per iblock,
    each with a 'ranked' blob and a 'num_hits' count.  Returns a
    netstring-encoded dict with the merged ranking and total hit count.
    """
    msg = cStringIO.StringIO(msg)
    r = cStringIO.StringIO()
    num_hits = 0
    # Consume iblock results until the stream is exhausted.
    while True:
        try:
            iblock_ranked = decode_netstring_fd(msg)
            r.write(iblock_ranked['ranked'])
            num_hits += int(iblock_ranked['num_hits'])
        except EOFError:
            break
    top = ainodex.merge_ranked(r.getvalue())
    erlay.report("Top keys: %s" % array.array("I", top)[:20:2])

    return encode_netstring_fd({'merged': top, 'num_hits': str(num_hits)})
Exemple #9
0
def parse_query(msg):
	if normtables_need_update and\
		time.time() - last_normtable_update > NORMTABLE_UPDATE:
		update_normtable()

        query = None
        try:
                query = make_query.parse_qs(msg)
                keys, cues, mods = make_query.make_query(query['q'])
                keys = " ".join(map(str, keys))
                cues = " ".join(map(str, cues))
        	mods = " ".join(mods)
	except Exception, x:
                erlay.report("Query parsing failed: %s" % x)
                if query:
                        erlay.report("Failed query: %s" % query)
                keys = ""
                cues = ""
		mods = ""
                query = ""
Exemple #10
0
def add_normtable(msg):
    """Receive per-iblock normtables and schedule a normtable rebuild.

    `msg` holds consecutive netstring-encoded dicts, each carrying an
    'iblock' id and its serialized 'normtable'.  Rebuilds the global
    normtable immediately when the previous rebuild is old enough.
    Returns the literal string "ok".
    """
    global normtables_need_update
    c = time.time()
    msg = cStringIO.StringIO(msg)
    # Read iblock normtables until the stream is exhausted.
    while True:
        try:
            iblock_normtable = decode_netstring_fd(msg)
        except EOFError:
            break
        normtables[iblock_normtable['iblock']] = \
            iblock_normtable['normtable']

    erlay.report("This took %dms" % ((time.time() - c) * 1000.0))
    normtables_need_update = True

    # Throttle: rebuild at most once per NORMTABLE_UPDATE seconds.
    if time.time() - last_normtable_update > NORMTABLE_UPDATE:
        update_normtable()

    erlay.report("Got normtables from iblocks <%s>" %
                 " ".join(normtables.keys()))
    return "ok"
Exemple #11
0
def rank(msg):
        t = time.time()
        ret = {}
        msg = cStringIO.StringIO(msg)
        query_msg = decode_netstring_fd(msg)
        layer_msg = decode_netstring_fd(msg)
        erlay.report("Rank init took %dms" %\
                        ((time.time() - t) * 1000.0))

        print >> sys.stderr, "QUERY", query_msg

	if query_msg['mods'] and query_msg['mods'].startswith("site:"):
		ok_site = hash(query_msg['mods'][5:])
		print >> sys.stderr, "SHOW SITE", query_msg['mods'], ok_site
	else:
		ok_site = 0

        t = time.time()
	hits_len, hits = ainodex.hits(map(int, query_msg['keys'].split()), 0)
        ret['num_hits'] = str(hits_len)
        hits_len, hits = filter_hits(hits_len, hits,\
		site_check = True, prior_check=True, show_site=ok_site)
        erlay.report("Hits took %dms" %\
                        ((time.time() - t) * 1000.0))
       
       	print "HITS_LEN", hits_len
        t = time.time()
        layers = [None] * 10
        for layer_str in layer_msg.itervalues():
                ainodex.deserialize_layer(layer_str, layers)
        erlay.report("Deser took %dms" %\
                        ((time.time() - t) * 1000.0))

	#kkeys = map(lambda x: ainopy.did2key(ainopy.sid2doc(x)[0]), ainodex.hit_contents(hits))


        t = time.time()
        ret['ranked'] = ainodex.rank(hits, layers) 
	print >> sys.stderr, "RANKED", array.array("I", ret["ranked"])[:20:2]
	#for key in array.array("I", ret["ranked"])[:20:2]:
	#if key not in okkeys:
	#	print >> sys.stderr, "NOT IN OK", key

        print "LL", len(ret['ranked'])
        erlay.report("Ranking <%s><%s> took %dms" % 
                (query_msg['keys'], query_msg['cues'], 
                        (time.time() - t) * 1000.0))

        return encode_netstring_fd(ret)
Exemple #12
0
def rank(msg):
    t = time.time()
    ret = {}
    msg = cStringIO.StringIO(msg)
    query_msg = decode_netstring_fd(msg)
    layer_msg = decode_netstring_fd(msg)
    erlay.report("Rank init took %dms" %\
                    ((time.time() - t) * 1000.0))

    print >> sys.stderr, "QUERY", query_msg

    if query_msg['mods'] and query_msg['mods'].startswith("site:"):
        ok_site = hash(query_msg['mods'][5:])
        print >> sys.stderr, "SHOW SITE", query_msg['mods'], ok_site
    else:
        ok_site = 0

    t = time.time()
    hits_len, hits = ainodex.hits(map(int, query_msg['keys'].split()), 0)
    ret['num_hits'] = str(hits_len)
    hits_len, hits = filter_hits(hits_len, hits,\
site_check = True, prior_check=True, show_site=ok_site)
    erlay.report("Hits took %dms" %\
                    ((time.time() - t) * 1000.0))

    print "HITS_LEN", hits_len
    t = time.time()
    layers = [None] * 10
    for layer_str in layer_msg.itervalues():
        ainodex.deserialize_layer(layer_str, layers)
    erlay.report("Deser took %dms" %\
                    ((time.time() - t) * 1000.0))

    #kkeys = map(lambda x: ainopy.did2key(ainopy.sid2doc(x)[0]), ainodex.hit_contents(hits))

    t = time.time()
    ret['ranked'] = ainodex.rank(hits, layers)
    print >> sys.stderr, "RANKED", array.array("I", ret["ranked"])[:20:2]
    #for key in array.array("I", ret["ranked"])[:20:2]:
    #if key not in okkeys:
    #	print >> sys.stderr, "NOT IN OK", key

    print "LL", len(ret['ranked'])
    erlay.report("Ranking <%s><%s> took %dms" %
                 (query_msg['keys'], query_msg['cues'],
                  (time.time() - t) * 1000.0))

    return encode_netstring_fd(ret)
Exemple #13
0
def merge_scores(msg):
        msg = cStringIO.StringIO(msg)

        layers = [None] * 10
        cueset_size = 0

        while True:
                try:
                        iblock_layers = decode_netstring_fd(msg)
                except EOFError:
                        break

                cueset_size += int(iblock_layers['cueset_size'])
                del iblock_layers['cueset_size']
		
                for layer_data in iblock_layers.itervalues():
                        offs, layer_id, layer =\
                                ainodex.deserialize_layer(
                                        layer_data, layers)

	#XXX: Since ixemes are allocated on different layers on each layer,
	# we must make sure that the ixeme counts match on every layer. This
	# could be easily avoided if ixemes were on the same layers on all
	# iblocks.  This should be easy to fix.
	t = time.time()
	ainodex.sync_layers(layers)
	erlay.report("Syncing layers took %dms" %\
	                   ((time.time() - t) * 1000.0))

        print "CUE", type(cueset_size), cueset_size
        for layer in layers:
                if layer:
                        ainodex.normalize_layer(layer, normtable, cueset_size)

        layers = [(str(i), ainodex.serialize_layer(layer))
                        for i, layer in enumerate(layers) if layer]

        return encode_netstring_fd(dict(layers))
Exemple #14
0
        try:
                query = make_query.parse_qs(msg)
                keys, cues, mods = make_query.make_query(query['q'])
                keys = " ".join(map(str, keys))
                cues = " ".join(map(str, cues))
        	mods = " ".join(mods)
	except Exception, x:
                erlay.report("Query parsing failed: %s" % x)
                if query:
                        erlay.report("Failed query: %s" % query)
                keys = ""
                cues = ""
		mods = ""
                query = ""

        erlay.report("Query string: <%s> Keys: <%s> Cues: <%s>" %\
                (query, keys, cues))
        
        return encode_netstring_fd({'keys': keys, 'cues': cues, 
			'mods': mods, 'query': msg})


def merge_scores(msg):
        msg = cStringIO.StringIO(msg)

        layers = [None] * 10
        cueset_size = 0

        while True:
                try:
                        iblock_layers = decode_netstring_fd(msg)
                except EOFError:
Exemple #15
0
            if not site_check:
                seen_sites = {}

            if valid_hits >= offs:
                site = url2site(url)
                results.append(
                    tuple(
                        map(lambda x: x.encode("iso-8859-1"),
                            (title, url, site, snip))))
            valid_hits += 1
            if len(results) == 10:
                break
        except InvalidHit:
            continue
        except Exception, x:
            erlay.report("Make snippet failed ("
                         "query:<%s> dockey:<%d>)" % (query_q, dockey))
            traceback.print_exc()

    cues = cue_re.findall(query_q)
    keys = map(lambda x: nice_re.sub("", x).strip(),\
     cue_re.split(site_re.sub("", query_q)))

    #erlay.report("CUES <%s>" % cues)
    #erlay.report("KEYS <%s>" % keys)

    if site_check:
        base_q = " ".join(keys) + " ".join(cues)
    else:
        base_q = ""

#erlay.report("BASE <%s>" % base_q)
Exemple #16
0
    t = time.time()
    ret['ranked'] = ainodex.rank(hits, layers)
    print >> sys.stderr, "RANKED", array.array("I", ret["ranked"])[:20:2]
    #for key in array.array("I", ret["ranked"])[:20:2]:
    #if key not in okkeys:
    #	print >> sys.stderr, "NOT IN OK", key

    print "LL", len(ret['ranked'])
    erlay.report("Ranking <%s><%s> took %dms" %
                 (query_msg['keys'], query_msg['cues'],
                  (time.time() - t) * 1000.0))

    return encode_netstring_fd(ret)


erlay.report("--- dex [%s/%d] starts ---" % (NAME, IBLOCK))

ainodex.open()
ainopy.open()

try:
    seg_hashes = map(int, file("%s.%d.seghash" % (NAME, IBLOCK)).readlines())
    erlay.report("Segment hashes read. Duplicate checking enabled.")
except:
    erlay.report("Could not open segment hashes. Duplicate checking disabled.")
    seg_hashes = None

try:
    sites = read_sites()
    erlay.report("Site hashes read. Site checking enabled.")
except:
Exemple #17
0
        ret['ranked'] = ainodex.rank(hits, layers) 
	print >> sys.stderr, "RANKED", array.array("I", ret["ranked"])[:20:2]
	#for key in array.array("I", ret["ranked"])[:20:2]:
	#if key not in okkeys:
	#	print >> sys.stderr, "NOT IN OK", key

        print "LL", len(ret['ranked'])
        erlay.report("Ranking <%s><%s> took %dms" % 
                (query_msg['keys'], query_msg['cues'], 
                        (time.time() - t) * 1000.0))

        return encode_netstring_fd(ret)



erlay.report("--- dex [%s/%d] starts ---" % (NAME, IBLOCK))

ainodex.open()
ainopy.open()

try:
	seg_hashes = map(int, file("%s.%d.seghash" % (NAME, IBLOCK)).readlines())
	erlay.report("Segment hashes read. Duplicate checking enabled.")
except:
	erlay.report("Could not open segment hashes. Duplicate checking disabled.")
	seg_hashes = None

try:
	sites = read_sites()
	erlay.report("Site hashes read. Site checking enabled.")
except:
Exemple #18
0
					seen_sites, seen_md5s)
			if not site_check:
				seen_sites = {}

                        if valid_hits >= offs:
				site = url2site(url)
				results.append(tuple(map(lambda x:
					x.encode("iso-8859-1"), 
						(title, url, site, snip))))
			valid_hits += 1
			if len(results) == 10:
				break
		except InvalidHit:
			continue
                except Exception, x:
                        erlay.report("Make snippet failed ("
                                "query:<%s> dockey:<%d>)" % (query_q, dockey))
                        traceback.print_exc()

        cues = cue_re.findall(query_q)
	keys = map(lambda x: nice_re.sub("", x).strip(),\
		cue_re.split(site_re.sub("", query_q)))

	#erlay.report("CUES <%s>" % cues)
	#erlay.report("KEYS <%s>" % keys)

	if site_check:
		base_q = " ".join(keys) + " ".join(cues)
	else:
		base_q = ""
	
	#erlay.report("BASE <%s>" % base_q)