Beispiel #1
0
def search(con_name, db_name, threshold, measure, sentence, N, is_feature, features) :
	"""Return the stored strings that approximately match the query.

	The query is turned into terms, the terms are ordered by their stored
	frequency (rarest first), and cpmerge() is asked for the ids of the
	matching documents, which are then resolved to their "strings" field.

	Parameters:
		con_name   -- passed to pymongo.Connection(); "" uses the default
		              connection arguments.
		db_name    -- name of the database holding the "strings" and
		              "freq" collections.
		threshold  -- forwarded to set_Y_range() and cpmerge().
		measure    -- forwarded to set_Y_range() and cpmerge().
		sentence   -- query text; used only when is_feature is false.
		N          -- forwarded to build_stringsdb.create_ngrams(); used
		              only when is_feature is false.
		is_feature -- when true, the terms come from `features` instead of
		              being generated from `sentence`.
		features   -- tab-separated terms; used only when is_feature is true.

	Returns:
		A list with the "strings" value of every matched document.

	Side effects:
		Rebinds the module globals conn, db and coll. The connection is
		always disconnected before this function exits, including when
		one of the lookups raises.
	"""
	# set up the strings collection and the frequency collection
	global conn, db, coll
	if con_name != "" :
		conn = pymongo.Connection(con_name)
	else :
		conn = pymongo.Connection()

	# Everything after a successful connect runs under try/finally so the
	# connection is released even if a query or cpmerge() fails.
	try :
		db = conn[db_name]
		coll = db["strings"]
		coll_freq = db["freq"]

		# set up terms
		if not is_feature :
			terms = build_stringsdb.create_ngrams(sentence, N)
		else :
			terms = features.strip().split("\t")
		terms_size = len(terms)

		# sort terms by frequency; terms without a frequency record count as 0
		terms_freq = []
		for term in terms :
			record = coll_freq.find_one({"term":term})
			freq = record["freq"] if record else 0
			terms_freq.append([freq, term])
		terms_freq.sort(key = lambda pair : pair[0])

		# set up the measure-specific bounds
		minY, maxY = set_Y_range(measure, threshold, terms_size)

		# search matched ids
		matched_ids = cpmerge(terms_freq, minY, maxY, threshold, measure)

		# translate ids to strings
		similar_strings = []
		for matched_id in matched_ids :
			for data in coll.find({"_id":matched_id}) :
				similar_strings.append(data["strings"])

		return similar_strings
	finally :
		conn.disconnect()
Beispiel #2
0
def calc_similarity(measure, x, y, N) :
	"""Compute and print the n-gram similarity of the strings x and y.

	Both strings are split with build_stringsdb.create_ngrams(..., N).
	Only the "cosine" measure is handled in the visible code; for any
	other value of `measure`, similarity keeps its initial value of -1.

	NOTE(review): for "cosine", `similarity` is bound to a 2-tuple
	(score, set of shared n-grams) rather than a bare number -- confirm
	that this is intended.
	NOTE(review): no value is returned in the visible part of this
	function; the result is only printed.
	"""
	X = build_stringsdb.create_ngrams(x, N)
	Y = build_stringsdb.create_ngrams(y, N)
	# -1 marks "no supported measure was applied"
	similarity = -1
	# cosine score: |X ∩ Y| / sqrt(|X| * |Y|), with the intersection taken over
	# distinct n-grams but the lengths taken over X and Y as returned.
	# The trailing ", set(X)&set(Y)" makes the assigned value a tuple.
	# Raises ZeroDivisionError when len(X) * len(Y) is 0.
	if measure == "cosine" : similarity = len(set(X)&set(Y)) / math.sqrt(len(X) * len(Y)), set(X)&set(Y)
	# Python 2 print statement: writes x, y and similarity separated by spaces
	print x,y,similarity