def search(con_name, db_name, threshold, measure, sentence, N, is_feature, features):
    """Return the stored strings that approximately match a query.

    Opens a MongoDB connection, turns the query into a list of terms,
    orders the terms by their stored frequency, asks ``cpmerge`` for the
    matching document ids and finally maps those ids back to the values
    kept under the ``"strings"`` key of each document.

    Args:
        con_name: Argument handed to ``pymongo.Connection``. An empty
            string means "connect with pymongo's defaults".
        db_name: Name of the database holding the ``strings`` and ``freq``
            collections.
        threshold: Similarity threshold, forwarded to ``set_Y_range`` and
            ``cpmerge``.
        measure: Similarity measure identifier, forwarded to
            ``set_Y_range`` and ``cpmerge``.
        sentence: Query text; only used when ``is_feature`` is false.
        N: Forwarded to ``build_stringsdb.create_ngrams`` (presumably the
            n-gram size -- confirm against that helper).
        is_feature: When true, ``features`` is used instead of ``sentence``.
        features: Tab-separated list of pre-computed terms; only used when
            ``is_feature`` is true.

    Returns:
        A list with the ``"strings"`` value of every matched document, in
        the order in which ``cpmerge`` returned the ids.

    Side effects:
        Rebinds the module-level names ``conn``, ``db`` and ``coll``.
    """
    # NOTE(review): ``cpmerge`` is not given a collection, so it presumably
    # reads these module-level globals -- keep them in place.
    global conn, db, coll

    if con_name != "":
        conn = pymongo.Connection(con_name)
    else:
        conn = pymongo.Connection()

    # Everything after a successful connect runs under try/finally so that
    # the connection is released even when a lookup or helper raises.
    try:
        db = conn[db_name]
        coll = db["strings"]
        coll_freq = db["freq"]

        # Build the query terms.
        if not is_feature:
            terms = build_stringsdb.create_ngrams(sentence, N)
        else:
            terms = features.strip().split("\t")
        terms_size = len(terms)

        # Pair every term with its stored frequency (0 when the term is
        # unknown) and order the pairs from rarest to most frequent.
        terms_freq = []
        for term in terms:
            doc = coll_freq.find_one({"term": term})
            freq = doc["freq"] if doc else 0
            terms_freq.append([freq, term])
        terms_freq.sort(key=lambda pair: pair[0])

        # Range derived from the measure, the threshold and the query size.
        minY, maxY = set_Y_range(measure, threshold, terms_size)

        # Ids of the stored strings that satisfy the threshold.
        matched_ids = cpmerge(terms_freq, minY, maxY, threshold, measure)

        # Translate the ids back into the stored strings.
        similar_stringss = []
        for matched_id in matched_ids:
            for data in coll.find({"_id": matched_id}):
                similar_stringss.append(data["strings"])
        return similar_stringss
    finally:
        conn.disconnect()
def calc_similarity(measure, x, y, N) : X = build_stringsdb.create_ngrams(x, N) Y = build_stringsdb.create_ngrams(y, N) similarity = -1 if measure == "cosine" : similarity = len(set(X)&set(Y)) / math.sqrt(len(X) * len(Y)), set(X)&set(Y) print x,y,similarity