def score_pods(query, query_dist, query_freqs):
    """Score all registered pods against a query.

    Combines a distributional-semantics cosine score, a term cosine score,
    and (doubly-weighted) term coverage into one score per pod.

    :param query: raw query string.
    :param query_dist: query distributional vector (array-like).
    :param query_freqs: query term-frequency structure passed to term_cosine.
    :return: list of pod names — every pod when all scores are poor,
             otherwise only the single best-scoring pod.
    """
    pod_scores = {}
    score_sum = 0.0
    for pod in db.session.query(Pods).filter_by(registered=True).all():
        ds_score = cosine_similarity(convert_to_array(pod.DS_vector), query_dist)
        term_score, coverage = term_cosine.run(query, query_freqs, pod.word_vector)
        # Coverage is weighted twice as heavily as either cosine score.
        score = ds_score + term_score + 2 * coverage
        if math.isnan(score):
            score = 0
        pod_scores[pod.name] = score
        score_sum += score
    # NOTE(review): debug output kept to preserve observable behavior.
    print(pod_scores)
    # If all scores are rubbish, search entire pod collection (we're desperate!)
    if score_sum < 1:
        return list(pod_scores.keys())
    # score_sum >= 1 guarantees pod_scores is non-empty, so max() is safe.
    # Ties resolve to the first pod encountered, matching the original
    # stable sorted()-then-take-first behavior.
    return [max(pod_scores, key=pod_scores.get)]
def score(query, query_dist, query_freqs):
    """Get distributional and term scores for each pod.

    :param query: raw query string.
    :param query_dist: query distributional vector (array-like).
    :param query_freqs: query term-frequency structure passed to term_cosine.
    :return: tuple of two dicts keyed by pod URL:
             (distributional cosine scores, term cosine scores).

    NOTE(review): this filters registered=False while score_pods filters
    registered=True — confirm the opposite filter is intentional.
    """
    DS_scores = {}
    term_scores = {}
    for p in db.session.query(Pods).filter_by(registered=False).all():
        DS_scores[p.url] = cosine_similarity(convert_to_array(p.DS_vector), query_dist)
        # term_cosine.run also returns a coverage value; it was collected
        # into a dict here but never returned, so it is discarded.
        term_scores[p.url], _ = term_cosine.run(query, query_freqs, p.word_vector)
    return DS_scores, term_scores
def run(q, d1, d2_s):
    """Compute term cosine similarity and query coverage between two
    frequency distributions.

    :param q: raw query string (whitespace-tokenised to count query terms).
    :param d1: query term-frequency dict.
    :param d2_s: document term-frequency dict, serialised as a string.
    :return: (cosine similarity of the normalised vectors,
              fraction of query terms covered by the document).
    """
    d2 = convert_string_to_dict(d2_s)
    dimensions = return_keys(d1, d2)
    v1 = mk_vector(d1, dimensions)
    v2 = mk_vector(d2, dimensions)
    v1_bin = binarise(v1)
    v2_bin = binarise(v2)
    num_query_terms = len(q.split())
    # Guard: an empty/whitespace-only query previously raised
    # ZeroDivisionError here; report zero similarity and coverage instead.
    if num_query_terms == 0:
        return 0.0, 0.0
    # Overlap of non-zero dimensions, as a fraction of query terms.
    coverage = sum(v1_bin * v2_bin) / num_query_terms
    d1_vec = normalise(v1)
    d2_vec = normalise(v2)
    return cosine_similarity(d1_vec, d2_vec), coverage
def score(query, query_dist, query_freqs, pod):
    """Score every URL in a pod against the query.

    Computes, per URL: a distributional cosine score, a URL-string overlap
    score, a title overlap score, and a term cosine score with coverage.
    Known to be slow — per-URL vector cosines; a vector-to-matrix
    formulation is work in progress (see commented-out code).

    :param query: raw query string.
    :param query_dist: query distributional vector (array-like).
    :param query_freqs: query term-frequency structure passed to term_cosine.
    :param pod: pod identifier used to filter Urls.
    :return: (DS_scores, URL_scores, title_scores, term_scores, coverages),
             each a dict keyed by URL.
    """
    DS_scores, URL_scores, title_scores, term_scores, coverages = {}, {}, {}, {}, {}
    #cosines = cosine_to_matrix(query_dist,DS_M) #Code for vec to matrix cosine calculation -- work in progress
    documents = db.session.query(Urls).filter_by(pod=pod).all()
    for doc in documents:
        key = doc.url
        DS_scores[key] = cosine_similarity(convert_to_array(doc.vector), query_dist)
        #DS_scores[key] = cosines[url_to_mat[key]]
        URL_scores[key] = score_url_overlap(query, key)
        title_scores[key] = generic_overlap(query, doc.title)
        term_scores[key], coverages[key] = term_cosine.run(query, query_freqs, doc.freqs)
    return DS_scores, URL_scores, title_scores, term_scores, coverages
def sim_to_matrix_url(target_url, n):
    """Find the n URLs most similar to a target URL by vector cosine.

    :param target_url: URL whose stored vector is the comparison target.
    :param n: maximum number of neighbours to return.
    :return: (neighbours, total) where neighbours is a list of
             [url, title, snippet] triples sorted by descending cosine,
             and total is the number of URLs compared.
    """
    target = convert_to_array(get_db_url_vector(target_url))
    cosines = {}
    for u in Urls.query.all():
        cosines[u.url] = cosine_similarity(target, convert_to_array(u.vector))
    # max(n, 0) keeps slicing consistent with the original counter loop,
    # which returned no neighbours for n <= 0.
    top_urls = sorted(cosines, key=cosines.get, reverse=True)[:max(n, 0)]
    neighbours = []
    for url in top_urls:
        # Fetch the record once — the original issued three identical
        # queries per neighbour (for title, snippet, and url).
        record = Urls.query.filter(Urls.url == url).first()
        neighbours.append([record.url, record.title, record.snippet])
    return neighbours, len(cosines)