def run(q, d1, d2_s):
    """Score a document pair against a query.

    Args:
        q: raw query string (whitespace-tokenised for the coverage metric).
        d1: first document's dimension->value dict.
        d2_s: second document's dict, serialised as a string
            (decoded with convert_string_to_dict).

    Returns:
        (similarity, coverage): cosine similarity between the two
        normalised document vectors, and the count of dimensions active
        in both binarised vectors divided by the number of query tokens.
    """
    d2 = convert_string_to_dict(d2_s)
    dimensions = return_keys(d1, d2)
    v1 = mk_vector(d1, dimensions)
    v2 = mk_vector(d2, dimensions)
    v1_bin = binarise(v1)
    v2_bin = binarise(v2)
    query_terms = q.split()
    # Guard against an empty query: the unguarded division raised
    # ZeroDivisionError; an empty query trivially has zero coverage.
    coverage = sum(v1_bin * v2_bin) / len(query_terms) if query_terms else 0.0
    d1_vec = normalise(v1)
    d2_vec = normalise(v2)
    return cosine_similarity(d1_vec, d2_vec), coverage
def compute_pod_summary(name):
    """Summarise a pod as a DS vector and a top-300 word-frequency string.

    This function is very similar to 'self' in PeARS-pod. It sums the
    distributional vectors of every URL in the pod, normalises the sum,
    and merges the per-URL word frequency dicts.

    Args:
        name: pod name used to filter the Urls table.

    Returns:
        (DS_vector, word_vector): the normalised summed vector serialised
        to a string, and a 'word:freq ' string (each entry followed by a
        space) for the 300 most frequent words.
    """
    DS_vector = np.zeros(400)  # dimensionality of the semantic space
    freqs = {}
    for u in db.session.query(Urls).filter_by(pod=name).all():
        DS_vector += convert_to_array(u.vector)
        for k, v in convert_string_to_dict(u.freqs).items():
            # Accumulate frequencies across all URLs of the pod;
            # .get avoids the double-lookup if/else of the original.
            freqs[k] = freqs.get(k, 0) + int(v)
    DS_vector = convert_to_string(normalise(DS_vector))
    # Top 300 words by frequency. sorted() is stable, so ties keep the
    # same order as the original count-and-break loop; ''.join replaces
    # quadratic += concatenation and preserves the trailing space.
    top_words = sorted(freqs, key=freqs.get, reverse=True)[:300]
    word_vector = ''.join(w + ':' + str(freqs[w]) + ' ' for w in top_words)
    return DS_vector, word_vector
def compute_query_vectors(query):
    """Make a distributional vector and a frequency vector for a query.

    Args:
        query: raw query string; a trailing newline is stripped and the
            rest is whitespace-tokenised.

    Returns:
        (vbase, freqs): the normalised sum of the semantic-space vectors
        of all in-vocabulary words (an empty np.array([]) when no word is
        in dm_dict_en), and the frequency vector over ALL query words.
    """
    words = query.rstrip('\n').split()
    # Only retain words which are in the distributional semantic space;
    # the original also built a second list that was an exact copy of
    # `words`, which is now used directly for the frequency vector.
    vecs_to_add = [word for word in words if word in dm_dict_en]
    vbase = np.array([])
    if vecs_to_add:
        # Start from the first in-vocabulary word, then add the rest.
        vbase = dm_dict_en[vecs_to_add[0]]
        for vec in vecs_to_add[1:]:
            # '+' builds a new array, leaving the dict entries unmutated.
            vbase = vbase + dm_dict_en[vec]
        vbase = normalise(vbase)
    freqs = compute_freq_vector(words)
    return vbase, freqs
def pod_from_scratch(name, url, language, description):
    """Create (if missing) and populate the Pods row for a given url.

    Args:
        name: pod display name.
        url: unique pod url (used as the lookup key).
        language: pod language code.
        description: free-text pod description.

    Side effects: inserts/updates a Pods row and commits the db session.
    """
    if not db.session.query(Pods).filter_by(url=url).all():
        p = Pods(url=url)
        db.session.add(p)
        db.session.commit()
    p = Pods.query.filter(Pods.url == url).first()
    p.name = name
    p.description = description
    p.language = language
    # Using compute_query_vectors as a hack to get vectors from the pod's
    # name + description. BUG FIX: compute_query_vectors takes a single
    # argument; the stray dm_dict_en second argument raised TypeError.
    vector, freqs = compute_query_vectors(name.lower() + ' ' + description.lower())
    p.DS_vector = convert_to_string(normalise(vector))
    # Top 300 words by frequency; stable sort keeps the original tie
    # order, ''.join avoids quadratic += and keeps the trailing space.
    top_words = sorted(freqs, key=freqs.get, reverse=True)[:300]
    p.word_vector = ''.join(w + ':' + str(freqs[w]) + ' ' for w in top_words)
    if not p.registered:
        # Normalise a missing/None flag to an explicit False.
        p.registered = False
    db.session.commit()