def word_mover_distance(word_embedding_dict_source, word_embedding_dict_target):
    """Calculate the Word Mover's Distance (Euclidean ground distance) between two arrays of word embeddings."""
    try:
        source = np.array(word_embedding_dict_source, dtype=np.float32)
        target = np.array(word_embedding_dict_target, dtype=np.float32)
        embeddings = np.concatenate((source, target))
        source_len = source.shape[0]
        target_len = target.shape[0]
        source_words = np.array([i for i in range(source_len)], dtype=np.int32)
        target_words = np.array(
            [source_len + i for i in range(target_len)], dtype=np.int32)
        source_weights = np.array([1 for i in range(source_len)], dtype=np.int32)
        target_weights = np.array([1 for i in range(target_len)], dtype=np.int32)
        nbow = {
            "source": ("source", source_words, source_weights),
            "target": ("target", target_words, target_weights)
        }
        calc = WMD(embeddings, nbow, vocabulary_min=2)
        return calc.nearest_neighbors("source", 1)[0][1]
    except (ValueError, IndexError):
        return 0
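# A minimal usage sketch for word_mover_distance above; the vectors are toy values chosen
# only for illustration, and numpy / wmd.WMD are assumed to be imported as `np` / `WMD`
# in the surrounding module.
source_vectors = [[0.1, 1.0], [1.0, 0.1]]   # one row per source word embedding
target_vectors = [[0.9, 0.2], [0.2, 0.8]]   # one row per target word embedding
print(word_mover_distance(source_vectors, target_vectors))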
def calc_smd(input_f, output_f="", WORD_REP='elmo', METRIC='sms'):
    if WORD_REP == "elmo":
        MODEL = ElmoEmbedder()
    inF = open(input_f, 'r')
    inLines = inF.readlines()
    inF.close()
    # print("Found", len(inLines), "documents")
    token_doc_list, text_doc_list = tokenize_texts(inLines, WORD_REP, tokenize=True)
    count = 0
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        text = text_doc_list[doc_id]
        # transform doc to ID lists (words and/or sentences); get ID dict that maps to embeddings
        [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text, WORD_REP, MODEL, METRIC)
        # get D values
        [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids], METRIC)
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {"0": ("ref", ref_id_list, ref_d),
                    "1": ("hyp", hyp_id_list, hyp_d)}
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
        except Exception as e:
            print(e)
            continue  # skip this document; `dist` would otherwise be unbound below
        sim = math.exp(-dist)  # switch to similarity
        results_list.append(sim)
        if doc_id == int((len(token_doc_list) / 10.) * count):
            print(str(count * 10) + "% done with calculations")
            count += 1
    if output_f != "":
        print_score(inLines, output_f, results_list)
    else:
        print("Results: ", np.mean(results_list))
    return 'Done!'
def calculate_similarity(candidate, next_id, emb):
    s = time.time()
    can_doc = calculator.nlp(candidate[essay_field])
    similarities = []
    next_id, emb, can_id_list, can_weights = calculator.get_embeddings_ids_weights(
        can_doc, next_id, emb, method)
    nbow = {"hypothesis": ("hypothesis", can_id_list, can_weights)}
    for id, item in processed_refs.items():
        ref_weights = item["weights"]
        ref_id_list = item["id_list"]
        nbow[id] = (id, ref_id_list, ref_weights)
    calc = WMD(emb, nbow, vocabulary_min=1)
    # print("NBOW")
    # print(nbow)
    distances = calc.nearest_neighbors("hypothesis", k=len(processed_refs), early_stop=1)
    for id, dist in distances:
        similarity = np.exp(-dist)
        similarities.append({
            "candidate_id": candidate[id_field],
            "reference_id": id,
            "similarity": similarity,
            "dist": dist,
            "score": candidate[score_field]
        })
    print("Time taken for candidate " + str(candidate[id_field]) + " is " +
          str(time.time() - s))
    return similarities
def compute_q(self, f_df, q_df, return_f_nbow=False):
    logger.info('Computing question wmds')
    f_nbow = {
        row.Index: self.nbowify(row.Index, row.original)
        for row in f_df.itertuples()
    }
    nb_facts = len(f_nbow)
    q_nbow = {
        row.Index + nb_facts: self.nbowify(row.Index + nb_facts, row.original)
        for row in q_df.itertuples()
    }
    merged_fnbow = copy.copy(f_nbow)
    merged_fnbow.update(q_nbow)
    q_calc = WMD(SpacyEmbeddings(self.nlp), merged_fnbow,
                 vocabulary_min=1, verbosity=logging.WARNING)
    q_calc.cache_centroids()
    q_closest = pd.Series(
        np.array([
            i for i, _ in q_calc.nearest_neighbors(
                idx, k=self.config.nearest_k_visible)
            if i < nb_facts
        ])
        for idx in tqdm(q_nbow.keys(), desc='Question wmd...'))
    return (q_closest, f_nbow) if return_f_nbow else q_closest
def get_similarity_dist(self, candidate, reference, method):
    emb, nbow = self.get_emb_nbow(candidate, reference, method)
    # print("emb:", emb.keys())
    # print("nbow:", nbow)
    calc = WMD(emb, nbow, vocabulary_min=1)
    dist = calc.nearest_neighbors("reference", k=1, early_stop=1)
    # print("Dist:", dist)
    dist = dist[0][1]
    similarity = np.exp(-dist)
    return similarity, dist
def get_sim(doc, text, wordrep, model, metric):
    [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text, wordrep, model, metric)
    [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids], metric)
    # format doc as expected: {id: (id, ref_id_list, ref_d)}
    doc_dict = {"0": ("ref", ref_id_list, ref_d),
                "1": ("hyp", hyp_id_list, hyp_d)}
    calc = WMD(rep_map, doc_dict, vocabulary_min=1)
    try:
        dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
    except Exception:
        return 0.0
    sim = math.exp(-dist)  # switch to similarity
    return sim
def calc_smd(inLines, model):
    global nlp
    nlp = model
    # print("Found", len(inLines), "documents")
    # TODO: rewrite this
    token_doc_list, text_doc_list = tokenize_texts(inLines)
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]  # TODO: rewrite this
        text = text_doc_list[doc_id]
        # transform doc to ID lists (words and/or sentences); get ID dict that maps to embeddings
        # TODO: rewrite this
        try:
            [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
        except ValueError:
            print(inLines[doc_id])
            print('ValueError: max() arg is an empty sequence; get_embeddings')
            continue
        # get D values
        [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids])
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {
            "0": ("ref", ref_id_list, ref_d),
            "1": ("hyp", hyp_id_list, hyp_d)
        }
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            dist = calc.nearest_neighbors(
                str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
            sim = math.exp(-dist)  # switch to similarity
        except IndexError:
            print('dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]')
            print('IndexError: list index out of range')
            print(inLines[doc_id])
            continue
        except UnboundLocalError:
            print('dist could not be calculated')
            print(inLines[doc_id])
            continue
        except ValueError:
            print('Too little vocabulary')
            print(inLines[doc_id])
            continue
        results_list.append((inLines[doc_id], sim))
    return score_list(results_list)
def compute_f(self, f_df, f_nbow=None):
    logger.info('Computing fact wmds')
    f_nbow = {
        row.Index: self.nbowify(row.Index, row.original)
        for row in f_df.itertuples()
    } if f_nbow is None else f_nbow
    f_calc = WMD(SpacyEmbeddings(self.nlp), f_nbow,
                 vocabulary_min=1, verbosity=logging.WARNING)
    f_calc.cache_centroids()
    f_closest = pd.Series(
        np.array([
            i for i, _ in f_calc.nearest_neighbors(
                idx, k=self.config.nearest_k_visible)
        ])
        for idx in tqdm(f_nbow.keys(), desc='Fact wmd...'))
    return f_closest
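# compute_f / compute_q above rely on a `nbowify` helper that is not shown. A hypothetical
# sketch of what it could look like (written here as a free function taking `nlp`; in the
# class it is a method taking `self`): it must return the (name, token ids, weights) triple
# that wmd.WMD expects, with ids that SpacyEmbeddings can resolve to vectors (spaCy orth ids).
from collections import Counter

import numpy as np


def nbowify(idx, text, nlp):
    doc = nlp(text)
    tokens = [t for t in doc if t.is_alpha and not t.is_stop]
    counts = Counter(t.orth for t in tokens)  # orth id -> term frequency
    ids = sorted(counts)
    weights = np.array([counts[i] for i in ids], dtype=np.float32)
    return (idx, ids, weights)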
def retrieve(self, top_id: str, k=None, only=None):
    assert only, 'not searching anything'
    index = self.db.mapping
    delta = common.timer()

    def to_nbow(doc_id):
        # transform to the nbow model used by wmd.WMD:
        # ('human readable name', 'item identifiers', 'weights')
        doc = index[doc_id]
        return (doc_id, doc.idx, doc.freq)

    docs = {d: to_nbow(d) for d in only + [top_id]}
    calc = WMDR(self.emb, docs, vocabulary_min=2)
    calc.cache_centroids()
    nn = calc.nearest_neighbors(top_id, k=k)
    self._times.append(delta())
    assert len(nn) == k, f'{len(nn)} not {k}'
    return [Result(*n) for n in nn]
def calc_smd(opts, output_f=""):
    inF = open(opts.input_file, 'r')
    inLines = inF.readlines()
    inF.close()
    print("Found", len(inLines), "documents")
    token_doc_list, text_doc_list = tokenize_texts(inLines)
    count = 0
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        text = text_doc_list[doc_id]
        # transform doc to ID lists (words and/or sentences); get ID dict that maps to embeddings
        [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
        # get D values
        [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids])
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {
            "0": ("ref", ref_id_list, ref_d),
            "1": ("hyp", hyp_id_list, hyp_d)
        }
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            dist = calc.nearest_neighbors(
                str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
        except Exception:
            print(doc, text)
            continue  # skip this document; `dist` is undefined if the lookup failed
        sim = math.exp(-dist)  # switch to similarity
        results_list.append(sim)
        if doc_id == int((len(token_doc_list) / 10.) * count):
            print(str(count * 10) + "% done with calculations")
            count += 1
    # added by wchen to compute correlation scores with human annotated scores
    hscoreF = open(opts.score_file, 'r')
    hscoreLines = hscoreF.readlines()
    hscoreF.close()
    compute_corrs(opts, results_list, hscoreLines)
def calc_smd(ref, hyp, model):
    global nlp
    nlp = model
    doc, text = tokenize_texts([ref, hyp])
    # transform doc to ID lists (words and/or sentences); get ID dict that maps to embeddings
    [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
    # get D values
    [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids])
    # format doc as expected: {id: (id, ref_id_list, ref_d)}
    doc_dict = {
        "0": ("ref", ref_id_list, ref_d),
        "1": ("hyp", hyp_id_list, hyp_d)
    }
    calc = WMD(rep_map, doc_dict, vocabulary_min=1)
    try:
        dist = calc.nearest_neighbors(
            str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
    except Exception:
        print(doc, text)
        return 0.0  # `dist` is undefined if the lookup failed
    sim = math.exp(-dist)  # switch to similarity
    return sim
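# Example call for the single-pair calc_smd above (a sketch: the spaCy model name is an
# assumption, and tokenize_texts / get_embeddings / get_weights come from the same module).
import spacy

nlp_model = spacy.load("en_core_web_md")  # any spaCy model that ships word vectors
reference = "the cat sat on the mat"
hypothesis = "a cat was sitting on a mat"
print(calc_smd(reference, hypothesis, nlp_model))  # similarity in (0, 1]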
# Build the nbow entry for one page (this fragment runs inside a loop over (title, doctext)
# pairs; `nlp`, `documents`, and `titles` are defined earlier in the script).
text = nlp(doctext)
tokens = [t for t in text if t.is_alpha and not t.is_stop]
words = Counter(t.text for t in tokens)
orths = {t.text: t.orth for t in tokens}
sorted_words = sorted(words)
documents[title] = (title, [orths[t] for t in sorted_words],
                    numpy.array([words[t] for t in sorted_words],
                                dtype=numpy.float32))


# Hook in WMD
class SpacyEmbeddings(object):
    def __getitem__(self, item):
        return nlp.vocab[item].vector


embeddings = SpacyEmbeddings()
vocabulary_min = 10
calc = WMD(embeddings, documents, vocabulary_min=vocabulary_min)
print("calculating")
# Germany shall be closer to Spain than to Google
neighbors_of_germany = calc.nearest_neighbors(titles[0])
for title, relevance in neighbors_of_germany:
    print("%24s\t%s" % (title, relevance))
class SimilarRepositories:
    GITHUB_URL_RE = re.compile(
        r"(https://|ssh://git@|git://)(github.com/[^/]+/[^/]+)(|.git|/)")
    _log = logging.getLogger("SimilarRepositories")

    def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
                 wmd_cache_centroids=True, wmd_kwargs: Dict[str, Any] = None,
                 languages: Tuple[List, bool] = (None, False),
                 engine_kwargs: Dict[str, Any] = None):
        backend = create_backend()
        if id2vec is None:
            self._id2vec = Id2Vec().load(backend=backend)
        else:
            assert isinstance(id2vec, Id2Vec)
            self._id2vec = id2vec
        self._log.info("Loaded id2vec model: %s", self._id2vec)
        if df is None:
            if df is not False:
                self._df = DocumentFrequencies().load(backend=backend)
            else:
                self._df = None
                self._log.warning("Disabled document frequencies - you will "
                                  "not be able to query custom repositories.")
        else:
            assert isinstance(df, DocumentFrequencies)
            self._df = df
        if self._df is not None:
            self._df = self._df.prune(prune_df_threshold)
        self._log.info("Loaded document frequencies: %s", self._df)
        if nbow is None:
            self._bow = BOW().load(backend=backend)
        else:
            assert isinstance(nbow, BOW)
            self._bow = nbow
        self._log.info("Loaded BOW model: %s", self._bow)
        assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
        if len(self._id2vec) != self._bow.matrix.shape[1]:
            raise ValueError(
                "Models do not match: id2vec has %s tokens while nbow has %s" %
                (len(self._id2vec), self._bow.matrix.shape[1]))
        self._log.info("Creating the WMD engine...")
        self._wmd = WMD(self._id2vec.embeddings, self._bow, **(wmd_kwargs or {}))
        if wmd_cache_centroids:
            self._wmd.cache_centroids()
        self._languages = languages
        self._engine_kwargs = engine_kwargs

    def query(self, url_or_path_or_name: str, **kwargs) -> List[Tuple[str, float]]:
        try:
            repo_index = self._bow.documents.index(url_or_path_or_name)
        except ValueError:
            repo_index = -1
        if repo_index == -1:
            match = self.GITHUB_URL_RE.match(url_or_path_or_name)
            if match is not None:
                name = match.group(2)
                try:
                    repo_index = self._bow.documents.index(name)
                except ValueError:
                    pass
        if repo_index >= 0:
            neighbours = self._query_domestic(repo_index, **kwargs)
        else:
            neighbours = self._query_foreign(url_or_path_or_name, **kwargs)
        neighbours = [(self._bow[n[0]][0], n[1]) for n in neighbours]
        return neighbours

    def _query_domestic(self, repo_index, **kwargs):
        return self._wmd.nearest_neighbors(repo_index, **kwargs)

    def _query_foreign(self, url_or_path: str, **kwargs):
        df = self._df
        if df is None:
            raise ValueError("Cannot query custom repositories if the "
                             "document frequencies are disabled.")
        with tempfile.TemporaryDirectory(prefix="vecino-") as tempdir:
            target = os.path.join(tempdir, "repo")
            if os.path.isdir(url_or_path):
                url_or_path = os.path.abspath(url_or_path)
                os.symlink(url_or_path, target, target_is_directory=True)
                repo_format = "standard"
            else:
                self._log.info("Cloning %s to %s", url_or_path, target)
                porcelain.clone(url_or_path, target, bare=True, outstream=sys.stderr)
                repo_format = "bare"
            bow = repo2bow(tempdir, repo_format, 1, df, *self._languages,
                           engine_kwargs=self._engine_kwargs)
            ibow = {}
            for key, val in bow.items():
                try:
                    ibow[self._id2vec[key]] = val
                except KeyError:
                    continue
            words, weights = zip(*sorted(ibow.items()))
            return self._wmd.nearest_neighbors((words, weights), **kwargs)
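# A hedged usage sketch for the SimilarRepositories engine above: the models are loaded by
# the default backend, and the repository URL is only an example.
engine = SimilarRepositories()
for repo_name, relevance in engine.query("https://github.com/src-d/wmd-relax"):
    print(repo_name, relevance)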
class SimilarRepositories:
    GITHUB_URL_RE = re.compile(
        r"(https://|ssh://git@|git://)(github.com/[^/]+/[^/]+)(|.git|/)")

    def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
                 verbosity=logging.DEBUG, wmd_cache_centroids=True,
                 wmd_kwargs=None, gcs_bucket=None, repo2nbow_kwargs=None,
                 initialize_environment=True):
        if initialize_environment:
            initialize()
        self._log = logging.getLogger("similar_repos")
        self._log.setLevel(verbosity)
        if gcs_bucket:
            backend = create_backend(args="bucket=" + gcs_bucket)
        else:
            backend = create_backend()
        if id2vec is None:
            self._id2vec = Id2Vec(log_level=verbosity).load(backend=backend)
        else:
            assert isinstance(id2vec, Id2Vec)
            self._id2vec = id2vec
        self._log.info("Loaded id2vec model: %s", self._id2vec)
        if df is None:
            if df is not False:
                self._df = DocumentFrequencies(log_level=verbosity).load(
                    backend=backend)
            else:
                self._df = None
                self._log.warning("Disabled document frequencies - you will "
                                  "not be able to query custom repositories.")
        else:
            assert isinstance(df, DocumentFrequencies)
            self._df = df
        if self._df is not None:
            self._df = self._df.prune(prune_df_threshold)
        self._log.info("Loaded document frequencies: %s", self._df)
        if nbow is None:
            self._nbow = NBOW(log_level=verbosity).load(backend=backend)
        else:
            assert isinstance(nbow, NBOW)
            self._nbow = nbow
        self._log.info("Loaded nBOW model: %s", self._nbow)
        self._repo2nbow = Repo2nBOW(
            self._id2vec, self._df, log_level=verbosity,
            **(repo2nbow_kwargs or {}))
        assert self._nbow.get_dependency(
            "id2vec")["uuid"] == self._id2vec.meta["uuid"]
        if len(self._id2vec) != self._nbow.matrix.shape[1]:
            raise ValueError(
                "Models do not match: id2vec has %s tokens while nbow has %s" %
                (len(self._id2vec), self._nbow.matrix.shape[1]))
        self._log.info("Creating the WMD engine...")
        self._wmd = WMD(self._id2vec.embeddings, self._nbow,
                        verbosity=verbosity, **(wmd_kwargs or {}))
        if wmd_cache_centroids:
            self._wmd.cache_centroids()

    def query(self, url_or_path_or_name, **kwargs):
        try:
            repo_index = self._nbow.repository_index_by_name(
                url_or_path_or_name)
        except KeyError:
            repo_index = -1
        if repo_index == -1:
            match = self.GITHUB_URL_RE.match(url_or_path_or_name)
            if match is not None:
                name = match.group(2)
                try:
                    repo_index = self._nbow.repository_index_by_name(name)
                except KeyError:
                    pass
        if repo_index >= 0:
            neighbours = self._query_domestic(repo_index, **kwargs)
        else:
            neighbours = self._query_foreign(url_or_path_or_name, **kwargs)
        neighbours = [(self._nbow[n[0]][0], n[1]) for n in neighbours]
        return neighbours

    @staticmethod
    def unicorn_query(repo_name, id2vec=None, nbow=None,
                      wmd_kwargs=None, query_wmd_kwargs=None):
        sr = SimilarRepositories(
            id2vec=id2vec, df=False, nbow=nbow,
            wmd_kwargs=wmd_kwargs or {"vocabulary_min": 50,
                                      "vocabulary_max": 500})
        return sr.query(
            repo_name,
            **(query_wmd_kwargs or {"early_stop": 0.1, "max_time": 180,
                                    "skipped_stop": 0.95}))

    def _query_domestic(self, repo_index, **kwargs):
        return self._wmd.nearest_neighbors(repo_index, **kwargs)

    def _query_foreign(self, url_or_path, **kwargs):
        if self._df is None:
            raise ValueError("Cannot query custom repositories if the "
                             "document frequencies are disabled.")
        nbow_dict = self._repo2nbow.convert_repository(url_or_path)
        words = sorted(nbow_dict.keys())
        weights = [nbow_dict[k] for k in words]
        return self._wmd.nearest_neighbors((words, weights), **kwargs)
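# The Wikipedia demo below starts mid-script; a plausible preamble, assumed from the names
# it references (sys, requests, Counter, numpy, nlp, WMD), would be:
import sys
from collections import Counter

import numpy
import requests
import spacy
from wmd import WMD

nlp = spacy.load("en_core_web_md")  # a spaCy model with word vectors (model name is an assumption)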
# List of page names we will fetch from Wikipedia and query for similarity
titles = sys.argv[1:] or ["Germany", "Spain", "Google"]
documents = {}
for title in titles:
    print("fetching", title)
    pages = requests.get(
        "https://en.wikipedia.org/w/api.php?action=query&format=json&titles=%s"
        "&prop=extracts&explaintext" % title).json()["query"]["pages"]
    print("parsing", title)
    text = nlp(next(iter(pages.values()))["extract"])
    tokens = [t for t in text if t.is_alpha and not t.is_stop]
    words = Counter(t.text for t in tokens)
    orths = {t.text: t.orth for t in tokens}
    sorted_words = sorted(words)
    documents[title] = (title, [orths[t] for t in sorted_words],
                        numpy.array([words[t] for t in sorted_words],
                                    dtype=numpy.float32))


# Hook in WMD
class SpacyEmbeddings(object):
    def __getitem__(self, item):
        return nlp.vocab[item].vector


calc = WMD(SpacyEmbeddings(), documents)
print("calculating")
# Germany shall be closer to Spain than to Google
for title, relevance in calc.nearest_neighbors(titles[0]):
    print("%24s\t%s" % (title, relevance))
# pip3 install wmd
# https://github.com/src-d/wmd-relax
# Paper: http://www.cs.cornell.edu/~kilian/papers/wmd_metric.pdf
import time
import numpy
from wmd import WMD
import pickle

embeddings = numpy.array([[0.1, 1], [1, 0.1], [0.8, 0.7]], dtype=numpy.float32)
nbow = {
    # key: (label, embedding row indices, weights)
    "first": ("#1", [0, 1, 2], numpy.array([1.5, 0.3, 0.5], dtype=numpy.float32)),
    "你好": ("#3", [1, 2], numpy.array([1.3, 0.5], dtype=numpy.float32)),
    "second": ("#2", [0, 1], numpy.array([0.75, 0.15], dtype=numpy.float32)),
}
calc = WMD(embeddings, nbow, vocabulary_min=2)
origin = "first"
print(calc.nearest_neighbors(origin))

model_file = '/home/gswewf/yhb/model/wx_vector_char.pkl'
with open(model_file, "rb") as f:
    w2v_model = pickle.load(f, encoding='iso-8859-1')  # uses about 60.8 MiB of memory here

words_list = []
w_emb = []
for word, emb in w2v_model.items():
    words_list.append(word)
    w_emb.append(emb)

from jieba.analyse.tfidf import TFIDF
tf_idf = TFIDF()
# tf_idf.idf_freq.get('我')
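# One way to continue from here (a sketch; the document contents are made up and using
# `median_idf` as the fallback IDF is an assumption): stack the pickled vectors into the
# embeddings matrix and weight each document's words by their IDF from jieba.
w_emb = numpy.array(w_emb, dtype=numpy.float32)
word2id = {w: i for i, w in enumerate(words_list)}


def to_nbow(name, words):
    ids = sorted({word2id[w] for w in words if w in word2id})
    weights = numpy.array(
        [tf_idf.idf_freq.get(words_list[i], tf_idf.median_idf) for i in ids],
        dtype=numpy.float32)
    return (name, ids, weights)


docs = {"doc1": to_nbow("doc1", ["你好", "世界"]),
        "doc2": to_nbow("doc2", ["今天", "天气", "你好"])}
calc2 = WMD(w_emb, docs, vocabulary_min=1)
print(calc2.nearest_neighbors("doc1"))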