def compute_q(self, f_df, q_df, return_f_nbow=False):
    logger.info('Computing question wmds')
    f_nbow = {
        row.Index: self.nbowify(row.Index, row.original)
        for row in f_df.itertuples()
    }
    nb_facts = len(f_nbow)
    q_nbow = {
        row.Index + nb_facts: self.nbowify(row.Index + nb_facts, row.original)
        for row in q_df.itertuples()
    }
    merged_fnbow = copy.copy(f_nbow)
    merged_fnbow.update(q_nbow)
    q_calc = WMD(SpacyEmbeddings(self.nlp), merged_fnbow,
                 vocabulary_min=1, verbosity=logging.WARNING)
    q_calc.cache_centroids()
    q_closest = pd.Series(
        np.array([
            i for i, _ in q_calc.nearest_neighbors(
                idx, k=self.config.nearest_k_visible)
            if i < nb_facts
        ])
        for idx in tqdm(q_nbow.keys(), desc='Question wmd...'))
    return (q_closest, f_nbow) if return_f_nbow else q_closest

def calc_smd(input_f, output_f="", WORD_REP='elmo', METRIC='sms'):
    if WORD_REP == "elmo":
        MODEL = ElmoEmbedder()
    inF = open(input_f, 'r')
    inLines = inF.readlines()
    inF.close()
    # print("Found", len(inLines), "documents")
    token_doc_list, text_doc_list = tokenize_texts(inLines, WORD_REP,
                                                   tokenize=True)
    count = 0
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        text = text_doc_list[doc_id]
        # transform doc to ID list, both words and/or sentences.
        # get ID dict that maps to emb
        [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text, WORD_REP,
                                                     MODEL, METRIC)
        # get D values
        [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights(
            [ref_ids, hyp_ids], METRIC)
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {"0": ("ref", ref_id_list, ref_d),
                    "1": ("hyp", hyp_id_list, hyp_d)}
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            # how far is hyp from ref?
            dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]
        except Exception as e:
            print(e)
            continue  # skip documents whose distance cannot be computed
        sim = math.exp(-dist)  # switch to similarity
        results_list.append(sim)
        if doc_id == int((len(token_doc_list) / 10.) * count):
            print(str(count * 10) + "% done with calculations")
            count += 1
    if output_f != "":
        print_score(inLines, output_f, results_list)
    else:
        print("Results: ", np.mean(results_list))
    return 'Done!'

def calculate_similarity(candidate, next_id, emb):
    s = time.time()
    can_doc = calculator.nlp(candidate[essay_field])
    similarities = []
    next_id, emb, can_id_list, can_weights = calculator.get_embeddings_ids_weights(
        can_doc, next_id, emb, method)
    nbow = {"hypothesis": ("hypothesis", can_id_list, can_weights)}
    for id, item in processed_refs.items():
        ref_weights = item["weights"]
        ref_id_list = item["id_list"]
        nbow[id] = (id, ref_id_list, ref_weights)
    calc = WMD(emb, nbow, vocabulary_min=1)
    # print("NBOW")
    # print(nbow)
    distances = calc.nearest_neighbors("hypothesis", k=len(processed_refs),
                                       early_stop=1)
    for id, dist in distances:
        similarity = np.exp(-dist)
        similarities.append({
            "candidate_id": candidate[id_field],
            "reference_id": id,
            "similarity": similarity,
            "dist": dist,
            "score": candidate[score_field]
        })
    print("Time taken for candidate " + str(candidate[id_field]) + " is " +
          str(time.time() - s))
    return similarities

def word_mover_distance(word_embedding_dict_source, word_embedding_dict_target):
    """
    Calculate the Word Mover's Distance between two sequences of word
    embedding vectors (one row per token).
    """
    try:
        source = np.array(word_embedding_dict_source, dtype=np.float32)
        target = np.array(word_embedding_dict_target, np.float32)
        embeddings = np.concatenate((source, target))
        source_len = source.shape[0]
        target_len = target.shape[0]
        source_words = np.array([i for i in range(source_len)], dtype=np.int32)
        target_words = np.array(
            [source_len + i for i in range(target_len)], dtype=np.int32)
        source_weights = np.array([1 for i in range(source_len)], dtype=np.int32)
        target_weights = np.array([1 for i in range(target_len)], dtype=np.int32)
        nbow = {
            "source": ("source", source_words, source_weights),
            "target": ("target", target_words, target_weights)
        }
        calc = WMD(embeddings, nbow, vocabulary_min=2)
        return calc.nearest_neighbors("source", 1)[0][1]
    except (ValueError, IndexError):
        return 0

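# A minimal usage sketch for word_mover_distance above (not from the original
# snippet): the toy 2-D embedding matrices are made-up values chosen only to
# show the expected input shape, one row per token.
import numpy as np

source_vectors = np.array([[0.1, 1.0], [1.0, 0.1]], dtype=np.float32)
target_vectors = np.array([[0.8, 0.7], [0.2, 0.9], [0.5, 0.5]], dtype=np.float32)
# Expected to print the distance between the two token sets
# (the function returns 0 on ValueError/IndexError).
print(word_mover_distance(source_vectors, target_vectors))
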
def get_similarity_dist(self, candidate, reference, method):
    emb, nbow = self.get_emb_nbow(candidate, reference, method)
    # print("emb:", emb.keys())
    # print("nbow:", nbow)
    calc = WMD(emb, nbow, vocabulary_min=1)
    dist = calc.nearest_neighbors("reference", k=1, early_stop=1)
    # print("Dist:", dist)
    dist = dist[0][1]
    similarity = np.exp(-dist)
    return similarity, dist

def get_sim(doc, text, wordrep, model, metric):
    [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text, wordrep, model,
                                                 metric)
    [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights(
        [ref_ids, hyp_ids], metric)
    # format doc as expected: {id: (id, ref_id_list, ref_d)}
    doc_dict = {"0": ("ref", ref_id_list, ref_d),
                "1": ("hyp", hyp_id_list, hyp_d)}
    calc = WMD(rep_map, doc_dict, vocabulary_min=1)
    try:
        # how far is hyp from ref?
        dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]
    except Exception:
        return 0.0
    sim = math.exp(-dist)  # switch to similarity
    return sim

def __init__(self, id2vec=None, df=None, nbow=None, verbosity=logging.DEBUG,
             wmd_cache_centroids=True, wmd_kwargs=None, gcs_bucket=None,
             repo2nbow_kwargs=None, initialize_environment=True):
    if initialize_environment:
        initialize()
    self._log = logging.getLogger("similar_repos")
    self._log.setLevel(verbosity)
    if gcs_bucket:
        backend = create_backend(args="bucket=" + gcs_bucket)
    else:
        backend = create_backend()
    if id2vec is None:
        self._id2vec = Id2Vec(log_level=verbosity, backend=backend)
    else:
        assert isinstance(id2vec, Id2Vec)
        self._id2vec = id2vec
    self._log.info("Loaded id2vec model: %s", self._id2vec)
    if df is None:
        if df is not False:
            self._df = DocumentFrequencies(log_level=verbosity, backend=backend)
        else:
            self._df = None
            self._log.warning("Disabled document frequencies - you will "
                              "not be able to query custom repositories.")
    else:
        assert isinstance(df, DocumentFrequencies)
        self._df = df
    self._log.info("Loaded document frequencies: %s", self._df)
    if nbow is None:
        self._nbow = NBOW(log_level=verbosity, backend=backend)
    else:
        assert isinstance(nbow, NBOW)
        self._nbow = nbow
    self._log.info("Loaded nBOW model: %s", self._nbow)
    self._repo2nbow = Repo2nBOW(self._id2vec, self._df, log_level=verbosity,
                                **(repo2nbow_kwargs or {}))
    self._log.info("Creating the WMD engine...")
    self._wmd = WMD(self._id2vec.embeddings, self._nbow, verbosity=verbosity,
                    **(wmd_kwargs or {}))
    if wmd_cache_centroids:
        self._wmd.cache_centroids()

def calc_smd(inLines, model):
    global nlp
    nlp = model
    # print("Found", len(inLines), "documents")
    # TODO: rewrite this
    token_doc_list, text_doc_list = tokenize_texts(inLines)
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]  # TODO: rewrite this
        text = text_doc_list[doc_id]
        # transform doc to ID list, both words and/or sentences.
        # get ID dict that maps to emb
        # TODO: rewrite this
        try:
            [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
        except ValueError:
            print(inLines[doc_id])
            print('ValueError: max() arg is an empty sequence; get_embeddings')
            continue
        # get D values
        [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights(
            [ref_ids, hyp_ids])
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {
            "0": ("ref", ref_id_list, ref_d),
            "1": ("hyp", hyp_id_list, hyp_d)
        }
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            # how far is hyp from ref?
            dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]
            sim = math.exp(-dist)  # switch to similarity
        except IndexError:
            print('dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]')
            print('IndexError: list index out of range')
            print(inLines[doc_id])
            continue
        except UnboundLocalError:
            print('dist could not be calculated')
            print(inLines[doc_id])
            continue
        except ValueError:
            print('Too little vocabulary')
            print(inLines[doc_id])
            continue
        results_list.append((inLines[doc_id], sim))
    return score_list(results_list)

def main(args):
    # location of input-output
    data_dir = args.input
    train_loc = os.path.join(data_dir, "train.csv")
    test_loc = os.path.join(data_dir, "test.csv")
    train = pd.read_csv(train_loc)
    test = pd.read_csv(test_loc)
    nlp = spacy.load("en_core_web_lg")

    def extract_bow(data, text_col, id_col, uniq_tokens=None):
        documents = {}
        sent = {}
        if uniq_tokens is None:
            uniq_tokens = {}
        for i, line in tqdm(data.iterrows(), total=data.shape[0]):
            # TODO: remove after debugging
            sent[line[id_col]] = line[text_col]
            if i == 1000:  # TODO: remove after experiments
                break
            text = nlp(line[text_col])
            tokens = [t for t in text if t.is_alpha and not t.is_stop]
            orths = {t.text: t.orth for t in tokens}
            words = Counter(t.text for t in tokens if t.text in nlp.vocab)
            sorted_words = sorted(words)
            documents[line[id_col]] = (line[id_col],
                                       [orths[t] for t in sorted_words],
                                       np.array([words[t] for t in sorted_words],
                                                dtype=np.float32))
        return documents, uniq_tokens, sent

    tid1_nlp, uniq_tokens, tid1_sent = extract_bow(train, text_col="title1_en",
                                                   id_col="tid1")
    tid2_nlp, uniq_tokens, tid2_sent = extract_bow(train, text_col="title2_en",
                                                   id_col="tid2",
                                                   uniq_tokens=uniq_tokens)

    class SpacyEmbeddings(object):
        def __getitem__(self, item):
            return nlp.vocab[item].vector

    from wmd import TailVocabularyOptimizer
    tid1_calc = WMD(SpacyEmbeddings(), tid1_nlp, vocabulary_min=10,
                    vocabulary_optimizer=TailVocabularyOptimizer(1.))
    tid2_calc = WMD(SpacyEmbeddings(), tid2_nlp, vocabulary_min=3)

def get_similar_bugs(self, query):
    # keep a reference to the original bug dict; `query` is reused below for
    # the preprocessed token list and then the tf-idf weighted document
    bug = query
    query = self.text_preprocess(self.get_text(query))
    words = [
        word for word in set(chain(query, *self.corpus))
        if word in self.w2vmodel.wv
    ]
    indices, words = zip(*sorted(((index, word) for (
        index, _), word in zip(self.dictionary.doc2bow(words), words))))
    query = dict(self.tfidf[self.dictionary.doc2bow(query)])
    query = [(new_index, query[dict_index])
             for new_index, dict_index in enumerate(indices)
             if dict_index in query]
    documents = [
        dict(self.tfidf[self.dictionary.doc2bow(document)])
        for document in self.corpus
    ]
    documents = [[(new_index, document[dict_index])
                  for new_index, dict_index in enumerate(indices)
                  if dict_index in document] for document in documents]
    embeddings = np.array([self.w2vmodel.wv[word] for word in words],
                          dtype=np.float32)
    nbow = dict(((index, list(chain([None], zip(*document))))
                 for index, document in enumerate(documents)
                 if document != []))
    nbow["query"] = tuple([None] + list(zip(*query)))
    distances = WMD(embeddings, nbow,
                    vocabulary_min=1).nearest_neighbors("query")
    return [
        self.bug_ids[distance[0]] for distance in distances
        if self.bug_ids[distance[0]] != bug["id"]
    ]

def get_distance(self, query1, query2):
    query1 = self.text_preprocess(self.get_text(query1))
    query2 = self.text_preprocess(self.get_text(query2))
    words = [
        word for word in set(chain(query1, query2, *self.corpus))
        if word in self.w2vmodel.wv
    ]
    indices, words = zip(*sorted(((index, word) for (
        index, _), word in zip(self.dictionary.doc2bow(words), words))))
    query1 = dict(self.tfidf[self.dictionary.doc2bow(query1)])
    query2 = dict(self.tfidf[self.dictionary.doc2bow(query2)])
    query1 = [(new_index, query1[dict_index])
              for new_index, dict_index in enumerate(indices)
              if dict_index in query1]
    query2 = [(new_index, query2[dict_index])
              for new_index, dict_index in enumerate(indices)
              if dict_index in query2]
    embeddings = np.array([self.w2vmodel.wv[word] for word in words],
                          dtype=np.float32)
    nbow = {}
    nbow["query1"] = tuple([None] + list(zip(*query1)))
    nbow["query2"] = tuple([None] + list(zip(*query2)))
    distances = WMD(embeddings, nbow,
                    vocabulary_min=1).nearest_neighbors("query1")
    return distances[0][1]

def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
             wmd_cache_centroids=True, wmd_kwargs: Dict[str, Any] = None,
             languages: Tuple[List, bool] = (None, False),
             engine_kwargs: Dict[str, Any] = None):
    backend = create_backend()
    if id2vec is None:
        self._id2vec = Id2Vec().load(backend=backend)
    else:
        assert isinstance(id2vec, Id2Vec)
        self._id2vec = id2vec
    self._log.info("Loaded id2vec model: %s", self._id2vec)
    if df is None:
        if df is not False:
            self._df = DocumentFrequencies().load(backend=backend)
        else:
            self._df = None
            self._log.warning("Disabled document frequencies - you will "
                              "not be able to query custom repositories.")
    else:
        assert isinstance(df, DocumentFrequencies)
        self._df = df
    if self._df is not None:
        self._df = self._df.prune(prune_df_threshold)
    self._log.info("Loaded document frequencies: %s", self._df)
    if nbow is None:
        self._bow = BOW().load(backend=backend)
    else:
        assert isinstance(nbow, BOW)
        self._bow = nbow
    self._log.info("Loaded BOW model: %s", self._bow)
    assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
    if len(self._id2vec) != self._bow.matrix.shape[1]:
        raise ValueError(
            "Models do not match: id2vec has %s tokens while nbow has %s" %
            (len(self._id2vec), self._bow.matrix.shape[1]))
    self._log.info("Creating the WMD engine...")
    self._wmd = WMD(self._id2vec.embeddings, self._bow, **(wmd_kwargs or {}))
    if wmd_cache_centroids:
        self._wmd.cache_centroids()
    self._languages = languages
    self._engine_kwargs = engine_kwargs

def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
             verbosity=logging.DEBUG, wmd_cache_centroids=True,
             wmd_kwargs=None, gcs_bucket=None, repo2nbow_kwargs=None,
             initialize_environment=True):
    if initialize_environment:
        initialize()
    self._log = logging.getLogger("similar_repos")
    self._log.setLevel(verbosity)
    if gcs_bucket:
        backend = create_backend(args="bucket=" + gcs_bucket)
    else:
        backend = create_backend()
    if id2vec is None:
        self._id2vec = Id2Vec(log_level=verbosity).load(backend=backend)
    else:
        assert isinstance(id2vec, Id2Vec)
        self._id2vec = id2vec
    self._log.info("Loaded id2vec model: %s", self._id2vec)
    if df is None:
        if df is not False:
            self._df = DocumentFrequencies(log_level=verbosity).load(
                backend=backend)
        else:
            self._df = None
            self._log.warning("Disabled document frequencies - you will "
                              "not be able to query custom repositories.")
    else:
        assert isinstance(df, DocumentFrequencies)
        self._df = df
    if self._df is not None:
        self._df = self._df.prune(prune_df_threshold)
    self._log.info("Loaded document frequencies: %s", self._df)
    if nbow is None:
        self._nbow = NBOW(log_level=verbosity).load(backend=backend)
    else:
        assert isinstance(nbow, NBOW)
        self._nbow = nbow
    self._log.info("Loaded nBOW model: %s", self._nbow)
    self._repo2nbow = Repo2nBOW(self._id2vec, self._df, log_level=verbosity,
                                **(repo2nbow_kwargs or {}))
    assert self._nbow.dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
    if len(self._id2vec) != self._nbow.matrix.shape[1]:
        raise ValueError(
            "Models do not match: id2vec has %s tokens while nbow has %s" %
            (len(self._id2vec), self._nbow.matrix.shape[1]))
    self._log.info("Creating the WMD engine...")
    self._wmd = WMD(self._id2vec.embeddings, self._nbow, verbosity=verbosity,
                    **(wmd_kwargs or {}))
    if wmd_cache_centroids:
        self._wmd.cache_centroids()

def compute_f(self, f_df, f_nbow=None):
    logger.info('Computing fact wmds')
    f_nbow = {
        row.Index: self.nbowify(row.Index, row.original)
        for row in f_df.itertuples()
    } if f_nbow is None else f_nbow
    f_calc = WMD(SpacyEmbeddings(self.nlp), f_nbow, vocabulary_min=1,
                 verbosity=logging.WARNING)
    f_calc.cache_centroids()
    f_closest = pd.Series(
        np.array([
            i for i, _ in f_calc.nearest_neighbors(
                idx, k=self.config.nearest_k_visible)
        ])
        for idx in tqdm(f_nbow.keys(), desc='Fact wmd...'))
    return f_closest

def fit_wme_model(self, d_max=6, r=1024):
    self._r = r
    possible_words = list(self.word_mapping)
    nbow = {}
    for i in range(r):
        d = random.sample(range(1, d_max + 1), 1)[0]
        random_doc = random.sample(possible_words, d)
        doc_embeddings = [self.word_mapping[word] for word in random_doc]
        document, idf_ids = zip(*[(word.glove_id, word.idf_id)
                                  for word in doc_embeddings])
        words = np.array(document, dtype=np.uint32)
        idf_weights = np.array(
            [self.tf_idf_model.idf_[idf_id] for idf_id in idf_ids],
            dtype=np.float32)
        weights = idf_weights
        doc_id = '#' + str(i + 1)
        nbow[doc_id] = (doc_id, words, weights)
    self.wmd = WMD(embeddings=self.glove_model.word_vectors.astype(np.float32),
                   nbow=nbow, vocabulary_min=1)

def retrieve(self, top_id: str, k=None, only=None):
    assert only, 'not searching anything'
    index = self.db.mapping
    delta = common.timer()

    def to_nbow(doc_id):
        # transform to the nbow model used by wmd.WMD:
        # ('human readable name', 'item identifiers', 'weights')
        doc = index[doc_id]
        return (doc_id, doc.idx, doc.freq)

    docs = {d: to_nbow(d) for d in only + [top_id]}
    calc = WMDR(self.emb, docs, vocabulary_min=2)
    calc.cache_centroids()
    nn = calc.nearest_neighbors(top_id, k=k)
    self._times.append(delta())
    assert len(nn) == k, f'{len(nn)} not {k}'
    return [Result(*n) for n in nn]

def calc_smd(opts, output_f=""):
    inF = open(opts.input_file, 'r')
    inLines = inF.readlines()
    inF.close()
    print("Found", len(inLines), "documents")
    token_doc_list, text_doc_list = tokenize_texts(inLines)
    count = 0
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        text = text_doc_list[doc_id]
        # transform doc to ID list, both words and/or sentences.
        # get ID dict that maps to emb
        [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
        # get D values
        [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights(
            [ref_ids, hyp_ids])
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {
            "0": ("ref", ref_id_list, ref_d),
            "1": ("hyp", hyp_id_list, hyp_d)
        }
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            # how far is hyp from ref?
            dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]
        except Exception:
            print(doc, text)
            continue  # skip documents whose distance cannot be computed
        sim = math.exp(-dist)  # switch to similarity
        results_list.append(sim)
        if doc_id == int((len(token_doc_list) / 10.) * count):
            print(str(count * 10) + "% done with calculations")
            count += 1
    # added by wchen to compute correlation scores with human annotated scores
    hscoreF = open(opts.score_file, 'r')
    hscoreLines = hscoreF.readlines()
    hscoreF.close()
    compute_corrs(opts, results_list, hscoreLines)

def calc_smd(ref, hyp, model):
    global nlp
    nlp = model
    doc, text = tokenize_texts([ref, hyp])
    count = 0
    results_list = []
    # transform doc to ID list, both words and/or sentences.
    # get ID dict that maps to emb
    [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
    # get D values
    [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids])
    # format doc as expected: {id: (id, ref_id_list, ref_d)}
    doc_dict = {
        "0": ("ref", ref_id_list, ref_d),
        "1": ("hyp", hyp_id_list, hyp_d)
    }
    calc = WMD(rep_map, doc_dict, vocabulary_min=1)
    try:
        # how far is hyp from ref?
        dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]
    except Exception:
        print(doc, text)
        return 0.0  # fall back to zero similarity, as get_sim does above
    sim = math.exp(-dist)  # switch to similarity
    return sim

def compute_sentence_similarity():
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WMD.SpacySimilarityHook(nlp), last=True)
    all_score = []
    for i in range(len(all_summary)):
        if len(all_summary[i]) == 1:
            all_score.append([1.0])
            continue
        score = []
        for j in range(1, len(all_summary[i])):
            doc1 = nlp(all_summary[i][j - 1])
            doc2 = nlp(all_summary[i][j])
            try:
                score.append(1.0 / (1.0 + math.exp(-doc1.similarity(doc2) + 7)))
            except Exception:
                score.append(1.0)
        all_score.append(score)
    return all_score

def SimilarityHook(doc):
    return WMD.SpacySimilarityHook(doc)

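# A minimal sketch (not part of the snippet above) of how WMD.SpacySimilarityHook
# is registered on a spaCy pipeline elsewhere in this collection; the model name
# and the two sentences are illustrative only.
import spacy
from wmd import WMD

nlp = spacy.load('en_core_web_md')
nlp.add_pipe(WMD.SpacySimilarityHook(nlp), last=True)
doc1 = nlp("Politician speaks to the media in Illinois.")
doc2 = nlp("The president greets the press in Chicago.")
# With the hook installed, Doc.similarity returns the Word Mover's Distance.
print(doc1.similarity(doc2))
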
class Preprocessor:
    """
    Class Preprocessor implements all necessary operations to prepare raw
    text input for modeling.
    """
    WordLemma = namedtuple(
        'WordLemma', ['start_char', 'end_char', 'text', 'label_']
    )  # proxy for a class representing a text span with a label
    WordEmbedding = namedtuple(
        'WordEmbedding', ['idf_id', 'glove_id'])  # necessary for determining
    # correct weights for embedding vectors

    def __init__(self, glove_components=300, min_df=5, max_df=0.4):
        self.glove_model = Glove(no_components=glove_components)
        self.tf_idf_model = TfidfVectorizer(min_df=min_df, max_df=max_df,
                                            token_pattern=r'[^\s]+',
                                            lowercase=False)
        self.word_mapping = None
        self.embedding_dim = glove_components
        self.wmd = None
        self._r = None

    def preprocess(self, text: str) -> str:
        raise NotImplementedError

    def sentence_tokenizer(self, text: str) -> List[str]:
        raise NotImplementedError

    def fit_glove(self, sentences, window, epochs):
        corpus = Corpus()
        corpus.fit(sentences, window=window)
        self.glove_model.fit(corpus.matrix, epochs=epochs, no_threads=8)
        self.glove_model.add_dictionary(corpus.dictionary)

    def fit_tf_idf(self, articles):
        self.tf_idf_model.fit(articles)

    def fit(self, inputs, return_clean=True, clean=True, window=10, epochs=100):
        if clean:
            print('Cleaning {n_inputs} inputs...'.format(n_inputs=len(inputs)),
                  end='')
            clean_inputs = [self.preprocess(input) for input in inputs]
            print('Done!')
        else:
            clean_inputs = inputs[:]
        print('Training Tf-idf model...', end='')
        self.fit_tf_idf(clean_inputs)
        print('Done!')
        sentences_per_input = [
            self.sentence_tokenizer(input) for input in clean_inputs
        ]
        sentences = itertools.chain.from_iterable(sentences_per_input)
        tokenized_sentences = [sentence.split() for sentence in sentences]
        print('Training Glove model...', end='')
        self.fit_glove(tokenized_sentences, window=window, epochs=epochs)
        print('Done!')
        valid_words = set.intersection(
            set(self.glove_model.dictionary.keys()),
            set(self.tf_idf_model.vocabulary_.keys()))
        self.word_mapping = {
            word: self.WordEmbedding(
                glove_id=self.glove_model.dictionary[word],
                idf_id=self.tf_idf_model.vocabulary_[word])
            for word in valid_words
        }
        if return_clean:
            return clean_inputs

    def article_to_input(self, article):
        tokens = article.split()
        word_embeddings = [
            self.word_mapping[token] for token in tokens
            if token in self.word_mapping
        ]
        weight_ids = [(we.glove_id, we.idf_id) for we in word_embeddings]
        glove_ids, idf_ids = zip(*weight_ids)
        words = np.array(glove_ids, dtype=np.uint32)
        weights = np.array(
            [self.tf_idf_model.idf_[idf_id] for idf_id in idf_ids],
            dtype=np.float32)
        return words, weights

    def _single_embed(self, article, embedding_function, preprocess):
        if preprocess:
            article = self.clean(article)
        try:
            words, weights = self.article_to_input(article)
        except ValueError:
            print('Empty embedding\n\n', article)
            return np.zeros(shape=(self.embedding_dim, ))
        return embedding_function(words, weights)

    def _embed(self, inputs, embedding_function, preprocess):
        if isinstance(inputs, list):
            return np.array([
                self._single_embed(input, embedding_function, preprocess)
                for input in inputs
            ])
        return self._single_embed(inputs, embedding_function, preprocess)

    def _idf_embedding(self, words, weights):
        word_vectors = np.array(
            [self.glove_model.word_vectors[glove_id] for glove_id in words])
        idf_weights = weights / np.sum(weights)
        return np.dot(idf_weights, word_vectors)

    def idf_embed(self, article, preprocess=False):
        return self._embed(article,
                           embedding_function=self._idf_embedding,
                           preprocess=preprocess)

    def fit_wme_model(self, d_max=6, r=1024):
        self._r = r
        possible_words = list(self.word_mapping)
        nbow = {}
        for i in range(r):
            d = random.sample(range(1, d_max + 1), 1)[0]
            random_doc = random.sample(possible_words, d)
            doc_embeddings = [self.word_mapping[word] for word in random_doc]
            document, idf_ids = zip(*[(word.glove_id, word.idf_id)
                                      for word in doc_embeddings])
            words = np.array(document, dtype=np.uint32)
            idf_weights = np.array(
                [self.tf_idf_model.idf_[idf_id] for idf_id in idf_ids],
                dtype=np.float32)
            weights = idf_weights
            doc_id = '#' + str(i + 1)
            nbow[doc_id] = (doc_id, words, weights)
        self.wmd = WMD(embeddings=self.glove_model.word_vectors.astype(np.float32),
                       nbow=nbow, vocabulary_min=1)

    def _wme_embedding(self, words, weights, gamma):
        distances = np.array([
            self.wmd._WMD_batch(words, weights, '#' + str(i + 1))
            for i in range(self._r)
        ])
        return 1 / math.sqrt(self._r) * np.exp(-gamma * distances)

    def wme_embed(self, article, preprocess=False, gamma=0.19):
        embedding_function = partial(self._wme_embedding, gamma=gamma)
        return self._embed(article,
                           embedding_function=embedding_function,
                           preprocess=preprocess)

class SimilarRepositories:
    GITHUB_URL_RE = re.compile(
        r"(https://|ssh://git@|git://)(github.com/[^/]+/[^/]+)(|.git|/)")
    _log = logging.getLogger("SimilarRepositories")

    def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
                 wmd_cache_centroids=True, wmd_kwargs: Dict[str, Any] = None,
                 languages: Tuple[List, bool] = (None, False),
                 engine_kwargs: Dict[str, Any] = None):
        backend = create_backend()
        if id2vec is None:
            self._id2vec = Id2Vec().load(backend=backend)
        else:
            assert isinstance(id2vec, Id2Vec)
            self._id2vec = id2vec
        self._log.info("Loaded id2vec model: %s", self._id2vec)
        if df is None:
            if df is not False:
                self._df = DocumentFrequencies().load(backend=backend)
            else:
                self._df = None
                self._log.warning("Disabled document frequencies - you will "
                                  "not be able to query custom repositories.")
        else:
            assert isinstance(df, DocumentFrequencies)
            self._df = df
        if self._df is not None:
            self._df = self._df.prune(prune_df_threshold)
        self._log.info("Loaded document frequencies: %s", self._df)
        if nbow is None:
            self._bow = BOW().load(backend=backend)
        else:
            assert isinstance(nbow, BOW)
            self._bow = nbow
        self._log.info("Loaded BOW model: %s", self._bow)
        assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
        if len(self._id2vec) != self._bow.matrix.shape[1]:
            raise ValueError(
                "Models do not match: id2vec has %s tokens while nbow has %s" %
                (len(self._id2vec), self._bow.matrix.shape[1]))
        self._log.info("Creating the WMD engine...")
        self._wmd = WMD(self._id2vec.embeddings, self._bow,
                        **(wmd_kwargs or {}))
        if wmd_cache_centroids:
            self._wmd.cache_centroids()
        self._languages = languages
        self._engine_kwargs = engine_kwargs

    def query(self, url_or_path_or_name: str,
              **kwargs) -> List[Tuple[str, float]]:
        try:
            repo_index = self._bow.documents.index(url_or_path_or_name)
        except ValueError:
            repo_index = -1
        if repo_index == -1:
            match = self.GITHUB_URL_RE.match(url_or_path_or_name)
            if match is not None:
                name = match.group(2)
                try:
                    repo_index = self._bow.documents.index(name)
                except ValueError:
                    pass
        if repo_index >= 0:
            neighbours = self._query_domestic(repo_index, **kwargs)
        else:
            neighbours = self._query_foreign(url_or_path_or_name, **kwargs)
        neighbours = [(self._bow[n[0]][0], n[1]) for n in neighbours]
        return neighbours

    def _query_domestic(self, repo_index, **kwargs):
        return self._wmd.nearest_neighbors(repo_index, **kwargs)

    def _query_foreign(self, url_or_path: str, **kwargs):
        df = self._df
        if df is None:
            raise ValueError("Cannot query custom repositories if the "
                             "document frequencies are disabled.")
        with tempfile.TemporaryDirectory(prefix="vecino-") as tempdir:
            target = os.path.join(tempdir, "repo")
            if os.path.isdir(url_or_path):
                url_or_path = os.path.abspath(url_or_path)
                os.symlink(url_or_path, target, target_is_directory=True)
                repo_format = "standard"
            else:
                self._log.info("Cloning %s to %s", url_or_path, target)
                porcelain.clone(url_or_path, target, bare=True,
                                outstream=sys.stderr)
                repo_format = "bare"
            bow = repo2bow(tempdir, repo_format, 1, df, *self._languages,
                           engine_kwargs=self._engine_kwargs)
            ibow = {}
            for key, val in bow.items():
                try:
                    ibow[self._id2vec[key]] = val
                except KeyError:
                    continue
            words, weights = zip(*sorted(ibow.items()))
            return self._wmd.nearest_neighbors((words, weights), **kwargs)

# coding: utf-8
# pip3 install wmd
# https://github.com/src-d/wmd-relax
# Paper: http://www.cs.cornell.edu/~kilian/papers/wmd_metric.pdf
import time
import numpy
from wmd import WMD
import pickle

embeddings = numpy.array([[0.1, 1], [1, 0.1], [0.8, 0.7]], dtype=numpy.float32)
nbow = {
    # key: (index, token ids, weights)
    "first": ("#1", [0, 1, 2], numpy.array([1.5, 0.3, 0.5], dtype=numpy.float32)),
    "你好": ("#3", [1, 2], numpy.array([1.3, 0.5], dtype=numpy.float32)),
    "second": ("#2", [0, 1], numpy.array([0.75, 0.15], dtype=numpy.float32))
}
calc = WMD(embeddings, nbow, vocabulary_min=2)
origin = "first"
print(calc.nearest_neighbors(origin))

model_file = '/home/gswewf/yhb/model/wx_vector_char.pkl'
with open(model_file, "rb") as f:
    w2v_model = pickle.load(f, encoding='iso-8859-1')  # uses about 60.8 MiB of memory here

words_list = []
w_emb = []
for word, emb in w2v_model.items():
    words_list.append(word)
    w_emb.append(emb)

from jieba.analyse.tfidf import TFIDF

import pandas as pd
import re
import glob
import sys
import spacy
from typing import List

sys.path.append("./BERT/pytorch-pretrained-BERT-master")
sys.path.append("./BERT")
from pytorch_pretrained_bert import BertTokenizer, BertModel
from wmd import WMD
from torch.nn.modules.distance import CosineSimilarity

torch_emb_sim = CosineSimilarity()

from bert_score import score as bert_score

nlp = spacy.load('en_core_web_md')
nlp.add_pipe(WMD.SpacySimilarityHook(nlp), last=True)


def _clean_text(txt):
    return txt.lower()


class CFRInstance(object):
    def __init__(
        self,
        original_context: str,
        cf_context: str,
        original_ending: str,
        predicted_ending: str,
        gold_cf_endings: List[str],
    ):

text = nlp(doctext)
tokens = [t for t in text if t.is_alpha and not t.is_stop]
words = Counter(t.text for t in tokens)
orths = {t.text: t.orth for t in tokens}
sorted_words = sorted(words)
documents[title] = (title, [orths[t] for t in sorted_words],
                    numpy.array([words[t] for t in sorted_words],
                                dtype=numpy.float32))


# Hook in WMD
class SpacyEmbeddings(object):
    def __getitem__(self, item):
        return nlp.vocab[item].vector


embeddings = SpacyEmbeddings()
vocabulary_min = 10
calc = WMD(embeddings, documents, vocabulary_min=vocabulary_min)
print("calculating")
# Germany shall be closer to Spain than to Google
neighbors_of_germany = calc.nearest_neighbors(titles[0])
for title, relevance in neighbors_of_germany:
    print("%24s\t%s" % (title, relevance))

# List of page names we will fetch from Wikipedia and query for similarity
titles = sys.argv[1:] or ["Germany", "Spain", "Google"]
documents = {}
for title in titles:
    print("fetching", title)
    pages = requests.get(
        "https://en.wikipedia.org/w/api.php?action=query&format=json&titles=%s"
        "&prop=extracts&explaintext" % title).json()["query"]["pages"]
    print("parsing", title)
    text = nlp(next(iter(pages.values()))["extract"])
    tokens = [t for t in text if t.is_alpha and not t.is_stop]
    words = Counter(t.text for t in tokens)
    orths = {t.text: t.orth for t in tokens}
    sorted_words = sorted(words)
    documents[title] = (title, [orths[t] for t in sorted_words],
                        numpy.array([words[t] for t in sorted_words],
                                    dtype=numpy.float32))


# Hook in WMD
class SpacyEmbeddings(object):
    def __getitem__(self, item):
        return nlp.vocab[item].vector


calc = WMD(SpacyEmbeddings(), documents)
print("calculating")
# Germany shall be closer to Spain than to Google
for title, relevance in calc.nearest_neighbors(titles[0]):
    print("%24s\t%s" % (title, relevance))

class SimilarRepositories:
    GITHUB_URL_RE = re.compile(
        r"(https://|ssh://git@|git://)(github.com/[^/]+/[^/]+)(|.git|/)")

    def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
                 verbosity=logging.DEBUG, wmd_cache_centroids=True,
                 wmd_kwargs=None, gcs_bucket=None, repo2nbow_kwargs=None,
                 initialize_environment=True):
        if initialize_environment:
            initialize()
        self._log = logging.getLogger("similar_repos")
        self._log.setLevel(verbosity)
        if gcs_bucket:
            backend = create_backend(args="bucket=" + gcs_bucket)
        else:
            backend = create_backend()
        if id2vec is None:
            self._id2vec = Id2Vec(log_level=verbosity).load(backend=backend)
        else:
            assert isinstance(id2vec, Id2Vec)
            self._id2vec = id2vec
        self._log.info("Loaded id2vec model: %s", self._id2vec)
        if df is None:
            if df is not False:
                self._df = DocumentFrequencies(log_level=verbosity).load(
                    backend=backend)
            else:
                self._df = None
                self._log.warning("Disabled document frequencies - you will "
                                  "not be able to query custom repositories.")
        else:
            assert isinstance(df, DocumentFrequencies)
            self._df = df
        if self._df is not None:
            self._df = self._df.prune(prune_df_threshold)
        self._log.info("Loaded document frequencies: %s", self._df)
        if nbow is None:
            self._nbow = NBOW(log_level=verbosity).load(backend=backend)
        else:
            assert isinstance(nbow, NBOW)
            self._nbow = nbow
        self._log.info("Loaded nBOW model: %s", self._nbow)
        self._repo2nbow = Repo2nBOW(self._id2vec, self._df,
                                    log_level=verbosity,
                                    **(repo2nbow_kwargs or {}))
        assert self._nbow.get_dependency(
            "id2vec")["uuid"] == self._id2vec.meta["uuid"]
        if len(self._id2vec) != self._nbow.matrix.shape[1]:
            raise ValueError(
                "Models do not match: id2vec has %s tokens while nbow has %s" %
                (len(self._id2vec), self._nbow.matrix.shape[1]))
        self._log.info("Creating the WMD engine...")
        self._wmd = WMD(self._id2vec.embeddings, self._nbow,
                        verbosity=verbosity, **(wmd_kwargs or {}))
        if wmd_cache_centroids:
            self._wmd.cache_centroids()

    def query(self, url_or_path_or_name, **kwargs):
        try:
            repo_index = self._nbow.repository_index_by_name(
                url_or_path_or_name)
        except KeyError:
            repo_index = -1
        if repo_index == -1:
            match = self.GITHUB_URL_RE.match(url_or_path_or_name)
            if match is not None:
                name = match.group(2)
                try:
                    repo_index = self._nbow.repository_index_by_name(name)
                except KeyError:
                    pass
        if repo_index >= 0:
            neighbours = self._query_domestic(repo_index, **kwargs)
        else:
            neighbours = self._query_foreign(url_or_path_or_name, **kwargs)
        neighbours = [(self._nbow[n[0]][0], n[1]) for n in neighbours]
        return neighbours

    @staticmethod
    def unicorn_query(repo_name, id2vec=None, nbow=None, wmd_kwargs=None,
                      query_wmd_kwargs=None):
        sr = SimilarRepositories(
            id2vec=id2vec, df=False, nbow=nbow,
            wmd_kwargs=wmd_kwargs or {
                "vocabulary_min": 50,
                "vocabulary_max": 500
            })
        return sr.query(
            repo_name,
            **(query_wmd_kwargs or {
                "early_stop": 0.1,
                "max_time": 180,
                "skipped_stop": 0.95
            }))

    def _query_domestic(self, repo_index, **kwargs):
        return self._wmd.nearest_neighbors(repo_index, **kwargs)

    def _query_foreign(self, url_or_path, **kwargs):
        if self._df is None:
            raise ValueError("Cannot query custom repositories if the "
                             "document frequencies are disabled.")
        nbow_dict = self._repo2nbow.convert_repository(url_or_path)
        words = sorted(nbow_dict.keys())
        weights = [nbow_dict[k] for k in words]
        return self._wmd.nearest_neighbors((words, weights), **kwargs)
