Example #1
    def compute_q(self, f_df, q_df, return_f_nbow=False):
        logger.info('Computing question wmds')
        f_nbow = {
            row.Index: self.nbowify(row.Index, row.original)
            for row in f_df.itertuples()
        }
        nb_facts = len(f_nbow)
        q_nbow = {
            row.Index + nb_facts: self.nbowify(row.Index + nb_facts,
                                               row.original)
            for row in q_df.itertuples()
        }

        merged_fnbow = copy.copy(f_nbow)
        merged_fnbow.update(q_nbow)
        q_calc = WMD(SpacyEmbeddings(self.nlp),
                     merged_fnbow,
                     vocabulary_min=1,
                     verbosity=logging.WARNING)
        q_calc.cache_centroids()
        q_closest = pd.Series(
            np.array([
                i for i, _ in q_calc.nearest_neighbors(
                    idx, k=self.config.nearest_k_visible) if i < nb_facts
            ]) for idx in tqdm(q_nbow.keys(), desc='Question wmd...'))
        return (q_closest, f_nbow) if return_f_nbow else q_closest
Example #2
def calc_smd(input_f, output_f="", WORD_REP='elmo', METRIC='sms'):
    if WORD_REP == "elmo":
        MODEL = ElmoEmbedder()
    inF = open(input_f, 'r')
    inLines = inF.readlines()
    inF.close()
    #print("Found", len(inLines), "documents")
    token_doc_list, text_doc_list = tokenize_texts(inLines, WORD_REP, tokenize=True)
    count = 0
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        text = text_doc_list[doc_id]
        # transform doc to ID list, both words and/or sentences. get ID dict that maps to emb
        [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text, WORD_REP, MODEL, METRIC)
        # get D values
        [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids], METRIC)
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {"0": ("ref", ref_id_list, ref_d), "1": ("hyp", hyp_id_list, hyp_d)}
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
        except Exception as e:
            print(e)
            continue  # skip this document if the distance could not be computed
        sim = math.exp(-dist)  # switch to similarity
        results_list.append(sim)
        if doc_id == int((len(token_doc_list) / 10.) * count):
            print(str(count * 10) + "% done with calculations")
            count += 1
    if output_f != "":
        print_score(inLines, output_f, results_list)
    else:
        print("Results: ", np.mean(results_list))

    return 'Done!'
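
A call to the function above might look like the sketch below. The file names are hypothetical, and the input file is assumed to hold one reference/hypothesis pair per line in whatever layout tokenize_texts expects.

# Hypothetical invocation; paths are illustrative only.
calc_smd("pairs.tsv", output_f="smd_scores.txt", WORD_REP="elmo", METRIC="sms")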
Example #3
def calculate_similarity(candidate, next_id, emb):
    s = time.time()
    can_doc = calculator.nlp(candidate[essay_field])
    similarities = []
    next_id, emb, can_id_list, can_weights = calculator.get_embeddings_ids_weights(
        can_doc, next_id, emb, method)
    nbow = {"hypothesis": ("hypothesis", can_id_list, can_weights)}

    for id, item in processed_refs.items():
        ref_weights = item["weights"]
        ref_id_list = item["id_list"]
        nbow[id] = (id, ref_id_list, ref_weights)

    calc = WMD(emb, nbow, vocabulary_min=1)
    # print("NBOW")
    # print(nbow)
    distances = calc.nearest_neighbors("hypothesis",
                                       k=len(processed_refs),
                                       early_stop=1)

    for id, dist in distances:
        similarity = np.exp(-dist)
        similarities.append({
            "candidate_id": candidate[id_field],
            "reference_id": id,
            "similarity": similarity,
            "dist": dist,
            "score": candidate[score_field]
        })
    print("Time taken for candidate " + str(candidate[id_field]) + " is " +
          str(time.time() - s))

    return similarities
Example #4
    def word_mover_distance(word_embedding_dict_source,
                            word_embedding_dict_target):
        """ Calculate euclidean distance between two dictionaries of arrays.
        """
        try:
            source = np.array(word_embedding_dict_source, dtype=np.float32)
            target = np.array(word_embedding_dict_target, np.float32)
            embeddings = np.concatenate((source, target))

            source_len = source.shape[0]
            target_len = target.shape[0]

            source_words = np.array([i for i in range(source_len)],
                                    dtype=np.int32)
            target_words = np.array(
                [source_len + i for i in range(target_len)], dtype=np.int32)

            source_weights = np.array([1 for i in range(source_len)],
                                      dtype=np.int32)
            target_weights = np.array([1 for i in range(target_len)],
                                      dtype=np.int32)

            nbow = {
                "source": ("source", source_words, source_weights),
                "target": ("target", target_words, target_weights)
            }
            calc = WMD(embeddings, nbow, vocabulary_min=2)

            return calc.nearest_neighbors("source", 1)[0][1]

        except (ValueError, IndexError):
            return 0
Example #5
 def get_similarity_dist(self, candidate, reference, method):
     emb, nbow = self.get_emb_nbow(candidate, reference, method)
     # print("emb:", emb.keys())
     # print("nbow:", nbow)
     calc = WMD(emb, nbow, vocabulary_min=1)
     dist = calc.nearest_neighbors("reference", k=1, early_stop=1)
     # print("Dist:", dist)
     dist = dist[0][1]
     similarity = np.exp(-dist)
     return similarity, dist
Example #6
def get_sim(doc, text, wordrep, model, metric):
    [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text, wordrep, model, metric)
    [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids], metric)
    # format doc as expected: {id: (id, ref_id_list, ref_d)}
    doc_dict = {"0": ("ref", ref_id_list, ref_d), "1": ("hyp", hyp_id_list, hyp_d)}
    calc = WMD(rep_map, doc_dict, vocabulary_min=1)
    try:
        dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
    except Exception:
        return 0.0
    sim = math.exp(-dist)  # switch to similarity
    return sim
Example #7
 def __init__(self,
              id2vec=None,
              df=None,
              nbow=None,
              verbosity=logging.DEBUG,
              wmd_cache_centroids=True,
              wmd_kwargs=None,
              gcs_bucket=None,
              repo2nbow_kwargs=None,
              initialize_environment=True):
     if initialize_environment:
         initialize()
     self._log = logging.getLogger("similar_repos")
     self._log.setLevel(verbosity)
     if gcs_bucket:
         backend = create_backend(args="bucket=" + gcs_bucket)
     else:
         backend = create_backend()
     if id2vec is None:
         self._id2vec = Id2Vec(log_level=verbosity, backend=backend)
     else:
         assert isinstance(id2vec, Id2Vec)
         self._id2vec = id2vec
     self._log.info("Loaded id2vec model: %s", self._id2vec)
     if df is None:
         if df is not False:
             self._df = DocumentFrequencies(log_level=verbosity,
                                            backend=backend)
         else:
             self._df = None
             self._log.warning("Disabled document frequencies - you will "
                               "not be able to query custom repositories.")
     else:
         assert isinstance(df, DocumentFrequencies)
         self._df = df
     self._log.info("Loaded document frequencies: %s", self._df)
     if nbow is None:
         self._nbow = NBOW(log_level=verbosity, backend=backend)
     else:
         assert isinstance(nbow, NBOW)
         self._nbow = nbow
     self._log.info("Loaded nBOW model: %s", self._nbow)
     self._repo2nbow = Repo2nBOW(self._id2vec,
                                 self._df,
                                 log_level=verbosity,
                                 **(repo2nbow_kwargs or {}))
     self._log.info("Creating the WMD engine...")
     self._wmd = WMD(self._id2vec.embeddings,
                     self._nbow,
                     verbosity=verbosity,
                     **(wmd_kwargs or {}))
     if wmd_cache_centroids:
         self._wmd.cache_centroids()
Example #8
def calc_smd(inLines, model):
    global nlp
    nlp = model
    #print("Found", len(inLines), "documents")
    # TODO: rewrite this
    token_doc_list, text_doc_list = tokenize_texts(inLines)
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        # TODO: rewrite this
        text = text_doc_list[doc_id]
        # transform doc to ID list, both words and/or sentences. get ID dict that maps to emb
        # TODO: rewrite this
        try:
            [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
        except ValueError:
            print(inLines[doc_id])
            print('ValueError: max() arg is an empty sequence; get_embeddings')
            continue
        # get D values
        [ref_id_list, hyp_id_list], [ref_d,
                                     hyp_d] = get_weights([ref_ids, hyp_ids])
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {
            "0": ("ref", ref_id_list, ref_d),
            "1": ("hyp", hyp_id_list, hyp_d)
        }
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            dist = calc.nearest_neighbors(
                str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
            sim = math.exp(-dist)  # switch to similarity
        except IndexError:
            print(
                'dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]'
            )
            print('IndexError: list index out of range')
            print(inLines[doc_id])
            continue
        except UnboundLocalError:
            print('dist could not be calculated')
            print(inLines[doc_id])
            continue
        except ValueError:
            print('Too little vocabulary')
            print(inLines[doc_id])
            continue

        results_list.append((inLines[doc_id], sim))

    return score_list(results_list)
Example #9
def main(args):
    # location of input-output
    data_dir = args.input
    train_loc = os.path.join(data_dir, "train.csv")
    test_loc = os.path.join(data_dir, "test.csv")
    train = pd.read_csv(train_loc)
    test = pd.read_csv(test_loc)
    nlp = spacy.load("en_core_web_lg")

    def extract_bow(data, text_col, id_col, uniq_tokens=None):
        documents = {}
        sent = {}
        if uniq_tokens is None:
            uniq_tokens = {}
        for i, line in tqdm(data.iterrows(), total=data.shape[0]):
            # TODO: remove after debugging
            sent[line[id_col]] = line[text_col]
            if i == 1000:
                # TODO: remove after experiments
                break

            text = nlp(line[text_col])
            tokens = [t for t in text if t.is_alpha and not t.is_stop]
            orths = {t.text: t.orth for t in tokens}
            words = Counter(t.text for t in tokens if t.text in nlp.vocab)
            sorted_words = sorted(words)
            documents[line[id_col]] = (line[id_col], [
                orths[t] for t in sorted_words
            ], np.array([words[t] for t in sorted_words], dtype=np.float32))
        return documents, uniq_tokens, sent

    tid1_nlp, uniq_tokens, tid1_sent = extract_bow(train,
                                                   text_col="title1_en",
                                                   id_col="tid1")
    tid2_nlp, uniq_tokens, tid2_sent = extract_bow(train,
                                                   text_col="title2_en",
                                                   id_col="tid2",
                                                   uniq_tokens=uniq_tokens)

    class SpacyEmbeddings(object):
        def __getitem__(self, item):
            return nlp.vocab[item].vector

    from wmd import TailVocabularyOptimizer

    tid1_calc = WMD(SpacyEmbeddings(),
                    tid1_nlp,
                    vocabulary_min=10,
                    vocabulary_optimizer=TailVocabularyOptimizer(1.))
    tid2_calc = WMD(SpacyEmbeddings(), tid2_nlp, vocabulary_min=3)
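
The snippet stops after constructing the two calculators. A query against one of them would presumably continue along these lines; the document key and k below are illustrative, not part of the original code.

    # Hypothetical follow-up inside main(): find the 10 titles closest to the
    # first tid1 document (key and k are illustrative only).
    some_tid = train.loc[0, "tid1"]
    if some_tid in tid1_nlp:
        print(tid1_calc.nearest_neighbors(some_tid, k=10))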
Example #10
    def get_similar_bugs(self, query):

        # Remember the bug id before `query` is replaced by its token list.
        query_id = query["id"]
        query = self.text_preprocess(self.get_text(query))
        words = [
            word for word in set(chain(query, *self.corpus))
            if word in self.w2vmodel.wv
        ]
        indices, words = zip(*sorted(((index, word) for (
            index, _), word in zip(self.dictionary.doc2bow(words), words))))
        query = dict(self.tfidf[self.dictionary.doc2bow(query)])
        query = [(new_index, query[dict_index])
                 for new_index, dict_index in enumerate(indices)
                 if dict_index in query]
        documents = [
            dict(self.tfidf[self.dictionary.doc2bow(document)])
            for document in self.corpus
        ]
        documents = [[(new_index, document[dict_index])
                      for new_index, dict_index in enumerate(indices)
                      if dict_index in document] for document in documents]
        embeddings = np.array([self.w2vmodel.wv[word] for word in words],
                              dtype=np.float32)
        nbow = dict(((index, list(chain([None], zip(*document))))
                     for index, document in enumerate(documents)
                     if document != []))
        nbow["query"] = tuple([None] + list(zip(*query)))
        distances = WMD(embeddings, nbow,
                        vocabulary_min=1).nearest_neighbors("query")

        return [
            self.bug_ids[distance[0]] for distance in distances
            if self.bug_ids[distance[0]] != query_id
        ]
Example #11
    def get_distance(self, query1, query2):
        query1 = self.text_preprocess(self.get_text(query1))
        query2 = self.text_preprocess(self.get_text(query2))

        words = [
            word for word in set(chain(query1, query2, *self.corpus))
            if word in self.w2vmodel.wv
        ]
        indices, words = zip(*sorted(((index, word) for (
            index, _), word in zip(self.dictionary.doc2bow(words), words))))
        query1 = dict(self.tfidf[self.dictionary.doc2bow(query1)])
        query2 = dict(self.tfidf[self.dictionary.doc2bow(query2)])

        query1 = [(new_index, query1[dict_index])
                  for new_index, dict_index in enumerate(indices)
                  if dict_index in query1]
        query2 = [(new_index, query2[dict_index])
                  for new_index, dict_index in enumerate(indices)
                  if dict_index in query2]
        embeddings = np.array([self.w2vmodel.wv[word] for word in words],
                              dtype=np.float32)
        nbow = {}
        nbow["query1"] = tuple([None] + list(zip(*query1)))
        nbow["query2"] = tuple([None] + list(zip(*query2)))
        distances = WMD(embeddings, nbow,
                        vocabulary_min=1).nearest_neighbors("query1")

        return distances[0][1]
Example #12
 def __init__(self,
              id2vec=None,
              df=None,
              nbow=None,
              prune_df_threshold=1,
              wmd_cache_centroids=True,
              wmd_kwargs: Dict[str, Any] = None,
              languages: Tuple[List, bool] = (None, False),
              engine_kwargs: Dict[str, Any] = None):
     backend = create_backend()
     if id2vec is None:
         self._id2vec = Id2Vec().load(backend=backend)
     else:
         assert isinstance(id2vec, Id2Vec)
         self._id2vec = id2vec
     self._log.info("Loaded id2vec model: %s", self._id2vec)
     if df is None:
         if df is not False:
             self._df = DocumentFrequencies().load(backend=backend)
         else:
             self._df = None
             self._log.warning("Disabled document frequencies - you will "
                               "not be able to query custom repositories.")
     else:
         assert isinstance(df, DocumentFrequencies)
         self._df = df
     if self._df is not None:
         self._df = self._df.prune(prune_df_threshold)
     self._log.info("Loaded document frequencies: %s", self._df)
     if nbow is None:
         self._bow = BOW().load(backend=backend)
     else:
         assert isinstance(nbow, BOW)
         self._bow = nbow
     self._log.info("Loaded BOW model: %s", self._bow)
     assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
     if len(self._id2vec) != self._bow.matrix.shape[1]:
         raise ValueError(
             "Models do not match: id2vec has %s tokens while nbow has %s" %
             (len(self._id2vec), self._bow.matrix.shape[1]))
     self._log.info("Creating the WMD engine...")
     self._wmd = WMD(self._id2vec.embeddings, self._bow, **(wmd_kwargs
                                                            or {}))
     if wmd_cache_centroids:
         self._wmd.cache_centroids()
     self._languages = languages
     self._engine_kwargs = engine_kwargs
Example #13
 def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
              verbosity=logging.DEBUG, wmd_cache_centroids=True, wmd_kwargs=None,
              gcs_bucket=None, repo2nbow_kwargs=None, initialize_environment=True):
     if initialize_environment:
         initialize()
     self._log = logging.getLogger("similar_repos")
     self._log.setLevel(verbosity)
     if gcs_bucket:
         backend = create_backend(args="bucket=" + gcs_bucket)
     else:
         backend = create_backend()
     if id2vec is None:
         self._id2vec = Id2Vec(log_level=verbosity).load(backend=backend)
     else:
         assert isinstance(id2vec, Id2Vec)
         self._id2vec = id2vec
     self._log.info("Loaded id2vec model: %s", self._id2vec)
     if df is None:
         if df is not False:
             self._df = DocumentFrequencies(log_level=verbosity).load(backend=backend)
         else:
             self._df = None
             self._log.warning("Disabled document frequencies - you will "
                               "not be able to query custom repositories.")
     else:
         assert isinstance(df, DocumentFrequencies)
         self._df = df
     if self._df is not None:
         self._df = self._df.prune(prune_df_threshold)
     self._log.info("Loaded document frequencies: %s", self._df)
     if nbow is None:
         self._nbow = NBOW(log_level=verbosity).load(backend=backend)
     else:
         assert isinstance(nbow, NBOW)
         self._nbow = nbow
     self._log.info("Loaded nBOW model: %s", self._nbow)
     self._repo2nbow = Repo2nBOW(
         self._id2vec, self._df, log_level=verbosity, **(repo2nbow_kwargs or {}))
     assert self._nbow.dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
     if len(self._id2vec) != self._nbow.matrix.shape[1]:
         raise ValueError("Models do not match: id2vec has %s tokens while nbow has %s" %
                          (len(self._id2vec), self._nbow.matrix.shape[1]))
     self._log.info("Creating the WMD engine...")
     self._wmd = WMD(self._id2vec.embeddings, self._nbow,
                     verbosity=verbosity, **(wmd_kwargs or {}))
     if wmd_cache_centroids:
         self._wmd.cache_centroids()
Example #14
    def compute_f(self, f_df, f_nbow=None):
        logger.info('Computing fact wmds')
        f_nbow = {
            row.Index: self.nbowify(row.Index, row.original)
            for row in f_df.itertuples()
        } if f_nbow is None else f_nbow

        f_calc = WMD(SpacyEmbeddings(self.nlp),
                     f_nbow,
                     vocabulary_min=1,
                     verbosity=logging.WARNING)
        f_calc.cache_centroids()
        f_closest = pd.Series(
            np.array([
                i for i, _ in f_calc.nearest_neighbors(
                    idx, k=self.config.nearest_k_visible)
            ]) for idx in tqdm(f_nbow.keys(), desc='Fact wmd...'))
        return f_closest
Example #15
 def fit_wme_model(self, d_max=6, r=1024):
     self._r = r
     possible_words = list(self.word_mapping)
     nbow = {}
     for i in range(r):
         d = random.sample(range(1, d_max + 1), 1)[0]
         random_doc = random.sample(possible_words, d)
         doc_embeddings = [self.word_mapping[word] for word in random_doc]
         document, idf_ids = zip(*[(word.glove_id, word.idf_id)
                                   for word in doc_embeddings])
         words = np.array(document, dtype=np.uint32)
         idf_weights = np.array(
             [self.tf_idf_model.idf_[idf_id] for idf_id in idf_ids],
             dtype=np.float32)
         weights = idf_weights
         doc_id = '#' + str(i + 1)
         nbow[doc_id] = (doc_id, words, weights)
     self.wmd = WMD(embeddings=self.glove_model.word_vectors.astype(
         np.float32),
                    nbow=nbow,
                    vocabulary_min=1)
Example #16
    def retrieve(self, top_id: str, k=None, only=None):
        assert only, 'not searching anything'
        index = self.db.mapping
        delta = common.timer()

        def to_nbow(doc_id):
            # transform to the nbow model used by wmd.WMD:
            # ('human readable name', 'item identifiers', 'weights')
            doc = index[doc_id]
            return (doc_id, doc.idx, doc.freq)

        docs = {d: to_nbow(d) for d in only + [top_id]}
        calc = WMDR(self.emb, docs, vocabulary_min=2)

        calc.cache_centroids()
        nn = calc.nearest_neighbors(top_id, k=k)

        self._times.append(delta())

        assert len(nn) == k, f'{len(nn)} not {k}'
        return [Result(*n) for n in nn]
Example #17
def calc_smd(opts, output_f=""):
    inF = open(opts.input_file, 'r')
    inLines = inF.readlines()
    inF.close()
    print("Found", len(inLines), "documents")
    token_doc_list, text_doc_list = tokenize_texts(inLines)
    count = 0
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        text = text_doc_list[doc_id]
        # transform doc to ID list, both words and/or sentences. get ID dict that maps to emb
        [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
        # get D values
        [ref_id_list, hyp_id_list], [ref_d,
                                     hyp_d] = get_weights([ref_ids, hyp_ids])
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {
            "0": ("ref", ref_id_list, ref_d),
            "1": ("hyp", hyp_id_list, hyp_d)
        }
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            dist = calc.nearest_neighbors(
                str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
        except Exception:
            print(doc, text)
            continue  # skip this document if the distance could not be computed
        sim = math.exp(-dist)  # switch to similarity
        results_list.append(sim)
        if doc_id == int((len(token_doc_list) / 10.) * count):
            print(str(count * 10) + "% done with calculations")
            count += 1
    # added by wchen to compute correlation scores with human annotated scores
    hscoreF = open(opts.score_file, 'r')
    hscoreLines = hscoreF.readlines()
    hscoreF.close()
    compute_corrs(opts, results_list, hscoreLines)
Example #18
def calc_smd(ref, hyp, model):
    global nlp
    nlp = model
    doc, text = tokenize_texts([ref, hyp])
    count = 0
    results_list = []
    # transform doc to ID list, both words and/or sentences. get ID dict that maps to emb
    [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
    # get D values
    [ref_id_list, hyp_id_list], [ref_d,
                                 hyp_d] = get_weights([ref_ids, hyp_ids])
    # format doc as expected: {id: (id, ref_id_list, ref_d)}
    doc_dict = {
        "0": ("ref", ref_id_list, ref_d),
        "1": ("hyp", hyp_id_list, hyp_d)
    }
    calc = WMD(rep_map, doc_dict, vocabulary_min=1)
    try:
        dist = calc.nearest_neighbors(
            str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
    except Exception:
        print(doc, text)
        return 0.0  # distance could not be computed
    sim = math.exp(-dist)  # switch to similarity
    return sim
Example #19
 def compute_sentence_similarity():
     nlp = spacy.load('en_core_web_sm')
     nlp.add_pipe(WMD.SpacySimilarityHook(nlp), last=True)
     all_score = []
     for i in range(len(all_summary)):
         if len(all_summary[i]) == 1:
             all_score.append([1.0])
             continue
         score = []
         for j in range(1, len(all_summary[i])):
             doc1 = nlp(all_summary[i][j-1])
             doc2 = nlp(all_summary[i][j])
             try:
                 score.append(1.0/(1.0 + math.exp(-doc1.similarity(doc2)+7)))
              except Exception:
                 score.append(1.0)
         all_score.append(score)
     return all_score
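
Outside of the loop above, the hook pattern boils down to the following minimal sketch. The spaCy model name and the two sentences are assumptions; the squashing into (0, 1) mirrors the formula used in the loop, and add_pipe is called in the spaCy 2.x style used throughout these examples.

import math
import spacy
from wmd import WMD

nlp = spacy.load("en_core_web_md")  # assumption: any spaCy model with word vectors
nlp.add_pipe(WMD.SpacySimilarityHook(nlp), last=True)

doc1 = nlp("The government approved the new budget.")
doc2 = nlp("Lawmakers passed the updated spending plan.")
wmd_dist = doc1.similarity(doc2)  # with the hook installed, similarity() returns a WMD value
print(1.0 / (1.0 + math.exp(-wmd_dist + 7)))  # same squashing as in the loop above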
Example #20
def SimilarityHook(doc):
    return WMD.SpacySimilarityHook(doc)
Example #21
class Preprocessor:
    """
    Class Preprocessor implements all necessary operations to prepare raw
    text input for modeling
    """
    WordLemma = namedtuple(
        'WordLemma',
        ['start_char', 'end_char', 'text', 'label_'
         ])  # proxy for a class representing a text span with a label

    WordEmbedding = namedtuple(
        'WordEmbedding',
        ['idf_id', 'glove_id'])  # necessary for determining correct weights for embedding vectors

    def __init__(self, glove_components=300, min_df=5, max_df=0.4):
        self.glove_model = Glove(no_components=glove_components)
        self.tf_idf_model = TfidfVectorizer(min_df=min_df,
                                            max_df=max_df,
                                            token_pattern=r'[^\s]+',
                                            lowercase=False)
        self.word_mapping = None
        self.embedding_dim = glove_components
        self.wmd = None
        self._r = None

    def preprocess(self, text: str) -> str:
        raise NotImplementedError

    def sentence_tokenizer(self, text: str) -> List[str]:
        raise NotImplementedError

    def fit_glove(self, sentences, window, epochs):
        corpus = Corpus()
        corpus.fit(sentences, window=window)
        self.glove_model.fit(corpus.matrix, epochs=epochs, no_threads=8)
        self.glove_model.add_dictionary(corpus.dictionary)

    def fit_tf_idf(self, articles):
        self.tf_idf_model.fit(articles)

    def fit(self,
            inputs,
            return_clean=True,
            clean=True,
            window=10,
            epochs=100):
        if clean:
            print('Cleaning {n_inputs} inputs...'.format(n_inputs=len(inputs)),
                  end='')
            clean_inputs = [self.preprocess(input) for input in inputs]
            print('Done!')
        else:
            clean_inputs = inputs[:]
        print('Training Tf-idf model...', end='')
        self.fit_tf_idf(clean_inputs)
        print('Done!')
        sentences_per_input = [
            self.sentence_tokenizer(input) for input in clean_inputs
        ]
        sentences = itertools.chain.from_iterable(sentences_per_input)
        tokenized_sentences = [sentence.split() for sentence in sentences]
        print('Training Glove model...', end='')
        self.fit_glove(tokenized_sentences, window=window, epochs=epochs)
        print('Done!')
        valid_words = set.intersection(
            set(self.glove_model.dictionary.keys()),
            set(self.tf_idf_model.vocabulary_.keys()))
        self.word_mapping = {
            word:
            self.WordEmbedding(glove_id=self.glove_model.dictionary[word],
                               idf_id=self.tf_idf_model.vocabulary_[word])
            for word in valid_words
        }
        if return_clean:
            return clean_inputs

    def article_to_input(self, article):
        tokens = article.split()
        word_embeddings = [
            self.word_mapping[token] for token in tokens
            if token in self.word_mapping
        ]
        weight_ids = [(we.glove_id, we.idf_id) for we in word_embeddings]
        glove_ids, idf_ids = zip(*weight_ids)
        words = np.array(glove_ids, dtype=np.uint32)
        weights = np.array(
            [self.tf_idf_model.idf_[idf_id] for idf_id in idf_ids],
            dtype=np.float32)
        return words, weights

    def _single_embed(self, article, embedding_function, preprocess):
        if preprocess:
            article = self.preprocess(article)
        try:
            words, weights = self.article_to_input(article)
        except ValueError:
            print('Empty embedding\n\n', article)
            return np.zeros(shape=(self.embedding_dim, ))
        return embedding_function(words, weights)

    def _embed(self, inputs, embedding_function, preprocess):
        if isinstance(inputs, list):
            return np.array([
                self._single_embed(input, embedding_function, preprocess)
                for input in inputs
            ])
        return self._single_embed(inputs, embedding_function, preprocess)

    def _idf_embedding(self, words, weights):
        word_vectors = np.array(
            [self.glove_model.word_vectors[glove_id] for glove_id in words])
        idf_weights = weights / np.sum(weights)
        return np.dot(idf_weights, word_vectors)

    def idf_embed(self, article, preprocess=False):
        return self._embed(article,
                           embedding_function=self._idf_embedding,
                           preprocess=preprocess)

    def fit_wme_model(self, d_max=6, r=1024):
        self._r = r
        possible_words = list(self.word_mapping)
        nbow = {}
        for i in range(r):
            d = random.sample(range(1, d_max + 1), 1)[0]
            random_doc = random.sample(possible_words, d)
            doc_embeddings = [self.word_mapping[word] for word in random_doc]
            document, idf_ids = zip(*[(word.glove_id, word.idf_id)
                                      for word in doc_embeddings])
            words = np.array(document, dtype=np.uint32)
            idf_weights = np.array(
                [self.tf_idf_model.idf_[idf_id] for idf_id in idf_ids],
                dtype=np.float32)
            weights = idf_weights
            doc_id = '#' + str(i + 1)
            nbow[doc_id] = (doc_id, words, weights)
        self.wmd = WMD(embeddings=self.glove_model.word_vectors.astype(
            np.float32),
                       nbow=nbow,
                       vocabulary_min=1)

    def _wme_embedding(self, words, weights, gamma):
        distances = np.array([
            self.wmd._WMD_batch(words, weights, '#' + str(i + 1))
            for i in range(self._r)
        ])
        return 1 / math.sqrt(self._r) * np.exp(-gamma * distances)

    def wme_embed(self, article, preprocess=False, gamma=0.19):
        embedding_function = partial(self._wme_embedding, gamma=gamma)
        return self._embed(article,
                           embedding_function=embedding_function,
                           preprocess=preprocess)
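
Preprocessor is abstract (preprocess and sentence_tokenizer are left to subclasses), so end-to-end usage would look roughly like the sketch below; the subclass, the articles list, and the hyperparameters are all hypothetical.

# Hypothetical usage: MyPreprocessor is assumed to implement preprocess()
# and sentence_tokenizer(); `articles` is an illustrative list of raw strings.
pre = MyPreprocessor()
clean_articles = pre.fit(articles, window=10, epochs=100)   # trains the tf-idf and GloVe models
pre.fit_wme_model(d_max=6, r=1024)                          # random reference docs for the WME kernel
idf_vectors = pre.idf_embed(clean_articles)                 # idf-weighted average of GloVe vectors
wme_vectors = pre.wme_embed(clean_articles, gamma=0.19)     # WMD-kernel features, one row per article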
Example #22
class SimilarRepositories:
    GITHUB_URL_RE = re.compile(
        r"(https://|ssh://git@|git://)(github.com/[^/]+/[^/]+)(|.git|/)")
    _log = logging.getLogger("SimilarRepositories")

    def __init__(self,
                 id2vec=None,
                 df=None,
                 nbow=None,
                 prune_df_threshold=1,
                 wmd_cache_centroids=True,
                 wmd_kwargs: Dict[str, Any] = None,
                 languages: Tuple[List, bool] = (None, False),
                 engine_kwargs: Dict[str, Any] = None):
        backend = create_backend()
        if id2vec is None:
            self._id2vec = Id2Vec().load(backend=backend)
        else:
            assert isinstance(id2vec, Id2Vec)
            self._id2vec = id2vec
        self._log.info("Loaded id2vec model: %s", self._id2vec)
        if df is None:
            if df is not False:
                self._df = DocumentFrequencies().load(backend=backend)
            else:
                self._df = None
                self._log.warning("Disabled document frequencies - you will "
                                  "not be able to query custom repositories.")
        else:
            assert isinstance(df, DocumentFrequencies)
            self._df = df
        if self._df is not None:
            self._df = self._df.prune(prune_df_threshold)
        self._log.info("Loaded document frequencies: %s", self._df)
        if nbow is None:
            self._bow = BOW().load(backend=backend)
        else:
            assert isinstance(nbow, BOW)
            self._bow = nbow
        self._log.info("Loaded BOW model: %s", self._bow)
        assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
        if len(self._id2vec) != self._bow.matrix.shape[1]:
            raise ValueError(
                "Models do not match: id2vec has %s tokens while nbow has %s" %
                (len(self._id2vec), self._bow.matrix.shape[1]))
        self._log.info("Creating the WMD engine...")
        self._wmd = WMD(self._id2vec.embeddings, self._bow, **(wmd_kwargs
                                                               or {}))
        if wmd_cache_centroids:
            self._wmd.cache_centroids()
        self._languages = languages
        self._engine_kwargs = engine_kwargs

    def query(self, url_or_path_or_name: str,
              **kwargs) -> List[Tuple[str, float]]:
        try:
            repo_index = self._bow.documents.index(url_or_path_or_name)
        except ValueError:
            repo_index = -1
        if repo_index == -1:
            match = self.GITHUB_URL_RE.match(url_or_path_or_name)
            if match is not None:
                name = match.group(2)
                try:
                    repo_index = self._bow.documents.index(name)
                except ValueError:
                    pass
        if repo_index >= 0:
            neighbours = self._query_domestic(repo_index, **kwargs)
        else:
            neighbours = self._query_foreign(url_or_path_or_name, **kwargs)
        neighbours = [(self._bow[n[0]][0], n[1]) for n in neighbours]
        return neighbours

    def _query_domestic(self, repo_index, **kwargs):
        return self._wmd.nearest_neighbors(repo_index, **kwargs)

    def _query_foreign(self, url_or_path: str, **kwargs):
        df = self._df
        if df is None:
            raise ValueError("Cannot query custom repositories if the "
                             "document frequencies are disabled.")

        with tempfile.TemporaryDirectory(prefix="vecino-") as tempdir:
            target = os.path.join(tempdir, "repo")
            if os.path.isdir(url_or_path):
                url_or_path = os.path.abspath(url_or_path)
                os.symlink(url_or_path, target, target_is_directory=True)
                repo_format = "standard"
            else:
                self._log.info("Cloning %s to %s", url_or_path, target)
                porcelain.clone(url_or_path,
                                target,
                                bare=True,
                                outstream=sys.stderr)
                repo_format = "bare"
            bow = repo2bow(tempdir,
                           repo_format,
                           1,
                           df,
                           *self._languages,
                           engine_kwargs=self._engine_kwargs)
        ibow = {}
        for key, val in bow.items():
            try:
                ibow[self._id2vec[key]] = val
            except KeyError:
                continue
        words, weights = zip(*sorted(ibow.items()))
        return self._wmd.nearest_neighbors((words, weights), **kwargs)
Example #23
# coding: utf-8

# pip3 install wmd
# https://github.com/src-d/wmd-relax
# Paper: http://www.cs.cornell.edu/~kilian/papers/wmd_metric.pdf

import time
import numpy
from wmd import WMD
import pickle
embeddings = numpy.array([[0.1, 1], [1, 0.1], [0.8, 0.7]], dtype=numpy.float32)
nbow = {  # key -> (label, token ids, weights)
        "first":  ("#1", [0, 1, 2], numpy.array([1.5, 0.3, 0.5], dtype=numpy.float32)),
        "你好":  ("#3", [1, 2], numpy.array([1.3, 0.5], dtype=numpy.float32)),
        "second": ("#2", [0, 1], numpy.array([0.75, 0.15], dtype=numpy.float32))}
calc = WMD(embeddings, nbow, vocabulary_min=2)
origin = "first"
print(calc.nearest_neighbors(origin))

model_file = '/home/gswewf/yhb/model/wx_vector_char.pkl'

with open(model_file, "rb") as f:
    w2v_model = pickle.load(f, encoding='iso-8859-1')  # loading this takes about 60.8 MiB of memory

words_list = []
w_emb = []
for word, emb in w2v_model.items():
    words_list.append(word)
    w_emb.append(emb)

from jieba.analyse.tfidf import TFIDF
Example #24
# pip3 install wmd
# https://github.com/src-d/wmd-relax
# Paper: http://www.cs.cornell.edu/~kilian/papers/wmd_metric.pdf

import time
import numpy
from wmd import WMD
import pickle
embeddings = numpy.array([[0.1, 1], [1, 0.1], [0.8, 0.7]], dtype=numpy.float32)
nbow = {  # key -> (label, token ids, weights)
    "first": ("#1", [0, 1, 2], numpy.array([1.5, 0.3, 0.5],
                                           dtype=numpy.float32)),
    "你好": ("#3", [1, 2], numpy.array([1.3, 0.5], dtype=numpy.float32)),
    "second": ("#2", [0, 1], numpy.array([0.75, 0.15], dtype=numpy.float32))
}
calc = WMD(embeddings, nbow, vocabulary_min=2)
origin = "first"
print(calc.nearest_neighbors(origin))

model_file = '/home/gswewf/yhb/model/wx_vector_char.pkl'

with open(model_file, "rb") as f:
    w2v_model = pickle.load(f, encoding='iso-8859-1')  # loading this takes about 60.8 MiB of memory

words_list = []
w_emb = []
for word, emb in w2v_model.items():
    words_list.append(word)
    w_emb.append(emb)

from jieba.analyse.tfidf import TFIDF
Example #25
import pandas as pd
import re
import glob
import sys
sys.path.append("./BERT/pytorch-pretrained-BERT-master")
sys.path.append("./BERT")
from pytorch_pretrained_bert import BertTokenizer, BertModel
from wmd import WMD
from torch.nn.modules.distance import CosineSimilarity

torch_emb_sim = CosineSimilarity()

from bert_score import score as bert_score

nlp = spacy.load('en_core_web_md')
nlp.add_pipe(WMD.SpacySimilarityHook(nlp), last=True)


def _clean_text(txt):
    return txt.lower()


class CFRInstance(object):
    def __init__(
        self,
        original_context: str,
        cf_context: str,
        original_ending: str,
        predicted_ending: str,
        gold_cf_endings: List[str],
    ):
Example #26
    text = nlp(doctext)
    tokens = [t for t in text if t.is_alpha and not t.is_stop]

    words = Counter(t.text for t in tokens)
    orths = {t.text: t.orth for t in tokens}
    sorted_words = sorted(words)
    documents[title] = (title, [orths[t] for t in sorted_words],
                        numpy.array([words[t] for t in sorted_words],
                                    dtype=numpy.float32))


# Hook in WMD
class SpacyEmbeddings(object):
    def __getitem__(self, item):
        return nlp.vocab[item].vector


embeddings = SpacyEmbeddings()

vocabulary_min = 10
calc = WMD(embeddings, documents, vocabulary_min=vocabulary_min)

print("calculating")
# Germany shall be closer to Spain than to Google

neighbors_of_germany = calc.nearest_neighbors(titles[0])

for title, relevance in neighbors_of_germany:
    print("%24s\t%s" % (title, relevance))
Example #27
# List of page names we will fetch from Wikipedia and query for similarity
titles = sys.argv[1:] or ["Germany", "Spain", "Google"]

documents = {}
for title in titles:
    print("fetching", title)
    pages = requests.get(
        "https://en.wikipedia.org/w/api.php?action=query&format=json&titles=%s"
        "&prop=extracts&explaintext" % title).json()["query"]["pages"]
    print("parsing", title)
    text = nlp(next(iter(pages.values()))["extract"])
    tokens = [t for t in text if t.is_alpha and not t.is_stop]
    words = Counter(t.text for t in tokens)
    orths = {t.text: t.orth for t in tokens}
    sorted_words = sorted(words)
    documents[title] = (title, [orths[t] for t in sorted_words],
                        numpy.array([words[t] for t in sorted_words],
                                    dtype=numpy.float32))


# Hook in WMD
class SpacyEmbeddings(object):
    def __getitem__(self, item):
        return nlp.vocab[item].vector

calc = WMD(SpacyEmbeddings(), documents)
print("calculating")
# Germany shall be closer to Spain than to Google
for title, relevance in calc.nearest_neighbors(titles[0]):
    print("%24s\t%s" % (title, relevance))
Example #28
class SimilarRepositories:
    GITHUB_URL_RE = re.compile(
        r"(https://|ssh://git@|git://)(github.com/[^/]+/[^/]+)(|.git|/)")

    def __init__(self,
                 id2vec=None,
                 df=None,
                 nbow=None,
                 prune_df_threshold=1,
                 verbosity=logging.DEBUG,
                 wmd_cache_centroids=True,
                 wmd_kwargs=None,
                 gcs_bucket=None,
                 repo2nbow_kwargs=None,
                 initialize_environment=True):
        if initialize_environment:
            initialize()
        self._log = logging.getLogger("similar_repos")
        self._log.setLevel(verbosity)
        if gcs_bucket:
            backend = create_backend(args="bucket=" + gcs_bucket)
        else:
            backend = create_backend()
        if id2vec is None:
            self._id2vec = Id2Vec(log_level=verbosity).load(backend=backend)
        else:
            assert isinstance(id2vec, Id2Vec)
            self._id2vec = id2vec
        self._log.info("Loaded id2vec model: %s", self._id2vec)
        if df is None:
            if df is not False:
                self._df = DocumentFrequencies(log_level=verbosity).load(
                    backend=backend)
            else:
                self._df = None
                self._log.warning("Disabled document frequencies - you will "
                                  "not be able to query custom repositories.")
        else:
            assert isinstance(df, DocumentFrequencies)
            self._df = df
        if self._df is not None:
            self._df = self._df.prune(prune_df_threshold)
        self._log.info("Loaded document frequencies: %s", self._df)
        if nbow is None:
            self._nbow = NBOW(log_level=verbosity).load(backend=backend)
        else:
            assert isinstance(nbow, NBOW)
            self._nbow = nbow
        self._log.info("Loaded nBOW model: %s", self._nbow)
        self._repo2nbow = Repo2nBOW(self._id2vec,
                                    self._df,
                                    log_level=verbosity,
                                    **(repo2nbow_kwargs or {}))
        assert self._nbow.get_dependency(
            "id2vec")["uuid"] == self._id2vec.meta["uuid"]
        if len(self._id2vec) != self._nbow.matrix.shape[1]:
            raise ValueError(
                "Models do not match: id2vec has %s tokens while nbow has %s" %
                (len(self._id2vec), self._nbow.matrix.shape[1]))
        self._log.info("Creating the WMD engine...")
        self._wmd = WMD(self._id2vec.embeddings,
                        self._nbow,
                        verbosity=verbosity,
                        **(wmd_kwargs or {}))
        if wmd_cache_centroids:
            self._wmd.cache_centroids()

    def query(self, url_or_path_or_name, **kwargs):
        try:
            repo_index = self._nbow.repository_index_by_name(
                url_or_path_or_name)
        except KeyError:
            repo_index = -1
        if repo_index == -1:
            match = self.GITHUB_URL_RE.match(url_or_path_or_name)
            if match is not None:
                name = match.group(2)
                try:
                    repo_index = self._nbow.repository_index_by_name(name)
                except KeyError:
                    pass
        if repo_index >= 0:
            neighbours = self._query_domestic(repo_index, **kwargs)
        else:
            neighbours = self._query_foreign(url_or_path_or_name, **kwargs)
        neighbours = [(self._nbow[n[0]][0], n[1]) for n in neighbours]
        return neighbours

    @staticmethod
    def unicorn_query(repo_name,
                      id2vec=None,
                      nbow=None,
                      wmd_kwargs=None,
                      query_wmd_kwargs=None):
        sr = SimilarRepositories(id2vec=id2vec,
                                 df=False,
                                 nbow=nbow,
                                 wmd_kwargs=wmd_kwargs or {
                                     "vocabulary_min": 50,
                                     "vocabulary_max": 500
                                 })
        return sr.query(
            repo_name,
            **(query_wmd_kwargs or {
                "early_stop": 0.1,
                "max_time": 180,
                "skipped_stop": 0.95
            }))

    def _query_domestic(self, repo_index, **kwargs):
        return self._wmd.nearest_neighbors(repo_index, **kwargs)

    def _query_foreign(self, url_or_path, **kwargs):
        if self._df is None:
            raise ValueError("Cannot query custom repositories if the "
                             "document frequencies are disabled.")
        nbow_dict = self._repo2nbow.convert_repository(url_or_path)
        words = sorted(nbow_dict.keys())
        weights = [nbow_dict[k] for k in words]
        return self._wmd.nearest_neighbors((words, weights), **kwargs)
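
Typical use of this class is then a single query call; the repository name below is illustrative, and the keyword arguments are simply forwarded to WMD.nearest_neighbors.

# Hypothetical query; the repository name is illustrative only.
sr = SimilarRepositories()
for repo_name, distance in sr.query("github.com/src-d/wmd-relax", k=20):
    print(repo_name, distance)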