Example #1
    def googlenews(allowed_str):
        # Word2vec (GoogleNews):
        #   non-normalized.
        #   unordered, from gensim's dict-like structure.
        an_w = an.load(fnames[2], verbosity=1)
        if an_w is not None:
            an_w.add_evaluators(get_e())
            an_w.analysis(print_report=False)
            an_w.save()
        else:
            import gensim

            model_w = gensim.models.KeyedVectors.load_word2vec_format(
                "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
                "GoogleNews-vectors-negative300.bin",
                binary=True)
            #common_w = list(filter(lambda w: w in model_w.vocab.keys() \
            #    or bytes(w) in model_w.vocab.keys(), allowed_str))
            common_w = [w for w in allowed_str if w in model_w.vocab.keys()]
            embed_w = [model_w.get_vector(w) for w in common_w]
            an_w = an.Analyst(embeddings=embed_w,
                              strings=common_w,
                              metric=metric,
                              auto_print=printing,
                              desc="GoogleNews",
                              parallel_count=cpus,
                              evaluators=get_e(),
                              auto_save=2,
                              file_name=fnames[2],
                              over_write=True)
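
A note on the vocabulary filter above: `model_w.vocab` is a dict, so the membership test is O(1) per word, but gensim 4.x renamed `vocab` to `key_to_index`. A version-tolerant sketch of the same filtering step (the helper name is mine, not from the original):

import numpy as np

def filter_keyed_vectors(model, allowed_str):
    # gensim >= 4 exposes key_to_index; gensim 3.x exposes vocab (as used above).
    vocab = model.key_to_index if hasattr(model, "key_to_index") else model.vocab
    common = [w for w in allowed_str if w in vocab]
    vectors = np.array([model.get_vector(w) for w in common])
    return common, vectors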
Example #2
    def deps(allowed_str):
        # Dependency-Based Word Embeddings:
        #   appears to be ordered by frequency.
        #   Normalized.
        a = an.load(fnames[8], verbosity=1)
        if a is not None:
            a.add_evaluators(get_e())
            a.analysis(print_report=False)
            a.save()
        else:
            strings, embed_g = read_text_table(
                "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
                "dependency_based_word_embeddings/deps.words",
                firstline=False,
                limit_lines=MAX_LINES)
            common = [w for w in allowed_str if w in strings]
            indices = [strings.index(w) for w in common]
            embed_g = embed_g[indices]
            a = an.Analyst(embeddings=embed_g,
                           strings=common,
                           metric=metric,
                           auto_print=printing,
                           desc="DEPS",
                           parallel_count=cpus,
                           evaluators=get_e(),
                           auto_save=2,
                           file_name=fnames[8],
                           over_write=True)
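
One caveat with the pattern above: `strings.index(w)` rescans the whole vocabulary for every word, which is quadratic overall. A linear-time sketch of the same filtering, assuming `strings` is a list and `embed_g` a NumPy array as returned by `read_text_table`:

# Build a word -> row map once, then look indices up in O(1).
index_of = {w: i for i, w in enumerate(strings)}
common = [w for w in allowed_str if w in index_of]
indices = [index_of[w] for w in common]
embed_g = embed_g[indices]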
Example #3
    def glove(allowed_str):
        # GloVe:
        #   ordered by frequency.
        #   non-normalized.
        an_g = an.load(fnames[3], verbosity=1)
        if an_g is not None:
            an_g.add_evaluators(get_e())
            an_g.analysis(print_report=False)
            an_g.save()
        else:
            str_g, embed_g = read_text_table(
                "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
                "glove.6B.300d.txt",
                firstline=False,
                limit_lines=MAX_LINES)
            #embed_g = [normalize(v) for v in embed_g]
            common = [w for w in allowed_str if w in str_g]
            indices = [str_g.index(w) for w in common]
            embed_g = embed_g[indices]
            an_g = an.Analyst(embeddings=embed_g,
                              strings=common,
                              metric=metric,
                              auto_print=printing,
                              desc="GloVe",
                              parallel_count=cpus,
                              evaluators=get_e(),
                              auto_save=2,
                              file_name=fnames[3],
                              over_write=True)
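
The commented-out line above calls a `normalize` helper that is not defined in this snippet. If unit-length GloVe vectors are wanted, a vectorized equivalent (a sketch assuming `embed_g` is a 2-D float array):

import numpy as np

norms = np.linalg.norm(embed_g, axis=1, keepdims=True)
norms[norms == 0] = 1.0       # guard against all-zero rows
embed_g = embed_g / norms     # each row now has unit L2 norm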
Example #4
    def numberbatch(allowed_str):
        # ConceptNet Numberbatch:
        #   alphanumeric order.
        #   normalized.
        #if not os.path.isfile("embeddings/an_numberbatch"):
        an_nb = an.load(fnames[1], verbosity=1)
        if an_nb is not None:
            an_nb.add_evaluators(get_e())
            an_nb.analysis(print_report=False)
            an_nb.save()
        else:
            str_nb, embed_nb = read_text_table(
                "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
                "numberbatch-en-17.06.txt",
                firstline=True)
            common_nb = [w for w in allowed_str if w in str_nb]
            indices_nb = [str_nb.index(w) for w in common_nb]
            #embed_nb = np.array([embed_nb[i] for i in indices_nb])
            embed_nb = embed_nb[indices_nb]
            an_nb = an.Analyst(embeddings=embed_nb,
                               strings=common_nb,
                               metric=metric,
                               auto_print=printing,
                               parallel_count=cpus,
                               desc="ConceptNet Numberbatch",
                               evaluators=get_e(),
                               auto_save=2,
                               file_name=fnames[1],
                               over_write=True)
Example #5
    def fasttext(allowed_str):
        # Fasttext:
        #   ordered by frequency.
        #   non-normalized.
        an_fnc = an.load(fnames[0], verbosity=1)
        if an_fnc is not None:
            an_fnc.add_evaluators(get_e())  # + get_e_freq())
            an_fnc.analysis(print_report=False)
            an_fnc.save()
        else:
            with open(
                    "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
                    "fasttext.en.py2.pkl", 'rb') as f:
                data_ft = pkl.load(f)
            str_f = data_ft['tokens'][:MAX_LINES]
            str_f = list(map(str, str_f))
            embed_f = data_ft['vectors'][:MAX_LINES]
            #embed_fn = np.array([normalize(v) for v in embed_f])
            common = [w for w in allowed_str if w in str_f]
            indices = [str_f.index(w) for w in common]
            embed_f = embed_f[indices]
            an_fnc = an.Analyst(embeddings=embed_f,
                                strings=common,
                                auto_print=printing,
                                metric=metric,
                                desc="Fasttext",
                                evaluators=get_e(),
                                auto_save=2,
                                file_name=fnames[0],
                                over_write=True,
                                parallel_count=cpus)  # + get_e_freq())
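
The `fasttext.en.py2.pkl` name suggests the pickle was written by Python 2. Under Python 3, `pickle.load` may then need an explicit `encoding` argument; a hedged sketch (the 'latin1' fallback is an assumption that generally works for NumPy arrays pickled from Python 2):

import pickle as pkl

with open("fasttext.en.py2.pkl", 'rb') as f:   # full path as in the example above
    try:
        data_ft = pkl.load(f)
    except UnicodeDecodeError:
        f.seek(0)
        data_ft = pkl.load(f, encoding='latin1')   # Python-2 era pickle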
Example #6
    def sense_2_vec(allowed_str):
        # Sense2Vec:
        #   Reddit vectors run through sense2vec. Here I take a frequency-weighted
        #   average over all part-of-speech senses of each word I look up, since
        #   those senses are usually close together in the space.
        #   NOT normalized.
        #   128 dimensions.

        a = an.load(fnames[4], verbosity=1)
        if a is not None:
            a.add_evaluators(get_e())
            a.analysis(print_report=False)
            a.save()
        else:
            import sense2vec

            s2v = sense2vec.load('/mnt/pccfs/not_backed_up/nate/'
                                 'analyst_embeddings/reddit_vectors-1.1.0/')
            strings = []
            vectors = []
            endings = [
                '|ADJ', '|ADP', '|ADV', '|AUX', '|CONJ', '|DET', '|INTJ',
                '|NOUN', '|NUM', '|PART', '|PRON', '|PROPN', '|PUNCT',
                '|SCONJ', '|SYM', '|VERB', '|X'
            ]
            for s in allowed_str:
                senses = []
                freq_sum = 0
                for e in endings:
                    try:
                        t = s2v[s + e]
                        senses.append(t[1] * t[0])
                        freq_sum += t[0]
                    except Exception:  # this sense is not in the model
                        pass
                if len(senses) > 0:
                    strings.append(s)
                    vectors.append(np.sum(senses, axis=0) / freq_sum)
            a = an.Analyst(embeddings=np.array(vectors),
                           strings=strings,
                           metric=metric,
                           auto_print=printing,
                           desc="Sense2Vec",
                           parallel_count=cpus,
                           evaluators=get_e(),
                           auto_save=2,
                           file_name=fnames[4],
                           over_write=True)
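
The loop above relies on the older sense2vec API, where `s2v[key]` returns a `(frequency, vector)` pair, so `t[1] * t[0]` accumulates frequency-weighted vectors. The same idea factored into a standalone helper (a sketch under that same assumption; the function name is mine):

import numpy as np

def weighted_sense_vector(s2v, word, endings):
    # Frequency-weighted mean over all POS-tagged senses of `word`,
    # assuming s2v[key] -> (freq, vector) and an exception when the key
    # is absent, as with the reddit_vectors model loaded above.
    weighted, freq_sum = [], 0
    for e in endings:
        try:
            freq, vec = s2v[word + e]
        except Exception:
            continue                      # this sense is not in the model
        weighted.append(freq * vec)
        freq_sum += freq
    return None if not weighted else np.sum(weighted, axis=0) / freq_sum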
Example #7
    def use_large(allowed_str):
        # Universal Sentence Encoder (Large):
        #   Embeddings must be computed on the fly from the strings to encode.
        #   normalized.
        #   512 dimensions.
        an_u = an.load(fnames[7], verbosity=1)
        if an_u is not None:
            an_u.add_evaluators(get_e())
            an_u.analysis(print_report=False)
            an_u.save()
        else:
            import tensorflow as tf
            import tensorflow_hub as hub

            module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
            embed = hub.Module(module_url)
            tf.logging.set_verbosity(tf.logging.ERROR)
            batches = [
                allowed_str[b:b + 10000]
                for b in range(0, len(allowed_str), 10000)
            ]
            embeddings = []
            with tf.Session() as sess:
                sess.run([
                    tf.global_variables_initializer(),
                    tf.tables_initializer()
                ])
                for b in batches:
                    embeddings.append(sess.run(embed(b)))
            embeddings = np.vstack(embeddings)
            an_u = an.Analyst(embeddings=embeddings,
                              strings=allowed_str,
                              metric=metric,
                              auto_print=printing,
                              desc="USE Large",
                              parallel_count=cpus,
                              evaluators=get_e(),
                              auto_save=2,
                              file_name=fnames[7],
                              over_write=True)
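
The `hub.Module` / `tf.Session` code above is TensorFlow 1 style and will not run under TensorFlow 2. A rough TF2 equivalent with the same 10,000-string batching (a sketch; the /5 large-model URL and eager `hub.load` usage are assumptions about the environment):

import numpy as np
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

embeddings = []
for b in range(0, len(allowed_str), 10000):
    batch = allowed_str[b:b + 10000]
    embeddings.append(embed(batch).numpy())   # eager call, no Session needed
embeddings = np.vstack(embeddings)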
Example #8
    def use_lite(allowed_str):
        # Universal Sentence Encoder (Lite):
        #   Embeddings must be computed on the fly from the strings to encode.
        #   normalized.
        #   512 dimensions.
        an_u = an.load(fnames[6], verbosity=1)
        if an_u is not None:
            an_u.add_evaluators(get_e())
            an_u.analysis(print_report=False)
            an_u.save()
        else:
            import tensorflow as tf
            import tensorflow_hub as hub
            import sentencepiece as spm

            def process_to_IDs_in_sparse_format(sp, sentences):
                # A utility method that processes sentences with the SentencePiece
                # processor 'sp' and returns the results in a tf.SparseTensor-like
                # format: (values, indices, dense_shape).
                ids = [sp.EncodeAsIds(x) for x in sentences]
                max_len = max(len(x) for x in ids)
                dense_shape = (len(ids), max_len)
                values = [item for sublist in ids for item in sublist]
                indices = [[row, col] for row in range(len(ids))
                           for col in range(len(ids[row]))]
                return (values, indices, dense_shape)

            with tf.Session() as sess:
                module = hub.Module(
                    "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
                )
                spm_path = sess.run(module(signature="spm_path"))
                # spm_path now contains a path to the SentencePiece model stored inside the
                # TF-Hub module

                sp = spm.SentencePieceProcessor()
                sp.Load(spm_path)

                input_placeholder = tf.sparse_placeholder(tf.int64,
                                                          shape=[None, None])
                embedder = module(
                    inputs=dict(values=input_placeholder.values,
                                indices=input_placeholder.indices,
                                dense_shape=input_placeholder.dense_shape))

                sess.run([
                    tf.global_variables_initializer(),
                    tf.tables_initializer()
                ])

                batches = [
                    allowed_str[b:b + 10000]
                    for b in range(0, len(allowed_str), 10000)
                ]
                embeddings = []
                for b in batches:
                    values, indices, dense_shape = process_to_IDs_in_sparse_format(
                        sp, b)
                    embeddings.append(
                        sess.run(embedder,
                                 feed_dict={
                                     input_placeholder.values: values,
                                     input_placeholder.indices: indices,
                                     input_placeholder.dense_shape: dense_shape
                                 }))
                embeddings = np.vstack(embeddings)

                an_u = an.Analyst(embeddings=embeddings,
                                  strings=allowed_str,
                                  metric=metric,
                                  auto_print=printing,
                                  desc="USE Lite",
                                  parallel_count=cpus,
                                  evaluators=get_e(),
                                  auto_save=2,
                                  file_name=fnames[6],
                                  over_write=True)
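
For reference, `process_to_IDs_in_sparse_format` above just flattens ragged ID lists into the `(values, indices, dense_shape)` triple that `tf.sparse_placeholder` expects. A tiny plain-Python illustration with made-up SentencePiece IDs:

ids = [[7, 3, 9], [4, 5]]                                 # two tokenized sentences
values = [v for row in ids for v in row]                  # [7, 3, 9, 4, 5]
indices = [[r, c] for r, row in enumerate(ids)
           for c in range(len(row))]                      # [[0,0], [0,1], [0,2], [1,0], [1,1]]
dense_shape = (len(ids), max(len(row) for row in ids))    # (2, 3)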
Example #9
#         numvecs = len(lines) if limit_lines == None \
#             else min(len(lines), limit_lines)
#         dim = len(lines[0].split(" ")) - 1
#     strings = []
#     embeddings = np.empty(shape=(numvecs, dim))
#     for i in tqdm(range(numvecs), desc="Reading " + path):
#         row = lines[i + firstline].split(" ")
#         strings.append(row[0])#str(row[0]))
#         embeddings[i] = row[1:]
#     return strings, embeddings
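
The commented fragment above is missing its opening lines. A self-contained sketch of what the full `read_text_table` helper plausibly looks like, reconstructed from the fragment and from the call sites in Examples #2-#4 (the exact signature and header handling are assumptions):

import numpy as np
from tqdm import tqdm

def read_text_table(path, firstline=False, limit_lines=None):
    # Read a whitespace-separated table with one "word v1 v2 ... vd" row per line.
    # firstline=True skips a header row; limit_lines caps how many vectors are read.
    with open(path, 'r') as f:
        lines = f.readlines()
    if firstline:
        lines = lines[1:]
    numvecs = len(lines) if limit_lines is None else min(len(lines), limit_lines)
    dim = len(lines[0].split(" ")) - 1
    strings = []
    embeddings = np.empty(shape=(numvecs, dim))
    for i in tqdm(range(numvecs), desc="Reading " + path):
        row = lines[i].split(" ")
        strings.append(row[0])
        embeddings[i] = row[1:]       # NumPy converts the string fields to floats
    return strings, embeddings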

# def get_strings():
#     with open("embeddings/fasttext.en.py2.pkl", 'rb') as f:
#         data_ft = pkl.load(f)
#         str_f = data_ft['tokens'][:MAX_LINES]
#         return data_ft, list(map(str, str_f))


if __name__ == "__main__":

    #data_ft, str_f = get_strings()
    #fasttext(str_f, data_ft)
    #word2vec_analysis()

    a = an.load("Word2Vec Canonical Test.dill")
    #a.add_evaluators(get_e2())
    # for e in a.evaluators:
    #     if len(e.stats_dict) == 0 and "file_name" in dir(e):
    #         e.file_name = e.file_name[:53] + "text/" + e.file_name[53:]
    a.analysis()
Example #10
    #assert len(set([str(unique_pts[i]) for i in range(len(unique_pts))])) == len(unique_pts)
    #print("asserted uniqueness of vectors")

    an_ccc = an.Analyst(
        embeddings=unique_pts[:MAX_LINES],
        strings=unique_lines[:MAX_LINES],
        metric=metric,
        auto_print=True,
        desc="ChitChatChallenge Utterance Hubs",
        #evaluators=["Nodal 4-Hubs"],
        calculate=True)

    print("Success at saving ChitChatChallenge Utterance Hubs: " +
          str(an.Analyst.save(an_ccc, filename)))

    a = an.load(filename)

    hubber = a.find_evaluator("Nodal 4-Hubs")
    hubs = hubber.get_clusters()
    sizes = [len(h) for h in hubs]
    order = np.argsort(sizes)[::-1]
    #order = np.argsort([h.stats_dict["Dispersion"] for h in hubs])
    hubs = np.array(hubs)[order]  #.tolist()

    #print(np.array(sizes)[order])

    print("Number of Utterances:", len(a.strings))
    print("Number of Hubs:", len(hubs))
    #"""
    for i, h in enumerate(hubs):
        print("")
Example #11
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/1"
    embed = hub.Module(module_url)
    tf.logging.set_verbosity(tf.logging.ERROR)
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        embed_u = sess.run(embed(str_f))
    an_u = an.Analyst(embeddings=embed_u, strings=str_f, metric=metric,
        auto_print=False, desc="Universal Sentence Encoder")
    print("Success at saving Universal Sentence Encoder: " +
        str(an.Analyst.save(
            an_u, "saved_analyses/an" + str(MAX_LINES) +
            "_universal_sentence_encoder")))

    #messagebox.showinfo("Information","Analysis 5 complete!")'''
    """
    an_fnc = an.load("saved_analyses/an" + str(MAX_LINES) +
                     "_fasttext_normalized")
    an_nb = an.load("saved_analyses/an" + str(MAX_LINES) + "_numberbatch")
    an_w = an.load("saved_analyses/an" + str(MAX_LINES) +
                   "_googlenews_normalized")
    an_g = an.load("saved_analyses/an" + str(MAX_LINES) + "_glove_normalized")
    an_u = an.load("saved_analyses/an" + str(MAX_LINES) +
                   "_universal_sentence_encoder")

    #an.Analyst.compare([an_fnc, an_fe, an_fne, an_fc])
    #an.Analyst.compare([an_w, an_fnc, an_g, an_nb, an_u])

    #an.Analyst.graph_comparison([an_w, an_fnc, an_g, an_nb, an_u], "Nodes", "Count")
    an.Analyst.graph_multi([an_w, an_fnc, an_g, an_nb, an_u],
                           [("Nodes", "Count"), ("Nuclei", "Count"),
                            ("Nodal 4-Hubs", "Count")],
                           group_by_stat=False)
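
The five `an.load` calls above differ only in their filename suffix; the same loading can be written as a loop (a sketch reusing only names that already appear in this file):

suffixes = ["_googlenews_normalized", "_fasttext_normalized", "_glove_normalized",
            "_numberbatch", "_universal_sentence_encoder"]
an_w, an_fnc, an_g, an_nb, an_u = [
    an.load("saved_analyses/an" + str(MAX_LINES) + s) for s in suffixes]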