Example #1
    def googlenews(allowed_str):
        # Word2vec (GoogleNews):
        #   non-normalized.
        #   unordered, from gensim's dict-like structure.
        an_w = an.load(fnames[2], verbosity=1)
        if an_w is not None:
            an_w.add_evaluators(get_e())
            an_w.analysis(print_report=False)
            an_w.save()
        else:
            import gensim

            model_w = gensim.models.KeyedVectors.load_word2vec_format(
                "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
                "GoogleNews-vectors-negative300.bin",
                binary=True)
            #common_w = list(filter(lambda w: w in model_w.vocab.keys() \
            #    or bytes(w) in model_w.vocab.keys(), allowed_str))
            common_w = [w for w in allowed_str if w in model_w.vocab]
            embed_w = [model_w.get_vector(w) for w in common_w]
            an_w = an.Analyst(embeddings=embed_w,
                              strings=common_w,
                              metric=metric,
                              auto_print=printing,
                              desc="GoogleNews",
                              parallel_count=cpus,
                              evaluators=get_e(),
                              auto_save=2,
                              file_name=fnames[2],
                              over_write=True)
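
Most of these examples pass evaluators=get_e() to the Analyst, but get_e() itself lies outside the excerpts. A minimal sketch of such a factory, using only evaluator values that appear elsewhere on this page; the import alias and the exact evaluator list are assumptions:

    import analyst as an

    def get_e():
        # Hypothetical evaluator factory: return a fresh list per call so that
        # separate Analyst instances do not share evaluator objects.
        return [
            "All",  # built-in evaluator set, as used in Example #14
            an.evaluators.analogizer_combiner.AnalogizerCombiner(),
        ]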
Example #2
 def glove(allowed_str):
     # GloVe:
     #   ordered by frequency.
     #   non-normalized.
     an_g = an.load(fnames[3], verbosity=1)
     if an_g is not None:
         an_g.add_evaluators(get_e())
         an_g.analysis(print_report=False)
         an_g.save()
     else:
         str_g, embed_g = read_text_table(
             "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
             "glove.6B.300d.txt",
             firstline=False,
             limit_lines=MAX_LINES)
         #embed_g = [normalize(v) for v in embed_g]
         common = [w for w in allowed_str if w in str_g]
         indices = [str_g.index(w) for w in common]
         embed_g = embed_g[indices]
         an_g = an.Analyst(embeddings=embed_g,
                           strings=common,
                           metric=metric,
                           auto_print=printing,
                           desc="GloVe",
                           parallel_count=cpus,
                           evaluators=get_e(),
                           auto_save=2,
                           file_name=fnames[3],
                           over_write=True)
Example #3
 def fasttext(allowed_str):
     # Fasttext:
     #   ordered by frequency.
     #   non-normalized.
     an_fnc = an.load(fnames[0], verbosity=1)
     if an_fnc is not None:
         an_fnc.add_evaluators(get_e())  # + get_e_freq())
         an_fnc.analysis(print_report=False)
         an_fnc.save()
     else:
         with open(
                 "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
                 "fasttext.en.py2.pkl", 'rb') as f:
             data_ft = pkl.load(f)
         str_f = data_ft['tokens'][:MAX_LINES]
         str_f = list(map(str, str_f))
         embed_f = data_ft['vectors'][:MAX_LINES]
         #embed_fn = np.array([normalize(v) for v in embed_f])
         common = [w for w in allowed_str if w in str_f]
         indices = [str_f.index(w) for w in common]
         embed_f = embed_f[indices]
         an_fnc = an.Analyst(embeddings=embed_f,
                             strings=common,
                             auto_print=printing,
                             metric=metric,
                             desc="Fasttext",
                             evaluators=get_e(),
                             auto_save=2,
                             file_name=fnames[0],
                             over_write=True,
                             parallel_count=cpus)  # + get_e_freq())
Example #4
 def numberbatch(allowed_str):
     # ConceptNet Numberbatch:
     #   alphanumeric order.
     #   normalized.
     #if not os.path.isfile("embeddings/an_numberbatch"):
     an_nb = an.load(fnames[1], verbosity=1)
     if an_nb is not None:
         an_nb.add_evaluators(get_e())
         an_nb.analysis(print_report=False)
         an_nb.save()
     else:
         str_nb, embed_nb = read_text_table(
             "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
             "numberbatch-en-17.06.txt",
             firstline=True)
         common_nb = [w for w in allowed_str if w in str_nb]
         indices_nb = [str_nb.index(w) for w in common_nb]
         #embed_nb = np.array([embed_nb[i] for i in indices_nb])
         embed_nb = embed_nb[indices_nb]
         an_nb = an.Analyst(embeddings=embed_nb,
                            strings=common_nb,
                            metric=metric,
                            auto_print=printing,
                            parallel_count=cpus,
                            desc="ConceptNet Numberbatch",
                            evaluators=get_e(),
                            auto_save=2,
                            file_name=fnames[1],
                            over_write=True)
Example #5
 def deps(allowed_str):
     # Dependency-Based Word Embeddings:
     #   appears to be ordered by frequency.
     #   Normalized.
     a = an.load(fnames[8], verbosity=1)
     if a is not None:
         a.add_evaluators(get_e())
         a.analysis(print_report=False)
         a.save()
     else:
         strings, embed_g = read_text_table(
             "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
             "dependency_based_word_embeddings/deps.words",
             firstline=False,
             limit_lines=MAX_LINES)
         common = [w for w in allowed_str if w in strings]
         indices = [strings.index(w) for w in common]
         embed_g = embed_g[indices]
         a = an.Analyst(embeddings=embed_g,
                        strings=common,
                        metric=metric,
                        auto_print=printing,
                        desc="DEPS",
                        parallel_count=cpus,
                        evaluators=get_e(),
                        auto_save=2,
                        file_name=fnames[8],
                        over_write=True)
Example #6
def word2vec_analysis():
    return an.Analyst(
        embeddings=None,
        strings=model.vocab,
        encoder=model.__getitem__,
        auto_print=True,
        metric=metric,
        desc="Word2Vec Canonical Test",
        evaluators=get_e(),
        auto_save=True,  # Careful: also calling save() afterward writes the file twice.
        over_write=True,
    )
Example #7
    def sense_2_vec(allowed_str):
        # Sense2Vec:
        #   Reddit vectors processed through sense2vec. I modify them by taking
        #   a frequency-weighted average over all part-of-speech senses of each
        #   word I seek, since those senses are often close together in the space.
        #   NOT normalized.
        #   128 dimensions.

        a = an.load(fnames[4], verbosity=1)
        if a is not None:
            a.add_evaluators(get_e())
            a.analysis(print_report=False)
            a.save()
        else:
            import sense2vec

            s2v = sense2vec.load('/mnt/pccfs/not_backed_up/nate/'
                                 'analyst_embeddings/reddit_vectors-1.1.0/')
            strings = []
            vectors = []
            endings = [
                '|ADJ', '|ADP', '|ADV', '|AUX', '|CONJ', '|DET', '|INTJ',
                '|NOUN', '|NUM', '|PART', '|PRON', '|PROPN', '|PUNCT',
                '|SCONJ', '|SYM', '|VERB', '|X'
            ]
            for s in allowed_str:
                senses = []
                freq_sum = 0
                for e in endings:
                    try:
                        t = s2v[s + e]
                        senses.append(t[1] * t[0])
                        freq_sum += t[0]
                    except:
                        # This POS sense is absent from the model; skip it.
                        pass
                if len(senses) > 0:
                    strings.append(s)
                    vectors.append(np.sum(senses, axis=0) / freq_sum)
            a = an.Analyst(embeddings=np.array(vectors),
                           strings=strings,
                           metric=metric,
                           auto_print=printing,
                           desc="Sense2Vec",
                           parallel_count=cpus,
                           evaluators=get_e(),
                           auto_save=2,
                           file_name=fnames[4],
                           over_write=True)
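
The frequency-weighted averaging in the loop above can be sanity-checked on toy data. This standalone sketch uses made-up frequencies and 2-d vectors in place of the (frequency, vector) pairs returned by the s2v lookups:

    import numpy as np

    # Toy (frequency, vector) pairs standing in for s2v['duck|NOUN'], s2v['duck|VERB'], ...
    senses = [(30, np.array([1.0, 0.0])), (10, np.array([0.0, 1.0]))]
    weighted = [freq * vec for freq, vec in senses]
    freq_sum = sum(freq for freq, _ in senses)
    print(np.sum(weighted, axis=0) / freq_sum)  # [0.75 0.25]: frequent senses dominate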
Example #8
def run_analyst(lines, pts, tag=TAG, save=True):
    print("Analyzing space...")
    a = an.Analyst(embeddings=pts[:MAX_LINES],
                   strings=lines[:MAX_LINES],
                   metric=METRIC,
                   auto_print=True,
                   desc=tag + "_" + str(len(lines)),
                   evaluators=[CLUSTERS_TYPE],
                   calculate=True)

    if save:
        # Save a copy of the analyst:
        analyst_file = "experiments/" + tag + "_" + str(len(a.strings)) \
            + "_analyst"
        print("Success at pickling utterance clusters Analyst: " +
              str(an.Analyst.save(a, analyst_file)))

    return a
Example #9
    def use_large(allowed_str):
        # Universal Sentence Encoder:
        #   embeddings must be computed explicitly for the strings to encode.
        #   normalized.
        #   512 dimensions.
        an_u = an.load(fnames[7], verbosity=1)
        if an_u is not None:
            an_u.add_evaluators(get_e())
            an_u.analysis(print_report=False)
            an_u.save()
        else:
            import tensorflow as tf
            import tensorflow_hub as hub

            module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
            embed = hub.Module(module_url)
            tf.logging.set_verbosity(tf.logging.ERROR)
            batches = [
                allowed_str[b:b + 10000]
                for b in range(0, len(allowed_str), 10000)
            ]
            embeddings = []
            with tf.Session() as sess:
                sess.run([
                    tf.global_variables_initializer(),
                    tf.tables_initializer()
                ])
                for b in batches:
                    embeddings.append(sess.run(embed(b)))
            embeddings = np.vstack(embeddings)
            an_u = an.Analyst(embeddings=embeddings,
                              strings=allowed_str,
                              metric=metric,
                              auto_print=printing,
                              desc="USE Large",
                              parallel_count=cpus,
                              evaluators=get_e(),
                              auto_save=2,
                              file_name=fnames[7],
                              over_write=True)
Example #10
def run_analyst(lines, pts, tag=TAG, save=True):
    print("Analyzing space...")

    nucleizer = an.evaluators.nucleus_clusterizer.NucleusClusterizer(
        hub_category=u"Nodal " + str(HUB_THRESHOLD) + u"-Hubs")

    a = an.Analyst(
        embeddings=pts[:MAX_LINES],
        strings=lines[:MAX_LINES],
        metric=METRIC,
        auto_print=True,
        desc=tag + "_" + str(len(lines)),
        evaluators=[nucleizer],
    )

    if save:
        # Save a copy of the analyst:
        analyst_file = "experiments/" + tag + "_" + str(len(a.strings)) \
            + "_analyst"
        print("Success at pickling utterance clusters Analyst: " +
            str(an.Analyst.save(a, analyst_file)))

    return a
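
A hypothetical invocation of the helper above, with toy data standing in for the real utterance lines and embedding points (TAG, MAX_LINES, METRIC, and HUB_THRESHOLD are module-level settings outside this excerpt):

    import numpy as np

    lines = ["hello there", "how are you", "good morning"]  # toy utterances
    pts = np.random.rand(len(lines), 16)                    # toy embedding points
    a = run_analyst(lines, pts, tag="demo", save=False)     # save=False skips the pickling step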
Example #11
    # Deduplicate utterances, keeping order and the matching embedding points.
    unique_lines = []
    unique_pts = []
    seen = set()
    for i, line in enumerate(lines):
        if line not in seen:
            seen.add(line)
            unique_lines.append(line)
            unique_pts.append(pts[i])

    #print("unique things gathered")
    #print(len(unique_lines))
    #assert len(set([str(unique_pts[i]) for i in range(len(unique_pts))])) == len(unique_pts)
    #print("asserted uniqueness of vectors")

    an_ccc = an.Analyst(
        embeddings=unique_pts[:MAX_LINES],
        strings=unique_lines[:MAX_LINES],
        metric=metric,
        auto_print=True,
        desc="ChitChatChallenge Utterance Hubs",
        #evaluators=["Nodal 4-Hubs"],
        calculate=True)

    print("Success at saving ChitChatChallenge Utterance Hubs: " +
          str(an.Analyst.save(an_ccc, filename)))

    a = an.load(filename)

    hubber = a.find_evaluator("Nodal 4-Hubs")
    hubs = hubber.get_clusters()
    sizes = [len(h) for h in hubs]
    order = np.argsort(sizes)[::-1]
    #order = np.argsort([h.stats_dict["Dispersion"] for h in hubs])
    hubs = np.array(hubs)[order]  #.tolist()
Example #12
        return strings, embeddings

    # Fasttext:
    #   ordered by frequency, I think.
    #   non-normalized.
    #with open("embeddings/fasttext.en.pkl", 'rb') as f:
    with open("embeddings/fasttext.en.py2.pkl", 'rb') as f:
        data_ft = pkl.load(f)
    str_f = data_ft['tokens'][:MAX_LINES]
    str_f = list(map(str, str_f))

    # Universal Sentence Encoder:
    #   embeddings must be computed explicitly for the strings to encode.
    #   normalized.
    #   512 dimensions.
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/1"
    embed = hub.Module(module_url)
    tf.logging.set_verbosity(tf.logging.ERROR)
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        embed_u = sess.run(embed(str_f))
    an_u = an.Analyst(embeddings=embed_u,
                      strings=str_f,
                      metric=metric,
                      auto_print=True,
                      desc="Universal Sentence Encoder with words")
    print("Success at saving Universal Sentence Encoder with words: " + str(
        an.Analyst.save(
            an_u, "saved_analyses/an" + str(MAX_LINES) +
            "_universal_sentence_encoder_with_words")))
Example #13
import nearest
import analyst
import preview as pv

n = nearest.Nearest()
test_img_data = pv.load_batch("test_batch")
a = analyst.Analyst(test_img_data)
predictions = n.predict_all(test_img_data)
print(predictions)

print(a.score(predictions))
Example #14
            row = lines[i + firstline].split(" ")
            strings.append(row[0])
            embeddings[i] = row[1:]
        return strings, embeddings

    # Fasttext:
    #   ordered by frequency, I think.
    #   non-normalized.
    #with open("embeddings/fasttext.en.pkl", 'rb') as f:
    with open("embeddings/fasttext.en.py2.pkl", 'rb') as f:
        data_ft = pkl.load(f)
    str_f = data_ft['tokens'][:MAX_LINES]
    str_f = list(map(str, str_f))
    embed_f = data_ft['vectors'][:MAX_LINES]
    embed_fn = embed_f  # np.array([normalize(v) for v in embed_f])
    anag = an.evaluators.analogizer.Analogizer(
        analogies_path="/mnt/pccfs/backed_up/zac/zac_desktop/zac_docs/Corpora/"
            "subcorp_analogy_storage/analogy_subcorp5_family_relations")
    anagc = an.evaluators.analogizer_combiner.AnalogizerCombiner()
    an_fnc = an.Analyst(
        embeddings=embed_fn,
        strings=str_f,
        auto_print=True,
        metric=metric,
        desc="Fasttext Non-Normalized Euclidean",
        evaluators=[u"All", anag, anagc])

    file_name = "saved_analyses/an" + str(MAX_LINES) + \
        "_fasttext_non-normalized_euclidean"
    print("Success at saving: " + str(an.Analyst.save(an_fnc, file_name)))
Example #15
        with open(path, 'rt') as f:
            lines = f.readlines()
        if firstline:
            numvecs, dim = map(int, lines[0].split(" "))
        else:
            numvecs = len(lines) if limit_lines is None \
                else min(len(lines), limit_lines)
            dim = len(lines[0].split(" ")) - 1
        strings = []
        embeddings = np.empty(shape=(numvecs, dim))
        for i in tqdm(range(numvecs), desc="Reading " + path):
            row = lines[i + firstline].split(" ")
            strings.append(row[0])
            embeddings[i] = row[1:]
        return strings, embeddings

    # GloVe:
    #   ordered by frequency, I think.
    #   non-normalized.
    str_g, embed_g = read_text_table("embeddings/glove.6B.300d.txt",
                                     firstline=False,
                                     limit_lines=MAX_LINES)
    embed_g = [normalize(v) for v in embed_g]
    an_g = an.Analyst(embeddings=embed_g,
                      strings=str_g,
                      metric=metric,
                      auto_print=True,
                      desc="GloVe Normalized")
    print("Success at saving GloVe Normalized: " + str(
        an.Analyst.save(
            an_g, "saved_analyses/an" + str(MAX_LINES) + "_glove_normalized")))
Example #16
    def use_lite(allowed_str):
        # Universal Sentence Encoder:
        #   embeddings must be computed explicitly for the strings to encode.
        #   normalized.
        #   512 dimensions.
        an_u = an.load(fnames[6], verbosity=1)
        if an_u is not None:
            an_u.add_evaluators(get_e())
            an_u.analysis(print_report=False)
            an_u.save()
        else:
            import tensorflow as tf
            import tensorflow_hub as hub
            import sentencepiece as spm

            def process_to_IDs_in_sparse_format(sp, sentences):
                # A utility method that processes sentences with the SentencePiece
                # processor 'sp' and returns the results in tf.SparseTensor-like
                # format: (values, indices, dense_shape).
                ids = [sp.EncodeAsIds(x) for x in sentences]
                max_len = max(len(x) for x in ids)
                dense_shape = (len(ids), max_len)
                values = [item for sublist in ids for item in sublist]
                indices = [[row, col] for row in range(len(ids))
                           for col in range(len(ids[row]))]
                return (values, indices, dense_shape)

            with tf.Session() as sess:
                module = hub.Module(
                    "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
                )
                spm_path = sess.run(module(signature="spm_path"))
                # spm_path now contains a path to the SentencePiece model stored inside the
                # TF-Hub module

                sp = spm.SentencePieceProcessor()
                sp.Load(spm_path)

                input_placeholder = tf.sparse_placeholder(tf.int64,
                                                          shape=[None, None])
                embedder = module(
                    inputs=dict(values=input_placeholder.values,
                                indices=input_placeholder.indices,
                                dense_shape=input_placeholder.dense_shape))

                sess.run([
                    tf.global_variables_initializer(),
                    tf.tables_initializer()
                ])

                batches = [
                    allowed_str[b:b + 10000]
                    for b in range(0, len(allowed_str), 10000)
                ]
                embeddings = []
                for b in batches:
                    values, indices, dense_shape = process_to_IDs_in_sparse_format(
                        sp, b)
                    embeddings.append(
                        sess.run(embedder,
                                 feed_dict={
                                     input_placeholder.values: values,
                                     input_placeholder.indices: indices,
                                     input_placeholder.dense_shape: dense_shape
                                 }))
                embeddings = np.vstack(embeddings)

                an_u = an.Analyst(embeddings=embeddings,
                                  strings=allowed_str,
                                  metric=metric,
                                  auto_print=printing,
                                  desc="USE Lite",
                                  parallel_count=cpus,
                                  evaluators=get_e(),
                                  auto_save=2,
                                  file_name=fnames[6],
                                  over_write=True)
Example #17
        "4_city_state",
        "5_family_relations",
        "6_adj_adverb",
        "7_opposites",
        "8_comparative",
        "9_superlative",
        "10_present_participle",
        "11_nationality_adj",
        "12_past_tense",
        "13_plural",
        "14_plural_verbs",
    ]

    corpora = [
        an.evaluators.analogizer.Analogizer(category="Analogies_" + p,
                                            analogies_path=path_start + p)
        for p in path_ends
    ]
    anagc = an.evaluators.analogizer_combiner.AnalogizerCombiner()
    an_fnc = an.Analyst(
        embeddings=None,
        strings=model.vocab,
        encoder=model.__getitem__,
        auto_print=True,
        metric=metric,
        desc="Word2Vec Analogies",
        evaluators=[anagc] + corpora,  # + ["all"],
        auto_save=True,
        over_write=True,
    )
Example #18
        for i in tqdm(range(numvecs), desc="Reading " + path):
            row = lines[i + firstline].split(" ")
            strings.append(row[0])
            embeddings[i] = row[1:]
        return strings, embeddings

    # Fasttext:
    #   ordered by frequency, I think.
    #   non-normalized.
    #with open("embeddings/fasttext.en.pkl", 'rb') as f:
    with open("embeddings/fasttext.en.py2.pkl", 'rb') as f:
        data_ft = pkl.load(f)
    str_f = data_ft['tokens'][:MAX_LINES]
    str_f = list(map(str, str_f))

    # Word2vec (GoogleNews):
    #   non-normalized.
    #   unordered, from gensim's dict-like structure.
    model_w = gensim.models.KeyedVectors.load_word2vec_format(
        'embeddings/GoogleNews-vectors-negative300.bin', binary=True)
    #common_w = list(filter(lambda w: w in model_w.vocab.keys() \
    #    or bytes(w) in model_w.vocab.keys(), str_f))
    common_w = [w for w in str_f if w in model_w.vocab]
    embed_w = [normalize(model_w.get_vector(w)) for w in common_w]
    an_w = an.Analyst(embeddings=embed_w, strings=common_w, metric=metric,
        auto_print=True, desc="Word2Vec GoogleNews Normalized")
    print("Success at saving Word2Vec GoogleNews Normalized: " +
        str(an.Analyst.save(an_w,
            "saved_analyses/an" + str(MAX_LINES) + "_word2vec_googlenews_normalized")))

Example #19
        raise ValueError("No matching vector")

    def encode_real(word):
        return vectors_real[words.index(word)]

    def encode_fake(word):
        return vectors_fake[words.index(word)]

    def metric(vec1, vec2):
        # Degree-scaled cosine distance: (1 - cos) * 180/pi, not the actual
        # angle in degrees.
        #return s.angle(vec1, vec2)*180/np.pi
        return sp.distance.cosine(vec1, vec2) * 180 / np.pi

    an_real = an.Analyst(
        embeddings=vectors_real,
        strings=words,
        metric=metric,
        #encoder=encode_real, decoder=decode_real,
        auto_print=True,
        desc="real scholar words")
    an_fake = an.Analyst(
        embeddings=vectors_fake,
        strings=words,
        metric=metric,
        #encoder=encode_fake, decoder=decode_fake,
        auto_print=True,
        desc="fake scholar words")

    worked_r = an_real.save(an_real, "analyst_project/an_scholar400_real")
    worked_f = an_fake.save(an_fake, "analyst_project/an_scholar400_fake")

    assert worked_r