Example #1
        cluster = []
        # Read all sentences for this meaning
        for j in range(int(num_sentences)):
          word_pos = f.readline()  # word/part-of-speech line (read but unused here)
          contexts = f.readline().split()
          context_idxs = map2vocab(vocab, contexts)
          example = word + '_' + str(i) + '.' + str(cnt)
          test_data[word].append((example, word_idx, context_idxs))
          cluster.append(example)
          cnt += 1

        ground_truth_clusters_dict[word][i] = cluster
      line = f.readline()
      #word_cnt += 1
      #if word_cnt > 2: break
  return ground_truth_clusters_dict, test_data
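
The fragment above relies on a map2vocab helper that is not shown anywhere in these snippets. A minimal sketch, assuming it maps each context word to its index in vocab and silently drops out-of-vocabulary words (the OOV behavior is an assumption):

def map2vocab(vocab, words):
    # Assumed helper: map each word to its vocabulary index, dropping words
    # that do not appear in vocab.
    word2idx = {w: i for i, w in enumerate(vocab)}
    return [word2idx[w] for w in words if w in word2idx]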
 
if __name__ == '__main__':
  # read embedding files
  embedding_filename = sys.argv[1]
  context_embedding_filename = sys.argv[2]
  print("Using embedding file: %s" % embedding_filename)
  vocab, embeddings = read_embedding_file(embedding_filename)
  _, context_embeddings = read_embedding_file(context_embedding_filename)
 
  ground_truth_clusters_dict, test_data = read_wwsi_file(vocab)
  print("read wwsi file")
  iw2v_clusters_dict = clean_clusters(get_clusters(test_data, vocab, embeddings, context_embeddings))
  rand_index = avg_adj_rand_index(ground_truth_clusters_dict, iw2v_clusters_dict)
  print("avg_rand_index: %s" % rand_index)
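
All of the examples on this page call read_embedding_file without defining it. A minimal sketch, assuming word2vec-style text files (one word per line followed by its vector components, with an optional header line); the actual file format is an assumption:

import numpy as np

def read_embedding_file(filename):
    # Assumed format: word2vec-style text, "word v1 v2 ... vd" per line.
    # Lines with fewer than three fields (e.g. a "num_words dim" header or a
    # blank line) are skipped.
    vocab, embeddings = [], []
    with open(filename) as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) < 3:
                continue
            vocab.append(parts[0])
            embeddings.append(np.array([float(x) for x in parts[1:]]))
    return vocab, embeddings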
        elif opt in ("-c", "--context"):
            for a in arg.split('.'):
                context_to_plot.append(a.split(','))

    # extract dim and sparsity penalties from file names
    arr = process_embeddings_dir(rootDir)
    for input_embedding_file, context_embedding_file, sparsity, dim_penalty in arr:
        print('Context embeddings file: %s' % context_embedding_file)
        print('Input embeddings file: %s' % input_embedding_file)
        print('sparsity penalty: %s' % sparsity)
        print('dimension penalty: %s' % dim_penalty)
        for i, c in enumerate(context_to_plot):
            print('%d: context to plot: %s' % (i, " ".join(c)))

        print("loading embeddings and vocabulary...")
        w_vocab, w_embeddings = read_embedding_file(input_embedding_file)
        # reduce words to sum over
        w_vocab = w_vocab[:k]
        w_embeddings = w_embeddings[:k]
        c_vocab, c_embeddings = read_embedding_file(context_embedding_file)
        # if a sentence is specified, get embeddings
        context_avg_vecs = []
        context_all_vecs = []
        for idx_s, s in enumerate(context_to_plot):
            word_counter = 0
            for idx_w, word in enumerate(s):
                if idx_w == 0:
                    context_avg_vecs.append(np.zeros(len(c_embeddings[0])))
                    context_all_vecs.append([])
                vocab_idx = -1
                try:
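
The snippet is cut off inside the vocabulary lookup. A sketch of how the per-sentence averaging presumably continues, rewritten as a standalone helper; the function name average_context_vectors and the skip-on-miss behavior are assumptions:

import numpy as np

def average_context_vectors(sentences, c_vocab, c_embeddings):
    # Assumed continuation of the loop above: average each sentence's context
    # vectors, skipping words missing from the context vocabulary.
    avg_vecs = []
    for s in sentences:
        total = np.zeros(len(c_embeddings[0]))
        count = 0
        for word in s:
            try:
                vocab_idx = c_vocab.index(word)
            except ValueError:
                continue  # out-of-vocabulary word
            total += c_embeddings[vocab_idx]
            count += 1
        avg_vecs.append(total / count if count else total)
    return avg_vecs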
        elif opt in ("-r", "--rootDir"):
            rootDir = arg
        elif opt in ("-w", "--words"):
            words_to_plot = arg.split(',')
        elif opt in ("-n", "--filename"):
            filename = arg

    # extract dim and sparsity penalties from file names
    arr = process_embeddings_dir(rootDir)
    for input_embedding_file, context_embedding_file, sparsity, dim_penalty in arr:
        print('Input embeddings file: %s' % input_embedding_file)
        print('Context embeddings file: %s' % context_embedding_file)
        print('words_to_plot: %s' % ", ".join(words_to_plot))
        print('sparsity penalty: %s' % sparsity)
        print('dimension penalty: %s' % dim_penalty)

        print("loading embeddings and vocabulary...")
        in_vocab_all, in_embeddings_all = read_embedding_file(
            input_embedding_file)
        in_vocab = in_vocab_all[:k]
        in_embeddings = in_embeddings_all[:k]
        out_vocab_all, out_embeddings_all = read_embedding_file(
            context_embedding_file)
        out_vocab = out_vocab_all[:k]
        out_embeddings = out_embeddings_all[:k]

        plot(in_vocab_all, in_embeddings_all, words_to_plot,
             "heatmap_input.png")
        plot(out_vocab_all, out_embeddings_all, words_to_plot,
             "heatmap_context.png")
Example #4
    for opt, arg in opts:
        if opt == '-h':
            print(help_message)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            input_embedding_file = arg
        elif opt in ("-w", "--words"):
            words_to_plot = arg.split(',')
        elif opt in ("-k", "--numNeighbors"):
            num_of_nns_to_get = int(arg)

    print('Input embeddings file: %s' % input_embedding_file)
    print('words_to_plot: %s' % ", ".join(words_to_plot))

    print("loading embeddings and vocabulary...")
    in_vocab, in_embeddings = read_embedding_file(input_embedding_file)
    # truncate the vocabulary and embeddings to the first k entries
    # (k is defined elsewhere in the script)
    in_vocab = in_vocab[:k]
    in_embeddings = in_embeddings[:k]
    d = len(in_embeddings[0])

    for plot_idx, word_to_plot in enumerate(words_to_plot):

        in_word_idx = in_vocab.index(word_to_plot)
        word_in_embedding = in_embeddings[in_word_idx]

        nn_idxs = get_nearest_neighbors(word_in_embedding, in_word_idx,
                                        in_embeddings, num_of_nns_to_get)

        t = []
        for i, idx in enumerate(nn_idxs):
            t.append([i + 1, in_vocab[idx]])
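
The call above implies that get_nearest_neighbors ranks every vocabulary word against the query embedding and returns the indices of the closest ones, excluding the query word itself. A minimal sketch using cosine similarity (the metric is an assumption):

import numpy as np

def get_nearest_neighbors(query_vec, query_idx, embeddings, num_neighbors):
    # Assumed metric: cosine similarity. Score every word against the query
    # vector, mask out the query word, and return the top-scoring indices.
    emb = np.asarray(embeddings)
    sims = emb.dot(query_vec) / (
        np.linalg.norm(emb, axis=1) * np.linalg.norm(query_vec) + 1e-12)
    sims[query_idx] = -np.inf
    return list(np.argsort(-sims)[:num_neighbors])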