Example #1
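# Sweeps several embedding dimensionalities, loading each trained embedding
# from disk and reporting its word-analogy accuracy and WS-353 word-similarity
# score. Assumes Wordsim, WordAnalogy and Embedding are imported from the
# project's evaluation modules (not shown in this snippet).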
if __name__ == "__main__":
    wordsim = Wordsim()
    word_analogy = WordAnalogy()
    word_analogy.set_top_words('../../data/processed data/top_30000_words.txt')

    suffix = '_1'  # defined but not used in this snippet
    dimension_list = [50, 100, 150, 200]
    wa_list = []
    ws_list = []

    for dimension in dimension_list:
        filename = '../../output/convergence_test/3000samples/31epochs/snml/{}dim/embedding-e={}-n_sampled=3000-epochs=31-batch_size=10000.txt'.format(
            dimension, dimension)
        print('Reading: ', filename)
        embedding = Embedding.from_file(filename)

        wa_result = word_analogy.evaluate(embedding,
                                          high_level_category=False,
                                          restrict_top_words=False)
        ws_result = wordsim.evaluate(embedding)

        # 'all' is the overall analogy accuracy; index 2 of the WS-353 entry is
        # the Spearman rho score (see Wordsim.evaluate in Example #3)
        wa_list.append(wa_result['all'])
        ws_list.append(ws_result['EN-WS-353-ALL'][2])

    print('Word analogy: ')
    for wa in wa_list:
        print(wa)
    print('Word sim: ')
    for ws in ws_list:
        print(ws)
Example #2
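# This fragment appears to be the tail of WordAnalogy.evaluate: it reports the
# accuracy of each analogy category and then a count-weighted overall accuracy.
# cat_list, X, X_cat and result are built earlier in the method (not shown).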
            # accuracy for this category ('mean' is presumably numpy.mean or
            # statistics.mean, imported in the full module)
            acc = mean(result)

            # result
            print("Category: %-30s, accuracy: %f (all: %d)" %
                  (cat, acc, len(X_cat)))
            predictions[cat] = acc

        # overall accuracy, weighted by category size
        total_count = 0
        acc = 0
        for cat in cat_list:
            cat_count = len(X[cat])
            acc += cat_count * predictions.get(cat)
            total_count += cat_count
        predictions['all'] = acc / total_count
        print("All Category accuracy: %f" % (acc / total_count))

        return predictions


if __name__ == "__main__":
    word_analogy = WordAnalogy()
    word_analogy.set_top_words('../../data/processed data/top_30000_words.txt')
    embedding = Embedding.from_file(
        '../../output/50dim/embedding-e=50-n_sampled=200-epochs=35-batch_size=10000_1.txt'
    )
    result = word_analogy.evaluate(embedding,
                                   high_level_category=False,
                                   restrict_top_words=False)
Example #3
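# Wordsim helpers: pprint renders the results as a table via prettytable, and
# evaluate scores an embedding on each word-similarity dataset by comparing
# cosine similarities against the human ratings with Spearman's rho. The
# surrounding class is assumed to import umath and to load self.dataset elsewhere.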
    @staticmethod
    def pprint(result):
        from prettytable import PrettyTable
        x = PrettyTable(["Dataset", "Found", "Not Found", "Score (rho)"])
        x.align["Dataset"] = "l"
        for k, v in result.items():
            x.add_row([k, v[0], v[1], v[2]])
        print(x)

    def evaluate(self, embedding):
        result = {}
        for file_name, data in self.dataset.items():
            pred, label, found, notfound = [], [], 0, 0
            for datum in data:
                if embedding.in_vocab(datum[0]) and embedding.in_vocab(datum[1]):
                    found += 1
                    pred.append(umath.cos(embedding.vector(datum[0]), embedding.vector(datum[1])))
                    label.append(datum[2])
                else:
                    notfound += 1
            # score: Spearman's rho between human ratings and cosine predictions, scaled by 100
            result[file_name] = (found, notfound, umath.rho(label, pred) * 100)
        return result


if __name__ == "__main__":
    wordsim = Wordsim()
    embedding = Embedding.from_file('../../output/100dim/embedding-e=100-n_sampled=200-epochs=10-batch_size=10000.txt')
    result = wordsim.evaluate(embedding)
    # wordsim.pprint(result)
    print(result['EN-WS-353-ALL'][2])
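
Example #3 relies on two helpers, umath.cos and umath.rho, that are not shown. Below is a minimal sketch of what such helpers typically compute (cosine similarity and Spearman's rank correlation), assuming numpy and scipy are available; the project's actual umath module may differ.

import numpy as np
from scipy.stats import spearmanr


def cos(v1, v2):
    # Cosine similarity between two vectors.
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))


def rho(labels, preds):
    # Spearman's rank correlation between gold ratings and predicted similarities.
    corr, _pvalue = spearmanr(np.asarray(labels, dtype=float),
                              np.asarray(preds, dtype=float))
    return float(corr)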