Example #1
 def test_get_embeddings(self):
     """The embedding-matrix row for "!" must match the known Senna vector."""
     vocab = SennaVocab()
     vocab.add("!")
     embeddings = vocab.get_embeddings()
     # Reference vector transcribed from the Senna embeddings file.
     raw = """
     -1.03682 1.77856 -0.693547 1.5948 1.5799 0.859243 1.15221 -0.976317 0.745304 -0.494589 0.308086 0.25239
     -0.1976 1.26203 0.813864 -0.940734 -0.215163 0.11645 0.525697 1.95766 0.394232 1.27717 0.710788 -0.389351
     0.161775 -0.106038 1.14148 0.607948 0.189781 -1.06022 0.280702 0.0251156 -0.198067 2.33027 0.408584
     0.350751 -0.351293 1.77318 -0.723457 -0.13806 -1.47247 0.541779 -2.57005 -0.227714 -0.817816 -0.552209
     0.360149 -0.10278 -0.36428 -0.64853
     """
     expected = np.array(list(map(float, raw.split())))
     row = embeddings[vocab.word2index["!"]]
     self.assertTrue(np.allclose(expected, row))
Example #2
 def test_get_embeddings(self):
     """Looking up "!" through the vocab must yield Senna's published vector."""
     v = SennaVocab()
     v.add("!")
     matrix = v.get_embeddings()
     # Reference values transcribed from the Senna embeddings file.
     tokens = """
     -1.03682 1.77856 -0.693547 1.5948 1.5799 0.859243 1.15221 -0.976317 0.745304 -0.494589 0.308086 0.25239
     -0.1976 1.26203 0.813864 -0.940734 -0.215163 0.11645 0.525697 1.95766 0.394232 1.27717 0.710788 -0.389351
     0.161775 -0.106038 1.14148 0.607948 0.189781 -1.06022 0.280702 0.0251156 -0.198067 2.33027 0.408584
     0.350751 -0.351293 1.77318 -0.723457 -0.13806 -1.47247 0.541779 -2.57005 -0.227714 -0.817816 -0.552209
     0.360149 -0.10278 -0.36428 -0.64853
     """.split()
     target = np.array([float(t) for t in tokens])
     self.assertTrue(np.allclose(target, matrix[v["!"]]))
Example #3
    train_file = os.path.join(mydir, 'SemEval2010_task8_training', 'TRAIN_FILE.TXT')
    test_file = os.path.join(mydir, 'SemEval2010_task8_testing_keys', 'TEST_FILE_FULL.TXT')
    logging.basicConfig(level=logging.INFO)

    logging.info('starting preprocessing')
    if os.path.isfile('train.json') and os.path.isfile('test.json') and not args['--force']:
        logging.info('train.json and test.json already exists. Skipping proprocessing.')
    else:
        nlp = English()
        with open('train.json', 'wb') as f:
            json.dump(parse_file(train_file, nlp), f, indent=2)
        with open('test.json', 'wb') as f:
            json.dump(parse_file(test_file, nlp), f, indent=2)

    logging.info('starting numericalization')
    word_vocab = SennaVocab()
    rel_vocab = Vocab()

    with open('train.json') as f:
        train = json.load(f)
    with open('test.json') as f:
        test = json.load(f)

    numericalize(train, word_vocab, rel_vocab, add=True)
    word_vocab = word_vocab.prune_rares(cutoff=2)
    word_vocab = word_vocab.sort_by_decreasing_count()
    rel_vocab = rel_vocab.sort_by_decreasing_count()
    train = numericalize(train, word_vocab, rel_vocab, add=False)
    test = numericalize(test, word_vocab, rel_vocab, add=False)

    with open('vocab.pkl', 'wb') as f: