def __init__(self, word_embed_size=100, pos_embed_size=100,
             word_embed_file=None,
             word_preprocess=text.replace_number,
             word_unknown="UNKNOWN",
             embed_dtype='float32'):
    super().__init__(reader=ConllReader())
    self.use_pretrained = word_embed_file is not None
    word_vocab = text.EmbeddingVocab(
        word_unknown, file=word_embed_file, dim=word_embed_size,
        dtype=embed_dtype, initializer=text.EmbeddingVocab.random_normal)
    pos_vocab = text.EmbeddingVocab(
        dim=pos_embed_size, dtype=embed_dtype,
        initializer=text.EmbeddingVocab.random_normal)
    self.add_processor('word', vocab=word_vocab, preprocess=word_preprocess)
    self.add_processor('pos', vocab=pos_vocab, preprocess=lambda x: x.lower())
    self.label_map = text.Dict()
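# The `word_preprocess=text.replace_number` default above normalizes numeric
# tokens before vocabulary lookup. A minimal sketch of such a preprocessor
# (hypothetical; the actual `text.replace_number` may use a different pattern
# and placeholder):
import re

_NUMBER_PATTERN = re.compile(r'^[0-9]+([.,][0-9]+)*$')

def replace_number(word):
    # Collapse numeric tokens onto a single placeholder so rare numbers
    # like "61" or "1,987.5" share one embedding row.
    return "<NUM>" if _NUMBER_PATTERN.match(word) else word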
def test_embeddings_serialize_deserialize(self):
    # With serialize_embeddings=False, the embedding matrix is dropped on
    # pickling; accessing it after deserialization raises RuntimeError.
    v1 = text.EmbeddingVocab(serialize_embeddings=False)
    self.assertEqual(v1.add("Pierre"), 1)
    self.assertEqual(v1.add("Vinken"), 2)
    self.assertEqual(v1.get_embeddings().shape, (3, 50))
    with tempfile.TemporaryFile() as f:
        pickle.dump(v1, f)
        self.assertEqual(v1.get_embeddings().shape, (3, 50))
        f.seek(0)
        v2 = pickle.load(f)
    self.assertIsNot(v1, v2)
    self.assertEqual(v1._dict, v2._dict)
    self.assertRaises(RuntimeError, lambda: v2.get_embeddings())
    self.assertEqual(v2.add("nonexecutive"), 3)
    self.assertEqual(v2.add("director"), 4)
    self.assertRaises(RuntimeError, lambda: v2.get_embeddings())
    # With serialize_embeddings=True, the matrix survives the pickle round
    # trip and keeps growing as new entries are added.
    v3 = text.EmbeddingVocab(serialize_embeddings=True, dim=16)
    self.assertEqual(v3.add("Pierre"), 1)
    self.assertEqual(v3.add("Vinken"), 2)
    self.assertEqual(v3.get_embeddings().shape, (3, 16))
    with tempfile.TemporaryFile() as f:
        pickle.dump(v3, f)
        self.assertEqual(v3.get_embeddings().shape, (3, 16))
        f.seek(0)
        v4 = pickle.load(f)
    self.assertIsNot(v3, v4)
    self.assertEqual(v3._dict, v4._dict)
    self.assertEqual(v4.get_embeddings().shape, (3, 16))
    self.assertEqual(v4.add("nonexecutive"), 3)
    self.assertEqual(v4.add("director"), 4)
    self.assertEqual(v4.get_embeddings().shape, (5, 16))
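# One way the `serialize_embeddings` flag could satisfy the contract tested
# above is via the pickle protocol: drop the matrix in `__getstate__` and
# raise on later access. A hypothetical sketch, not the library's actual code:
class _VocabPickleSketch:
    def __init__(self, embeddings, serialize_embeddings=True):
        self._embeddings = embeddings
        self._serialize_embeddings = serialize_embeddings

    def __getstate__(self):
        state = self.__dict__.copy()
        if not self._serialize_embeddings:
            state['_embeddings'] = None  # omit the (possibly huge) matrix
        return state

    def get_embeddings(self):
        if self._embeddings is None:
            raise RuntimeError("embeddings were not serialized")
        return self._embeddings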
def __init__(self, word_embed_size=100, postag_embed_size=50,
             char_embed_size=10, word_embed_file=None,
             word_preprocess=text.lower,
             word_unknown="<UNK>",
             filter_coord=False,
             format='tree'):
    if format == 'tree':
        read_genia = False
    elif format == 'genia':
        read_genia = True
    else:
        raise ValueError("Invalid data format: {}".format(format))
    super().__init__(reader=reader.ZipReader([
        GeniaReader() if read_genia else reader.TreeReader(),
        reader.CsvReader(delimiter=' '),
        reader.ContextualizedEmbeddingsReader(),
    ]))
    self.filter_coord = filter_coord
    self._updated = False
    self._postag_file = None
    self._cont_embed_file = None
    self._use_pretrained_embed = word_embed_file is not None
    self._read_genia = read_genia
    word_vocab = text.EmbeddingVocab(
        file=word_embed_file, unknown=word_unknown, dim=word_embed_size,
        initializer=text.EmbeddingVocab.random_normal,
        serialize_embeddings=True)
    postag_vocab = text.EmbeddingVocab(
        unknown=word_unknown, dim=postag_embed_size,
        initializer=text.EmbeddingVocab.random_normal,
        serialize_embeddings=True)
    char_vocab = text.EmbeddingVocab(
        unknown=word_unknown, dim=char_embed_size,
        initializer=text.EmbeddingVocab.random_normal,
        serialize_embeddings=True)
    # Ensure coordinating conjunctions and separators are always in-vocabulary.
    for word in CC_KEY + CC_SEP:
        word_vocab.add(word)
    self.char_pad_id = char_vocab.add(_CHAR_PAD)
    self.add_processor('word', word_vocab, preprocess=word_preprocess)
    self.add_processor('pos', postag_vocab, preprocess=False)
    self.add_processor('char', char_vocab, preprocess=False)
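# `reader.ZipReader` presumably yields one merged record per sentence by
# zipping the three underlying readers (trees, POS tag rows, contextualized
# embeddings). A minimal sketch under that assumption; the real reader may
# differ in how it pairs inputs:
class ZipReaderSketch:
    def __init__(self, readers):
        self.readers = readers

    def read(self, files):
        # `files` holds one path per underlying reader; items are paired
        # positionally, so all inputs must be sentence-aligned.
        for items in zip(*(r.read(f) for r, f in zip(self.readers, files))):
            yield items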
def __init__(self, word_embed_size=100, postag_embed_size=100,
             word_embed_file=None,
             word_preprocess=text.lower,
             word_unknown="<UNK>",
             input_file=None,
             min_frequency=2):
    super().__init__(reader=ConllReader())
    words = []
    if input_file is not None:
        # Fix the vocabulary to words seen at least `min_frequency` times
        # in the given training file.
        self._fix_word = True
        counter = Counter([word_preprocess(token['form'])
                           for sentence in self._reader.read(input_file)
                           for token in sentence])
        for word, count in counter.most_common():
            if count < min_frequency:
                break
            words.append(word)
    else:
        self._fix_word = False
    word_vocab = text.EmbeddingVocab.from_words(
        words, unknown=word_unknown, dim=word_embed_size,
        initializer=(text.EmbeddingVocab.random_normal
                     if word_embed_file is None else np.zeros),
        serialize_embeddings=True)
    pretrained_word_vocab = text.EmbeddingVocab(
        unknown=word_unknown, file=word_embed_file, dim=word_embed_size,
        initializer=np.zeros, serialize_embeddings=True)
    pretrained_word_vocab.add(word_preprocess("<ROOT>"))
    postag_vocab = text.EmbeddingVocab(
        unknown=word_unknown, dim=postag_embed_size,
        initializer=text.EmbeddingVocab.random_normal,
        serialize_embeddings=True)
    self.add_processor('word', word_vocab, preprocess=word_preprocess)
    self.add_processor('pre', pretrained_word_vocab, preprocess=word_preprocess)
    self.add_processor('pos', postag_vocab, preprocess=False)
    self.rel_map = text.Dict()
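# The `min_frequency` cutoff above relies on `Counter.most_common()` returning
# counts in descending order, so the loop can break at the first word that
# falls below the threshold. A small worked example:
from collections import Counter

tokens = ["the", "the", "the", "old", "old", "vinken"]
counter = Counter(tokens)
kept = []
for word, count in counter.most_common():
    if count < 2:
        break  # every remaining word is at least as rare
    kept.append(word)
assert kept == ["the", "old"]  # "vinken" (count 1) is excluded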
def __init__(self, word_embed_size=100, postag_embed_size=50,
             char_embed_size=10, word_embed_file=None,
             word_preprocess=text.lower,
             word_unknown="<UNK>",
             filter_coord=False,
             format='tree'):
    super().__init__(reader=None)
    self.init_reader(format)
    self.filter_coord = filter_coord
    self._mode = None
    self._updated = False
    self._postag_file = None
    self._cont_embed_file = None
    self._use_pretrained_embed = word_embed_file is not None
    word_vocab = text.EmbeddingVocab(
        file=word_embed_file, unknown=word_unknown, dim=word_embed_size,
        initializer=text.EmbeddingVocab.random_normal,
        serialize_embeddings=True)
    postag_vocab = text.EmbeddingVocab(
        unknown=word_unknown, dim=postag_embed_size,
        initializer=text.EmbeddingVocab.random_normal,
        serialize_embeddings=True)
    char_vocab = text.EmbeddingVocab(
        unknown=word_unknown, dim=char_embed_size,
        initializer=text.EmbeddingVocab.random_normal,
        serialize_embeddings=True)
    for word in CC_KEY + CC_SEP:
        word_vocab.add(word)
    self.char_pad_id = char_vocab.add(_CHAR_PAD)
    self.add_processor('word', word_vocab, preprocess=word_preprocess)
    self.add_processor('pos', postag_vocab, preprocess=False)
    self.add_processor('char', char_vocab, preprocess=False)
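# `init_reader` is not shown in this section. Given the `format` dispatch in
# the earlier constructor, it plausibly builds the same zipped reader stack;
# a hypothetical sketch mirroring that logic:
def init_reader(self, format):
    if format == 'tree':
        read_genia = False
    elif format == 'genia':
        read_genia = True
    else:
        raise ValueError("Invalid data format: {}".format(format))
    self._reader = reader.ZipReader([
        GeniaReader() if read_genia else reader.TreeReader(),
        reader.CsvReader(delimiter=' '),
        reader.ContextualizedEmbeddingsReader(),
    ])
    self._read_genia = read_genia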
def test_embeddings_from_file(self):
    raw = ("Pierre -0.00066023 -0.6566 0.27843 -0.14767\n"
           "Vinken 0.10204 -0.12792 -0.8443 -0.12181\n"
           "61 0.24968 -0.41242 0.1217 0.34527\n"
           "years -0.19181 -1.8823 -0.76746 0.099051\n"
           "old -0.52287 -0.31681 0.00059213 0.0074449\n")
    with tempfile.NamedTemporaryFile(mode='w') as f:
        f.write(raw)
        f.flush()
        v1 = text.EmbeddingVocab(file=f.name, dtype=np.float64)
        # Five file entries plus the reserved unknown entry give length 6.
        self.assertEqual(len(v1), 6)
        x1 = v1.get_embeddings()
        self.assertEqual(x1.shape, (6, 4))
        np.testing.assert_array_equal(
            x1[1], [0.10204, -0.12792, -0.8443, -0.12181])
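# Loading a GloVe-style text file like `raw` above reduces to one word plus
# its space-separated vector per line. A minimal stand-alone parser
# (hypothetical; the library's loader may handle headers, encodings, and
# duplicate words differently):
import numpy as np

def load_text_embeddings(path, dtype=np.float32):
    words, vectors = [], []
    with open(path) as f:
        for line in f:
            fields = line.rstrip().split(' ')
            words.append(fields[0])
            vectors.append(np.asarray(fields[1:], dtype=dtype))
    return words, np.stack(vectors)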
def test_embeddings(self):
    v = text.EmbeddingVocab(dim=32)
    self.assertEqual(v.add("Pierre"), 1)
    self.assertEqual(v.add("Vinken"), 2)
    self.assertEqual(v.get_embeddings().shape, (3, 32))
    self.assertEqual(v.add("nonexecutive"), 3)
    self.assertEqual(v.add("director"), 4)
    self.assertEqual(v.get_embeddings().shape, (5, 32))
    # Repeated calls without new entries return the same (cached) array.
    np.testing.assert_array_equal(v.get_embeddings(), v.get_embeddings())
    self.assertIs(v.get_embeddings(), v.get_embeddings())
    x1 = v.get_embeddings()
    self.assertEqual(v.add("61"), 5)
    self.assertEqual(v.add("years"), 6)
    self.assertEqual(v.add("old"), 7)
    x2 = v.get_embeddings()
    # Adding entries grows the matrix; existing rows are preserved.
    self.assertEqual(x2.shape, (8, 32))
    self.assertIsNot(x1, x2)
    np.testing.assert_array_equal(x1, x2[:5])
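# The behavior exercised above (ids starting at 1 after the reserved unknown
# entry, a cached matrix, and growth that preserves existing rows) can be
# captured in a small sketch (hypothetical, not the library implementation):
import numpy as np

class GrowingEmbeddingVocabSketch:
    def __init__(self, dim):
        self.dim = dim
        self._dict = {"<UNK>": 0}  # index 0 reserved for unknown words
        self._cache = None

    def add(self, word):
        if word not in self._dict:
            self._dict[word] = len(self._dict)
        return self._dict[word]

    def get_embeddings(self):
        # Return the cached matrix unless new entries require growing it;
        # growth copies existing rows so earlier vectors are preserved.
        if self._cache is None or len(self._cache) < len(self._dict):
            new = np.random.normal(size=(len(self._dict), self.dim))
            if self._cache is not None:
                new[:len(self._cache)] = self._cache
            self._cache = new
        return self._cache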