class Text2W2VEncoder:
    """Text encoder that represents a word list as the mean of its word2vec vectors."""

    def __init__(self, data_path):
        # BigFile is the on-disk word2vec store; shape() reports (vocab size, vector dim).
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        print("Text2W2VEncoder", "vocab_size", vocab_size, "dim", self.ndims)

    def encode(self, words):
        """Return a torch.Tensor of length `self.ndims`.

        Looks up `words` in the word2vec store and averages the vectors that
        were found; falls back to an all-zero vector when none of the words
        are in the vocabulary.
        """
        _mapped, vecs = self.w2v.read(words)
        if not vecs:
            # No word matched the vocabulary: use a zero embedding.
            return torch.Tensor(np.zeros([self.ndims]))
        return torch.Tensor(np.array(vecs).mean(axis=0))
class W2Vec(Txt2Vec):
    """Txt2Vec implementation backed by word2vec: encodes text as the mean word vector."""

    def __init__(self, data_path, norm=0, clean=True):
        super(W2Vec, self).__init__(data_path, norm, clean)
        # BigFile is the on-disk word2vec store; shape() reports (vocab size, vector dim).
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        logger.info('vob size: %d, vec dim: %d' % (vocab_size, self.ndims))

    def _encoding(self, words):
        """Return a numpy vector of length `self.ndims`.

        Averages the word2vec vectors of the words found in the vocabulary;
        yields an all-zero vector when no word matches.
        """
        _mapped, vecs = self.w2v.read(words)
        if not vecs:
            # No word matched the vocabulary: use a zero embedding.
            return np.zeros(self.ndims, )
        return np.array(vecs).mean(axis=0)