def load_embed_file(self):
    """Load pre-trained word vectors from ``self.embed_file``.

    Builds the ``self._str2embed`` / ``self._embed2str`` lookup tables, with
    the special tokens occupying indices ``[0, START_IDX)``, then reads one
    ``<word> <v1> ... <vk>`` row per non-empty line of the embedding file.
    Words present in ``self._str2idx`` but absent from the file get a random
    uniform(-1, 1) vector.  The resulting matrix is stored as
    ``self.pretrained_embeddings`` (float64).

    :raises ValueError: if a line of the embedding file is malformed.
    """
    self._str2embed = dict(zip(self.SPECIAL_TOKENS, range(self.START_IDX)))
    self._embed2str = dict(zip(range(self.START_IDX), self.SPECIAL_TOKENS))
    # Zero vectors reserved for the special tokens
    # (two rows here presumably means START_IDX == 2 -- TODO confirm).
    embeds = [[0] * self.words_dim, [0] * self.words_dim]
    with open(self.embed_file) as f:
        cur_idx = self.START_IDX
        for line_num, line in enumerate(f):
            line = line.strip().split()
            if not line:
                continue
            try:
                # BUG FIX: the original condition used `or`, which is always
                # True, so clean_str_sst was never applied to SST datasets.
                if self.dataset_type not in ('SST-1', 'SST-2'):
                    word = clean_str(line[0])
                else:
                    word = clean_str_sst(line[0])
                self._str2embed[word] = cur_idx
                self._embed2str[cur_idx] = word
                embeds.append(line[1:])
                cur_idx += 1
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit are not swallowed.
                raise ValueError(
                    'The embedding file is misformatted at line %d' % (line_num + 1))
    # Randomly initialize vectors for vocabulary words missing from the file.
    for word in self._str2idx.keys():
        if word not in self._str2embed:
            self._str2embed[word] = cur_idx
            self._embed2str[cur_idx] = word
            embeds.append(list(np.random.uniform(-1, 1, self.words_dim)))
            cur_idx += 1
    self.pretrained_embeddings = np.array(embeds, dtype=np.float64)
    del embeds
    return
def add_train_file(self):
    """Populate this vocabulary from ``self.train_file`` and index it.

    Each line is cleaned with ``clean_str`` and tokenized.  For a
    ``Targets`` vocabulary the first token of every line is added; for a
    ``Words`` vocabulary the word tokens are added, starting at index 2 for
    TREC (whose lines presumably carry an extra leading field -- confirm
    against the data format) and at index 1 otherwise.  Finishes by calling
    ``self.index_vocab()``.
    """
    # The two original branches were identical except for this start index.
    word_start = 2 if self.dataset_type == 'TREC' else 1
    with open(self.train_file) as f:
        for line in f:
            tokens = clean_str(line).split()
            if not tokens:
                continue
            if self.name == 'Targets':
                self.add(tokens[0])
            if self.name == 'Words':
                for word in tokens[word_start:]:
                    self.add(word)
    self.index_vocab()
def reading_dataset(self, filename):
    """Read one dataset file and feed its token lists to ``_process_buff``.

    SST datasets are cleaned with ``clean_str_sst`` and a line is kept only
    when it yields at least two tokens; all other datasets use ``clean_str``
    and keep any non-empty line.

    :param filename: path of the dataset file to read.
    :return: None
    """
    is_sst = self.dataset_type in ('SST-1', 'SST-2')
    clean = clean_str_sst if is_sst else clean_str
    # Original branches differed only in the cleaner and this threshold
    # (`len(line) > 1` for SST vs truthiness, i.e. `len(line) >= 1`).
    min_tokens = 2 if is_sst else 1
    buff = []
    with open(filename) as f:
        for line in f:
            tokens = clean(line).split()
            if len(tokens) >= min_tokens:
                buff.append(tokens)
    self._process_buff(buff)
    return