import os
import pickle as pk
import re

import torch
import torch.nn as nn

# Vocab, Glove and the global `config` dict are project-level dependencies of
# this module; they are referenced below but defined elsewhere in the repo.


class Base_Dataset:
    def __init__(self, trainset_path, testset_path, vocab_path,
                 dataset_name='', remove_entity_mention=False,
                 remove_stop_words=False):
        self.config = config[dataset_name]
        self.train_set, self.train_corpus = self.load_dataset(
            trainset_path, remove_entity_mention, remove_stop_words)
        self.test_set, self.test_corpus = self.load_dataset(
            testset_path, remove_entity_mention, remove_stop_words)
        self.corpus = self.train_corpus + self.test_corpus
        # if not os.path.isfile(vocab_path):
        #     self.__build_vocab(self.corpus, vocab_path)
        if os.path.isfile(vocab_path):
            self.vocab = Vocab(filename=vocab_path,
                               data=['<ukn>', '<ent>', '<num>'])
            self.unknown = self.vocab.getIndex('<ukn>')
            self.word_vectorizer = Glove(self.vocab, config['glove_path'],
                                         self.config['emb'])
            # Pre-encode every relation of every question into word ids.
            for qa_row in self.train_set + self.test_set:
                for relation in qa_row.sparql.relations:
                    relation.coded = self.decode(relation)
            # self.__update_relations_emb()
            # Encode the tokenized questions; unseen words map to <ukn>.
            self.coded_train_corpus = [[
                self.vocab.getIndex(word, self.unknown) for word in tokens
            ] for tokens in self.train_corpus]
            self.coded_test_corpus = [[
                self.vocab.getIndex(word, self.unknown) for word in tokens
            ] for tokens in self.test_corpus]
            self.vocab_path = vocab_path
        self.one_hop = None
        if os.path.isfile(self.config['entity_one_hop']):
            with open(self.config['entity_one_hop'], 'rb') as f:
                self.one_hop = pk.load(f)

    def decode(self, relation, max_length=3):
        # Convert the first `max_length` relation tokens to word ids and
        # zero-pad up to `max_length`.
        idxs = self.vocab.convertToIdx(
            map(str.lower, relation.tokens[:max_length]), self.unknown)
        length = len(idxs)
        if len(idxs) < max_length:
            idxs = idxs + [0] * (max_length - len(idxs))
        return torch.LongTensor(idxs), length

    def load_dataset(self, dataset_path, remove_entity_mention,
                     remove_stop_words):
        # Stub: concrete datasets override this and return
        # (qa_rows, tokenized_corpus).
        return [], []

    def __load_candidate_relations(self):
        vocab = set()
        # if not os.path.exists(self.config['rel2id']):
        #     for qa_row in self.train_set + self.test_set:
        #         for relation in qa_row.sparql.relations:
        #             vocab |= set(map(str.lower, relation.tokens))
        #     return vocab
        with open(self.config['rel2id'], 'rb') as f_h:
            rel2id = pk.load(f_h, encoding='latin1')
        for item_id, item in rel2id.items():
            words = [word.lower().replace('.', '') for word in item[2]]
            vocab |= set(words)
        if os.path.isfile(self.config['entity_one_hop']):
            with open(self.config['entity_one_hop'], 'rb') as f:
                one_hop = pk.load(f)
            print(len(vocab))
            for entity, uris in one_hop.items():
                for idx in range(len(uris)):
                    uri, label = uris[idx][:2]
                    # Split camelCase and snake_case labels into plain words.
                    label = re.sub(r"([A-Z])", r" \1",
                                   label).replace('_', ' ').replace('.', ' ')
                    words = list(map(str.lower, label.split(' ')))
                    vocab |= set(words)
            print(len(vocab))
        return vocab

    def __update_relations_emb(self):
        emb_shape = self.word_vectorizer.emb.shape
        emb = nn.Embedding(emb_shape[0], emb_shape[1], padding_idx=0,
                           sparse=False)
        emb.weight.data.copy_(self.word_vectorizer.emb)
        if torch.cuda.is_available():
            emb.cuda()
        with open(self.config['rel2id'], 'rb') as f_h:
            rel2id = pk.load(f_h, encoding='latin1')
        # Need to fix cases where there are non-alphabet chars in the label.
        max_length = 3
        for item_id, item in rel2id.items():
            if len(item[2]) > max_length:
                idxs = []
            else:
                idxs = [
                    self.vocab.getIndex(
                        word.lower().replace('.', '')
                        if not word.replace('.', '').replace('(', '').isdigit()
                        else '<num>') for word in item[2]
                ]
                idxs = [idx for idx in idxs if idx is not None]
            length = len(idxs)
            if length == 0:
                length = 1
            if len(idxs) < max_length:
                idxs = idxs + [0] * (max_length - len(idxs))
            idxs = torch.LongTensor(idxs)
            item[5] = idxs
            if len(item) == 6:
                item.append(length)
            else:
                item[6] = length
        with open(self.config['rel2id'], 'wb') as f_h:
            pk.dump(rel2id, f_h)

    def __build_vocab(self, lines, vocab_path):
        vocab = set()
        for tokens in lines:
            vocab |= set(tokens)
        relations_vocab = self.__load_candidate_relations()
        vocab |= relations_vocab
        # Drop purely numeric tokens; they are represented by <num>.
        vocab = [
            w for w in vocab
            if not w.replace('.', '').replace('(', '').isdigit()
        ]
        if '<ent>' in vocab:
            vocab.remove('<ent>')
        with open(vocab_path, 'w', encoding='utf-8') as f:
            for token in sorted(vocab):
                f.write(token + '\n')

    def find_one_hop_relations(self, entities):
        extra_candidates = []
        if self.one_hop is not None:
            for entity in entities:
                if entity in self.one_hop:
                    extra_candidates.extend(self.one_hop[entity])
        return extra_candidates
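
# Usage sketch (assumption, not part of the original code): a concrete dataset
# subclasses Base_Dataset and overrides load_dataset() so that it returns
# (qa_rows, tokenized_corpus). The class name, file format and paths below are
# hypothetical placeholders.
class ExamplePlainTextDataset(Base_Dataset):
    def load_dataset(self, dataset_path, remove_entity_mention,
                     remove_stop_words):
        qa_rows, corpus = [], []
        if os.path.isfile(dataset_path):
            with open(dataset_path, 'r', encoding='utf-8') as f:
                for line in f:
                    # One question per line; a real subclass would also build
                    # the qa_row objects (question text + SPARQL annotations).
                    corpus.append(line.strip().lower().split())
        return qa_rows, corpus

# dataset = ExamplePlainTextDataset('data/train.txt', 'data/test.txt',
#                                   'data/vocab.txt', dataset_name='example')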
# GloVe embedding initialisation and IMDB dataset caching for the TreeLSTM
# classifier; `vocab`, `glove_vocab`, `glove_emb`, `emb`, `emb_file`, `args`
# and `logger` are defined earlier in this script.

# zero out the embeddings for padding and other special words if they are
# absent in vocab
for idx, item in enumerate([
        Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
        Constants.EOS_WORD
]):
    if idx == 0:
        emb[idx].fill_(10e-3)
    if idx == 1:
        emb[idx].fill_(10e-1)
    if idx == 2:
        emb[idx].fill_(1)
    if idx == 3:
        emb[idx].fill_(2)
# Copy the GloVe vector for every vocabulary word that GloVe knows about.
for word in vocab.labelToIdx.keys():
    if glove_vocab.getIndex(word):
        emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
torch.save(emb, emb_file)

# build dataset for treelstm: load the IMDB splits, caching the preprocessed
# dataset so later runs can skip the expensive construction.
train_dir = classificationConfig.token_file_labels[0]
train_file = os.path.join(Global.external_tools, 'imdb_train.pth')
if os.path.isfile(train_file):
    train_dataset = torch.load(train_file)
else:
    train_dataset = IMDBdataset(train_dir, vocab, args.num_classes)
    torch.save(train_dataset, train_file)
# train_dataset = torch.load(train_file)
logger.debug('==> Size of train data : %d ' % len(train_dataset))
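
# Sketch (assumption): one way the embedding tensor cached above could be
# reloaded when the classifier is built. The helper name is hypothetical, and
# padding_idx=0 assumes the PAD token occupies index 0, matching the order of
# the special-token loop above.
def load_pretrained_embedding(emb_file, padding_idx=0):
    pretrained = torch.load(emb_file)
    layer = nn.Embedding(pretrained.size(0), pretrained.size(1),
                         padding_idx=padding_idx)
    # Initialise the layer weights with the cached GloVe vectors.
    layer.weight.data.copy_(pretrained)
    return layer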