class WikiQA(CastorPairDataset):
    NAME = 'wikiqa'
    NUM_CLASSES = 2
    ID_FIELD = Field(sequential=False, dtype=torch.float, use_vocab=False, batch_first=True)
    AID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    # Tokenizer is the identity since the input is already tokenized to compute external features
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)
    EXT_FEATS_FIELD = Field(dtype=torch.float, use_vocab=False, batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    RAW_TEXT_FIELD = RawField()
    VOCAB_SIZE = 0

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a WikiQA dataset instance.
        """
        super(WikiQA, self).__init__(path)

    @classmethod
    def splits(cls, path, train='train', validation='dev', test='test', **kwargs):
        return super().splits(path, train=train, validation=validation, test=test, **kwargs)

    @classmethod
    def iters(cls, path, vectors_name, vectors_dir, batch_size=64, shuffle=True, device=0, pt_file=False,
              vectors=None, unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_dir: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param pt_file: if True, load a cached embedding file from disk
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vectors for OOV words
        :return: BucketIterators for the train, validation, and test splits
        """
        train, validation, test = cls.splits(path)

        if not pt_file:
            if vectors is None:
                vectors = Vectors(name=vectors_name, cache=vectors_dir, unk_init=unk_init)
            cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
        else:
            cls.TEXT_FIELD.build_vocab(train, validation, test)
            cls.TEXT_FIELD = cls.set_vectors(cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))

        cls.LABEL_FIELD.build_vocab(train, validation, test)
        cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)

        return BucketIterator.splits((train, validation, test), batch_size=batch_size, repeat=False,
                                     shuffle=shuffle, sort_within_batch=True, device=device)
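# Usage sketch (an assumption, not part of the original file): how WikiQA.iters might be driven
# from a training script. The data directory and embedding file names below are placeholders.
if __name__ == '__main__':
    train_iter, dev_iter, test_iter = WikiQA.iters(
        'data/wikiqa',           # hypothetical directory holding the train/dev/test splits
        'word2vec.wikiqa.txt',   # hypothetical word-vector file
        'data/embeddings',       # hypothetical cache directory for the vectors
        batch_size=64)
    for batch in train_iter:
        # Each batch exposes the declared fields, e.g. sentence_1, sentence_2, ext_feats, label.
        print(batch.sentence_1.shape, batch.label.shape)
        break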
def iters(cls, batch_size: int = 32, device: int = 0, root: str = '.data',
          vectors: Tensor = None, **kwargs) -> Tuple[Iterator, Iterator, Iterator]:
    text = Field()
    label = Field(sequential=False)

    train, valid, test = cls.splits(text, label, root=root, **kwargs)

    text.build_vocab(train, vectors=vectors)
    label.build_vocab(train)

    # Return one iterator per split, matching the annotated three-tuple return type.
    return BucketIterator.splits(
        (train, valid, test), batch_size=batch_size, device=device)
class SICK(CastorPairDataset):
    NAME = 'sick'
    NUM_CLASSES = 5
    ID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    # Tokenizer is the identity since the input is already tokenized to compute external features
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)
    EXT_FEATS_FIELD = Field(dtype=torch.float, use_vocab=False, batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, dtype=torch.float, use_vocab=False, batch_first=True,
                        postprocessing=Pipeline(get_class_probs))
    RAW_TEXT_FIELD = RawField()

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a SICK dataset instance.
        """
        super().__init__(path)

    @classmethod
    def splits(cls, path, train='train', validation='dev', test='test', **kwargs):
        return super(SICK, cls).splits(path, train=train, validation=validation, test=test, **kwargs)

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vectors for OOV words
        :return: BucketIterators for the train, validation, and test splits
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)

        train, val, test = cls.splits(path)
        cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)

        return BucketIterator.splits((train, val, test), batch_size=batch_size, repeat=False,
                                     shuffle=shuffle, sort_within_batch=True, device=device)
class Semeval(Dataset):
    NAME = 'Semeval'
    NUM_CLASSES = 2
    QID_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True)
    QAID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    # Tokenizer is the identity since the input is already tokenized to compute external features
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)
    EXT_FEATS_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True,
                            tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True,
                        postprocessing=Pipeline(get_class_probs))
    RAW_TEXT_FIELD = RawField()

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path, **kwargs):
        """
        Create a Semeval dataset instance.
        """
        fields = [('qid', self.QID_FIELD), ('qaid', self.QID_FIELD), ('label', self.LABEL_FIELD),
                  ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD),
                  ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD)]

        examples = []
        with open(path) as infile:
            for line in infile:
                content = json.loads(line)

                sent_list_1 = content['question']
                sent_list_2 = content['qaquestion']

                word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
                overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)
                # NOTE: the computed overlap features are discarded here and an empty list is stored instead
                overlap_feats = []

                values = [content['qid'], content['qaid'], content['qarel'], content['question'],
                          content['qaquestion'], ' '.join(content['question']),
                          ' '.join(content['qaquestion']), overlap_feats]

                examples.append(Example.fromlist(values, fields))

        super(Semeval, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, path, train='train_2016.json', validation='dev_2016.json', test='test_2017.json', **kwargs):
        return super(Semeval, cls).splits(path, train=train, validation=validation, test=test, **kwargs)

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vectors for OOV words
        :return: BucketIterators for the train, validation, and test splits
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)

        train, validation, test = cls.splits(path)
        cls.LABEL_FIELD.build_vocab(train, validation, test)
        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)

        return BucketIterator.splits((train, validation, test), batch_size=batch_size, repeat=False,
                                     shuffle=shuffle, sort_within_batch=True, device=device)
                    choices=['DROPOUT', 'BN_RELU', 'RELU_BN'], default='DROPOUT', type=str)
parser.add_argument('--dropout', action='store', default=0.5, type=float)
parser.add_argument('--wd', action='store', default=1e-4, type=float)
parser.add_argument('--model-conf', action='store', default=None, type=str)
params = parser.parse_args()

kvs = [(k, v) for k, v in vars(params).items()]
kvs.append(('Device', device))
print_kv_box('Current Configuration', kvs)

if params.mode == 'debug':
    tokenizer = WordToCharTokenizer()
    text_field = Field(tokenize=tokenizer, batch_first=True)
    ds = RandomizedTextWindowDataset(params.dataset, text_field, params.window_size,
                                     topk=params.topk, newline_eos=False)
    text_field.build_vocab(ds)
    train_ds, test_ds = ds.split(0.8)
    iterator = NoisedPreWindowedIterator(train_ds, params.batch_size, params.window_size, 0.0)
    iterator = PredictMiddleNoisedWindowIterator(iterator, 1)
    for b in iterator:
        print(b)
        i = 1
    # model = MLP(51, 27, 1024, 3)
    # text_field = Field(tokenize=tokenize, batch_first=True)
class TRECQA(CastorPairDataset):
    NAME = 'trecqa'
    NUM_CLASSES = 2
    ID_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True)
    AID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    # Tokenizer is the identity since the input is already tokenized to compute external features
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)
    EXT_FEATS_FIELD = Field(tensor_type=torch.FloatTensor, use_vocab=False, batch_first=True, tokenize=lambda x: x,
                            postprocessing=Pipeline(lambda arr, _, train: [float(y) for y in arr]))
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    VOCAB_SIZE = 0

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a TRECQA dataset instance.
        """
        super(TRECQA, self).__init__(path, load_ext_feats=True)

    @classmethod
    def splits(cls, path, train='train-all', validation='raw-dev', test='raw-test', **kwargs):
        return super(TRECQA, cls).splits(path, train=train, validation=validation, test=test, **kwargs)

    @classmethod
    def set_vectors(cls, field, vector_path):
        if os.path.isfile(vector_path):
            stoi, vectors, dim = torch.load(vector_path)
            field.vocab.vectors = torch.Tensor(len(field.vocab), dim)

            for i, token in enumerate(field.vocab.itos):
                wv_index = stoi.get(token, None)
                if wv_index is not None:
                    field.vocab.vectors[i] = vectors[wv_index]
                else:
                    # initialize <unk> with uniform_(-0.05, 0.05) vectors
                    field.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.05, 0.05)
        else:
            print("Error: Need word embedding pt file")
            exit(1)
        return field

    @classmethod
    def iters(cls, path, vectors_name, vectors_dir, batch_size=64, shuffle=True, device=0, pt_file=False,
              vectors=None, unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_dir: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param pt_file: if True, load a cached embedding file from disk
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vectors for OOV words
        :return: BucketIterators for the train, validation, and test splits
        """
        train, validation, test = cls.splits(path)

        if not pt_file:
            if vectors is None:
                vectors = Vectors(name=vectors_name, cache=vectors_dir, unk_init=unk_init)
            cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
        else:
            cls.TEXT_FIELD.build_vocab(train, validation, test)
            cls.TEXT_FIELD = cls.set_vectors(cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))

        cls.LABEL_FIELD.build_vocab(train, validation, test)
        cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)

        return BucketIterator.splits((train, validation, test), batch_size=batch_size, repeat=False,
                                     shuffle=shuffle, device=device)
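# Usage sketch (an assumption, not part of the original file): loading TRECQA with a cached
# embedding .pt file so that vocabulary vectors go through set_vectors() instead of torchtext's
# Vectors loader. The directory and file names below are placeholders.
if __name__ == '__main__':
    train_iter, dev_iter, test_iter = TRECQA.iters(
        'data/trecqa',          # hypothetical directory containing train-all/raw-dev/raw-test
        'word2vec.trecqa.pt',   # hypothetical cached (stoi, vectors, dim) tuple saved with torch.save
        'data/embeddings',      # hypothetical directory holding the cached file
        batch_size=64,
        pt_file=True)           # route vector lookup through TRECQA.set_vectors
    print('vocab size:', TRECQA.VOCAB_SIZE)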
class MSRP(Dataset):
    NAME = 'msrp'
    NUM_CLASSES = 2
    EXT_FEATS = 6
    ID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    # Tokenizer is the identity since the input is already tokenized
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)
    EXT_FEATS_FIELD = Field(dtype=torch.float32, use_vocab=False, batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    RAW_TEXT_FIELD = RawField()
    NUMBER_PATTERN = re.compile(r'((\d+,)*\d+\.?\d*)')

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create an MSRP dataset instance.
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD),
                  ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD)]

        examples = []
        with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2:
            sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
            sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        self.word_to_doc_cnt = word_to_doc_cnt

        with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
            for pair_id, l1, l2, label in zip(id_file, sent_list_1, sent_list_2, label_file):
                pair_id = pair_id.rstrip('.\n')
                label = label.rstrip('.\n')
                ext_feats = []

                # Number features
                sent1_nums, sent2_nums = [], []
                match = self.NUMBER_PATTERN.search(' '.join(l1))
                if match:
                    for g in match.groups():
                        if g is not None:
                            sent1_nums.append(g)

                match = self.NUMBER_PATTERN.search(' '.join(l2))
                if match:
                    for g in match.groups():
                        if g is not None:
                            sent2_nums.append(g)

                sent1_nums = set(sent1_nums)
                sent2_nums = set(sent2_nums)
                exact = int(sent1_nums == sent2_nums)
                superset = int(sent1_nums.issuperset(sent2_nums) or sent2_nums.issuperset(sent1_nums))
                ext_feats.append(1 if (exact or (len(sent1_nums) == 0 and len(sent2_nums) == 0)) else 0)
                ext_feats.append(exact)
                ext_feats.append(superset)

                # Length difference
                ext_feats.append(len(l2) - len(l1))

                # Overlap
                overlap = len(set(l1) & set(l2))
                ext_feats.append(overlap / len(l1))
                ext_feats.append(overlap / len(l2))

                example = Example.fromlist([pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)], fields)
                examples.append(example)

        super(MSRP, self).__init__(examples, fields)

    @classmethod
    def _read_file(cls, fn):
        lines = []
        with open(fn, 'r') as f:
            for line in f:
                lines.append(line)
        return lines

    @classmethod
    def splits(cls, path, train='train', test='test', **kwargs):
        # Create temporary files to split train into train and dev
        uid = uuid.uuid4()
        train_tmp, dev_tmp = f'{train}-tmp-{uid}', f'dev-tmp-{uid}'
        pathlib.Path(os.path.join(path, train_tmp)).mkdir(parents=True, exist_ok=True)
        pathlib.Path(os.path.join(path, dev_tmp)).mkdir(parents=True, exist_ok=True)

        train_id = cls._read_file(os.path.join(path, train, 'id.txt'))
        train_a_toks = cls._read_file(os.path.join(path, train, 'a.toks'))
        train_b_toks = cls._read_file(os.path.join(path, train, 'b.toks'))
        train_sim = cls._read_file(os.path.join(path, train, 'sim.txt'))

        dev_lines = np.random.choice(np.arange(len(train_id)), size=400, replace=False)

        train_tmp_id_path = os.path.join(path, train_tmp, 'id.txt')
        train_tmp_sim_path = os.path.join(path, train_tmp, 'sim.txt')
        train_tmp_a_toks = os.path.join(path, train_tmp, 'a.toks')
        train_tmp_b_toks = os.path.join(path, train_tmp, 'b.toks')
        dev_tmp_id_path = os.path.join(path, dev_tmp, 'id.txt')
        dev_tmp_sim_path = os.path.join(path, dev_tmp, 'sim.txt')
        dev_tmp_a_toks = os.path.join(path, dev_tmp, 'a.toks')
        dev_tmp_b_toks = os.path.join(path, dev_tmp, 'b.toks')

        counter = 0
        with open(train_tmp_id_path, 'w') as tid, open(train_tmp_sim_path, 'w') as tsim, \
                open(train_tmp_a_toks, 'w') as ta, open(train_tmp_b_toks, 'w') as tb, \
                open(dev_tmp_id_path, 'w') as did, open(dev_tmp_sim_path, 'w') as dsim, \
                open(dev_tmp_a_toks, 'w') as da, open(dev_tmp_b_toks, 'w') as db:
            for i, (pid, sa, sb, sim) in enumerate(zip(train_id, train_a_toks, train_b_toks, train_sim)):
                counter += 1
                if i in dev_lines:
                    did.write(pid)
                    dsim.write(sim)
                    da.write(sa)
                    db.write(sb)
                else:
                    tid.write(pid)
                    tsim.write(sim)
                    ta.write(sa)
                    tb.write(sb)

        split_results = super(MSRP, cls).splits(path, train=train_tmp, validation=dev_tmp, test=test, **kwargs)
        shutil.rmtree(os.path.join(path, train_tmp))
        shutil.rmtree(os.path.join(path, dev_tmp))
        return split_results

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, device, batch_size=64, shuffle=True, vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to word vectors file
        :param device: PyTorch device
        :param batch_size: batch size
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vectors for OOV words
        :return: BucketIterators for the train, validation, and test splits
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)

        train, validation, test = cls.splits(path)
        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)

        return BucketIterator.splits((train, validation, test), batch_size=batch_size, repeat=False,
                                     shuffle=shuffle, device=device)
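# Usage sketch (an assumption, not part of the original file): MSRP.splits() carves a temporary
# 400-example dev set out of the training directory, so a single call to iters() already yields
# three iterators. Paths and the embedding file name below are placeholders.
if __name__ == '__main__':
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    train_iter, dev_iter, test_iter = MSRP.iters(
        'data/msrp',              # hypothetical directory containing the train/ and test/ folders
        'glove.840B.300d.txt',    # word-vector file loadable by torchtext's Vectors
        'data/embeddings',        # hypothetical cache directory for the vectors
        device,
        batch_size=64)
    for batch in train_iter:
        # ext_feats holds the six hand-crafted features built in __init__ (EXT_FEATS = 6)
        print(batch.ext_feats.shape)
        break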