Example #1
class WikiQA(CastorPairDataset):
    NAME = 'wikiqa'
    NUM_CLASSES = 2
    ID_FIELD = Field(sequential=False, dtype=torch.float, use_vocab=False, batch_first=True)
    AID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)  # tokenizer is identity since we already tokenized it to compute external features
    EXT_FEATS_FIELD = Field(dtype=torch.float, use_vocab=False, batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    RAW_TEXT_FIELD = RawField()
    VOCAB_SIZE = 0

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a WikiQA dataset instance
        """
        super(WikiQA, self).__init__(path)

    @classmethod
    def splits(cls, path, train='train', validation='dev', test='test', **kwargs):
        return super().splits(path, train=train, validation=validation, test=test, **kwargs)

    @classmethod
    def iters(cls, path, vectors_name, vectors_dir, batch_size=64, shuffle=True, device=0, pt_file=False, vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_dir: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param pt_file: if True, load cached embeddings from a .pt file on disk
        :param unk_init: function used to generate vector for OOV words
        :return: train, validation, and test BucketIterators
        """

        train, validation, test = cls.splits(path)
        if not pt_file:
            if vectors is None:
                vectors = Vectors(name=vectors_name, cache=vectors_dir, unk_init=unk_init)
            cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
        else:
            cls.TEXT_FIELD.build_vocab(train, validation, test)
            cls.TEXT_FIELD = cls.set_vectors(cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))

        cls.LABEL_FIELD.build_vocab(train, validation, test)

        cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)

        return BucketIterator.splits((train, validation, test), batch_size=batch_size, repeat=False, shuffle=shuffle,
                                     sort_within_batch=True, device=device)
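
A minimal usage sketch for the WikiQA loader above, assuming the class is importable and that the data directory and embedding file names below (placeholders, not paths the snippet defines) exist on disk:

# Hypothetical driver for Example #1; module path, data directory and embedding
# file names are placeholders.
import torch

from datasets.wikiqa import WikiQA  # assumed module location

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
train_iter, dev_iter, test_iter = WikiQA.iters(
    'data/wikiqa',             # directory containing the train/dev/test splits
    'glove.840B.300d.txt',     # word-vectors file name
    'embeddings',              # directory caching the word-vectors file
    batch_size=64,
    device=device)

print('vocab size:', WikiQA.VOCAB_SIZE)
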
Example #2
    @classmethod
    def iters(cls, batch_size: int = 32, device: int = 0, root: str = '.data',
              vectors: Tensor = None, **kwargs) -> Tuple[Iterator, Iterator, Iterator]:

        text = Field()
        label = Field(sequential=False)

        train, valid, test = cls.splits(text, label, root=root, **kwargs)

        text.build_vocab(train, vectors=vectors)
        label.build_vocab(train)

        # return iterators for all three splits, matching the annotated return type
        return BucketIterator.splits(
            (train, valid, test), batch_size=batch_size, device=device)
Example #3
class SICK(CastorPairDataset):
    NAME = 'sick'
    NUM_CLASSES = 5
    ID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)  # tokenizer is identity since we already tokenized it to compute external features
    EXT_FEATS_FIELD = Field(dtype=torch.float, use_vocab=False, batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, dtype=torch.float, use_vocab=False, batch_first=True, postprocessing=Pipeline(get_class_probs))
    RAW_TEXT_FIELD = RawField()

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a SICK dataset instance
        """
        super().__init__(path)

    @classmethod
    def splits(cls, path, train='train', validation='dev', test='test', **kwargs):
        return super(SICK, cls).splits(path, train=train, validation=validation, test=test, **kwargs)

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return: train, validation, and test BucketIterators
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)

        train, val, test = cls.splits(path)

        cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)

        return BucketIterator.splits((train, val, test), batch_size=batch_size, repeat=False, shuffle=shuffle,
                                     sort_within_batch=True, device=device)
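
The vectors argument of SICK.iters accepts any torchtext Vectors object, so a predefined set such as GloVe can be passed directly instead of naming a raw vectors file. A hedged sketch; the data directory is a placeholder:

# Hypothetical: supply a predefined torchtext GloVe object; vectors_name and
# vectors_cache are then ignored by SICK.iters (see the None check above).
from torchtext.vocab import GloVe

glove = GloVe(name='840B', dim=300, cache='embeddings')  # downloaded/cached on first use
train_iter, dev_iter, test_iter = SICK.iters(
    'data/sick',          # placeholder data directory
    vectors_name=None,
    vectors_cache=None,
    vectors=glove)
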
Example #4
class Semeval(Dataset):
    NAME = 'Semeval'
    NUM_CLASSES = 2
    QID_FIELD = Field(sequential=False,
                      tensor_type=torch.FloatTensor,
                      use_vocab=False,
                      batch_first=True)
    QAID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(
        batch_first=True, tokenize=lambda x: x
    )  # tokenizer is identity since we already tokenized it to compute external features
    EXT_FEATS_FIELD = Field(sequential=False,
                            tensor_type=torch.FloatTensor,
                            use_vocab=False,
                            batch_first=True,
                            tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False,
                        tensor_type=torch.FloatTensor,
                        use_vocab=False,
                        batch_first=True,
                        postprocessing=Pipeline(get_class_probs))
    RAW_TEXT_FIELD = RawField()

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path, **kwargs):
        """
        Create a Semeval dataset instance
        """

        fields = [('qid', self.QID_FIELD), ('qaid', self.QAID_FIELD),
                  ('label', self.LABEL_FIELD), ('sentence_1', self.TEXT_FIELD),
                  ('sentence_2', self.TEXT_FIELD),
                  ('sentence_1_raw', self.RAW_TEXT_FIELD),
                  ('sentence_2_raw', self.RAW_TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD)]

        examples = []

        with open(path) as infile:
            for line in infile:
                content = json.loads(line)

                sent_list_1 = content['question']
                sent_list_2 = content['qaquestion']

                word_to_doc_cnt = get_pairwise_word_to_doc_freq(
                    sent_list_1, sent_list_2)
                overlap_feats = get_pairwise_overlap_features(
                    sent_list_1, sent_list_2, word_to_doc_cnt)
                overlap_feats = []  # the computed overlap features are discarded; ext_feats stays empty
                values = [
                    content['qid'], content['qaid'], content['qarel'],
                    content['question'], content['qaquestion'],
                    ' '.join(content['question']),
                    ' '.join(content['qaquestion']), overlap_feats
                ]

                examples.append(Example.fromlist(values, fields))

        super(Semeval, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls,
               path,
               train='train_2016.json',
               validation='dev_2016.json',
               test='test_2017.json',
               **kwargs):
        return super(Semeval, cls).splits(path,
                                          train=train,
                                          validation=validation,
                                          test=test,
                                          **kwargs)

    @classmethod
    def iters(cls,
              path,
              vectors_name,
              vectors_cache,
              batch_size=64,
              shuffle=True,
              device=0,
              vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return: train, validation, and test BucketIterators
        """

        if vectors is None:
            vectors = Vectors(name=vectors_name,
                              cache=vectors_cache,
                              unk_init=unk_init)

        train, validation, test = cls.splits(path)

        cls.LABEL_FIELD.build_vocab(train, validation, test)
        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
        return BucketIterator.splits((train, validation, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     sort_within_batch=True,
                                     device=device)
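
Each batch produced by the iterators exposes the attributes declared in the fields list of the Semeval example (qid, qaid, label, sentence_1, sentence_2, the raw strings, and ext_feats). A minimal, hypothetical consumption loop; the paths are placeholders and the model call is only a stand-in:

# Hypothetical consumption of the Semeval iterators from the example above.
train_iter, dev_iter, test_iter = Semeval.iters(
    'data/semeval',            # placeholder directory holding the 2016/2017 JSON files
    'glove.840B.300d.txt',     # placeholder word-vectors file name
    'embeddings')              # placeholder cache directory

for batch in train_iter:
    sent1 = batch.sentence_1   # batch-first LongTensor of token ids
    sent2 = batch.sentence_2
    labels = batch.label       # class probabilities from the get_class_probs pipeline
    # scores = model(sent1, sent2)  # 'model' would be a user-supplied nn.Module
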
Example #5
                    choices=['DROPOUT', 'BN_RELU', 'RELU_BN'],
                    default='DROPOUT',
                    type=str)
parser.add_argument('--dropout', action='store', default=0.5, type=float)
parser.add_argument('--wd', action='store', default=1e-4, type=float)
parser.add_argument('--model-conf', action='store', default=None, type=str)
params = parser.parse_args()

kvs = [(k, v) for k, v in vars(params).items()]
kvs.append(('Device', device))

print_kv_box('Current Configuration', kvs)

if params.mode == 'debug':
    tokenizer = WordToCharTokenizer()
    text_field = Field(tokenize=tokenizer, batch_first=True)
    ds = RandomizedTextWindowDataset(params.dataset,
                                     text_field,
                                     params.window_size,
                                     topk=params.topk,
                                     newline_eos=False)
    text_field.build_vocab(ds)
    train_ds, test_ds = ds.split(0.8)
    iterator = NoisedPreWindowedIterator(train_ds, params.batch_size,
                                         params.window_size, 0.0)
    iterator = PredictMiddleNoisedWindowIterator(iterator, 1)
    for b in iterator:
        print(b)
    i = 1
    # model = MLP(51, 27, 1024, 3)
    # text_field = Field(tokenize=tokenize, batch_first=True)
Example #6
class TRECQA(CastorPairDataset):
    NAME = 'trecqa'
    NUM_CLASSES = 2
    ID_FIELD = Field(sequential=False,
                     tensor_type=torch.FloatTensor,
                     use_vocab=False,
                     batch_first=True)
    AID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(
        batch_first=True, tokenize=lambda x: x
    )  # tokenizer is identity since we already tokenized it to compute external features
    EXT_FEATS_FIELD = Field(
        tensor_type=torch.FloatTensor,
        use_vocab=False,
        batch_first=True,
        tokenize=lambda x: x,
        postprocessing=Pipeline(lambda arr, _, train: [float(y) for y in arr]))
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    VOCAB_SIZE = 0

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a TRECQA dataset instance
        """
        super(TRECQA, self).__init__(path, load_ext_feats=True)

    @classmethod
    def splits(cls,
               path,
               train='train-all',
               validation='raw-dev',
               test='raw-test',
               **kwargs):
        return super(TRECQA, cls).splits(path,
                                         train=train,
                                         validation=validation,
                                         test=test,
                                         **kwargs)

    @classmethod
    def set_vectors(cls, field, vector_path):
        if os.path.isfile(vector_path):
            stoi, vectors, dim = torch.load(vector_path)
            field.vocab.vectors = torch.Tensor(len(field.vocab), dim)

            for i, token in enumerate(field.vocab.itos):
                wv_index = stoi.get(token, None)
                if wv_index is not None:
                    field.vocab.vectors[i] = vectors[wv_index]
                else:
                    # initialize <unk> with uniform_(-0.05, 0.05) vectors
                    field.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(
                        -0.05, 0.05)
        else:
            print("Error: Need word embedding pt file")
            exit(1)
        return field

    @classmethod
    def iters(cls,
              path,
              vectors_name,
              vectors_dir,
              batch_size=64,
              shuffle=True,
              device=0,
              pt_file=False,
              vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_dir: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param pt_file: if True, load cached embeddings from a .pt file on disk
        :param unk_init: function used to generate vector for OOV words
        :return: train, validation, and test BucketIterators
        """

        train, validation, test = cls.splits(path)
        if not pt_file:
            if vectors is None:
                vectors = Vectors(name=vectors_name,
                                  cache=vectors_dir,
                                  unk_init=unk_init)
            cls.TEXT_FIELD.build_vocab(train,
                                       validation,
                                       test,
                                       vectors=vectors)
        else:
            cls.TEXT_FIELD.build_vocab(train, validation, test)
            cls.TEXT_FIELD = cls.set_vectors(
                cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))

        cls.LABEL_FIELD.build_vocab(train, validation, test)

        cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)

        return BucketIterator.splits((train, validation, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     device=device)
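
When pt_file=True, TRECQA.set_vectors expects a cached file holding a (stoi, vectors, dim) triple. A hedged sketch of how such a cache could be produced from a plain-text embedding file (assuming one token followed by its vector per line); input and output paths are placeholders:

# Hypothetical script that builds the (stoi, vectors, dim) cache read by set_vectors.
import torch

stoi, rows = {}, []
with open('embeddings/word_vectors.txt') as f:       # placeholder plain-text embedding file
    for line in f:
        parts = line.rstrip().split(' ')
        stoi[parts[0]] = len(rows)                   # token -> row index
        rows.append(torch.tensor([float(x) for x in parts[1:]]))

vectors = torch.stack(rows)
torch.save((stoi, vectors, vectors.size(1)), 'embeddings/word_vectors.pt')

# later: TRECQA.iters('data/trecqa', 'word_vectors.pt', 'embeddings', pt_file=True)
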
Example #7
class MSRP(Dataset):
    NAME = 'msrp'
    NUM_CLASSES = 2
    EXT_FEATS = 6
    ID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)  # tokenizer is identity since we already tokenized it
    EXT_FEATS_FIELD = Field(dtype=torch.float32, use_vocab=False, batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    RAW_TEXT_FIELD = RawField()

    NUMBER_PATTERN = re.compile(r'((\d+,)*\d+\.?\d*)')

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a MSRP dataset instance
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD),
                ('label', self.LABEL_FIELD), ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD)]

        examples = []
        with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2:
            sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
            sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        self.word_to_doc_cnt = word_to_doc_cnt

        with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
            for pair_id, l1, l2, label in zip(id_file, sent_list_1, sent_list_2, label_file):
                pair_id = pair_id.rstrip('.\n')
                label = label.rstrip('.\n')
                ext_feats = []

                # Number features
                sent1_nums, sent2_nums = [], []
                match = self.NUMBER_PATTERN.search(' '.join(l1))
                if match:
                    for g in match.groups():
                        if g is not None:
                            sent1_nums.append(g)

                match = self.NUMBER_PATTERN.search(' '.join(l2))
                if match:
                    for g in match.groups():
                        if g is not None:
                            sent2_nums.append(g)

                sent1_nums = set(sent1_nums)
                sent2_nums = set(sent2_nums)
                exact = int(sent1_nums == sent2_nums)
                superset = int(sent1_nums.issuperset(sent2_nums) or sent2_nums.issuperset(sent1_nums))
                ext_feats.append(1 if (exact or (len(sent1_nums) == 0 and len(sent2_nums) == 0)) else 0)
                ext_feats.append(exact)
                ext_feats.append(superset)

                # Length difference
                ext_feats.append(len(l2) - len(l1))

                # Overlap
                overlap = len(set(l1) & set(l2))
                ext_feats.append(overlap / len(l1))
                ext_feats.append(overlap / len(l2))

                example = Example.fromlist([pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)], fields)
                examples.append(example)

        super(MSRP, self).__init__(examples, fields)

    @classmethod
    def _read_file(cls, fn):
        lines = []
        with open(fn, 'r') as f:
            for line in f:
                lines.append(line)
        return lines

    @classmethod
    def splits(cls, path, train='train', test='test', **kwargs):
        # Create temporary files to split train into train and dev
        uid = uuid.uuid4()
        train_tmp, dev_tmp = f'{train}-tmp-{uid}', f'dev-tmp-{uid}'
        pathlib.Path(os.path.join(path, train_tmp)).mkdir(parents=True, exist_ok=True)
        pathlib.Path(os.path.join(path, dev_tmp)).mkdir(parents=True, exist_ok=True)

        train_id = cls._read_file(os.path.join(path, train, 'id.txt'))
        train_a_toks = cls._read_file(os.path.join(path, train, 'a.toks'))
        train_b_toks = cls._read_file(os.path.join(path, train, 'b.toks'))
        train_sim = cls._read_file(os.path.join(path, train, 'sim.txt'))

        dev_lines = np.random.choice(np.arange(len(train_id)), size=400, replace=False)

        train_tmp_id_path = os.path.join(path, train_tmp, 'id.txt')
        train_tmp_sim_path = os.path.join(path, train_tmp, 'sim.txt')
        train_tmp_a_toks = os.path.join(path, train_tmp, 'a.toks')
        train_tmp_b_toks = os.path.join(path, train_tmp, 'b.toks')
        dev_tmp_id_path = os.path.join(path, dev_tmp, 'id.txt')
        dev_tmp_sim_path = os.path.join(path, dev_tmp, 'sim.txt')
        dev_tmp_a_toks = os.path.join(path, dev_tmp, 'a.toks')
        dev_tmp_b_toks = os.path.join(path, dev_tmp, 'b.toks')

        counter = 0

        with open(train_tmp_id_path, 'w') as tid, open(train_tmp_sim_path, 'w') as tsim, open(train_tmp_a_toks, 'w') as ta, open(train_tmp_b_toks, 'w') as tb,\
                open(dev_tmp_id_path, 'w') as did, open(dev_tmp_sim_path, 'w') as dsim, open(dev_tmp_a_toks, 'w') as da, open(dev_tmp_b_toks, 'w') as db:
            for i, (pid, sa, sb, sim) in enumerate(zip(train_id, train_a_toks, train_b_toks, train_sim)):
                counter += 1
                if i in dev_lines:
                    did.write(pid)
                    dsim.write(sim)
                    da.write(sa)
                    db.write(sb)
                else:
                    tid.write(pid)
                    tsim.write(sim)
                    ta.write(sa)
                    tb.write(sb)

        split_results = super(MSRP, cls).splits(path, train=train_tmp, validation=dev_tmp, test=test, **kwargs)

        shutil.rmtree(os.path.join(path, train_tmp))
        shutil.rmtree(os.path.join(path, dev_tmp))

        return split_results

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, device, batch_size=64, shuffle=True, vectors=None, unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to word vectors file
        :param device: PyTorch device
        :param batch_size: batch size
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return: train, validation, and test BucketIterators
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)

        train, validation, test = cls.splits(path)

        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)

        return BucketIterator.splits((train, validation, test), batch_size=batch_size, repeat=False, shuffle=shuffle, device=device)
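
The unk_init hook controls how embeddings for out-of-vocabulary words are generated; the default torch.Tensor.zero_ leaves them at zero. A hedged alternative that mirrors the uniform(-0.05, 0.05) initialization used in TRECQA.set_vectors above; paths are placeholders:

# Hypothetical: initialize OOV embedding rows uniformly instead of with zeros.
import torch

def uniform_unk_init(tensor):
    # called by torchtext's Vectors for each unknown token's row
    return torch.nn.init.uniform_(tensor, -0.05, 0.05)

train_iter, dev_iter, test_iter = MSRP.iters(
    'data/msrp',               # placeholder data directory
    'glove.840B.300d.txt',     # placeholder word-vectors file
    'embeddings',              # placeholder cache directory
    device=0,
    unk_init=uniform_unk_init)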