Example #1
    def preprocess(self, x):
        """Load a single example using this field, tokenizing if necessary.

        If the input is a Python 2 `str`, it will be converted to Unicode
        first. If `sequential=True`, it will be tokenized. Then the input
        will be optionally lowercased and passed to the user-provided
        `preprocessing` Pipeline."""
        if (six.PY2 and isinstance(x, six.string_types)
                and not isinstance(x, six.text_type)):
            x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
        if isinstance(x, list):     # cue knowledge is a list of sentences
            x = [self.tokenize(t.rstrip('\n')) for t in x]
        elif self.sequential and isinstance(x, six.text_type):
            x = self.tokenize(x.rstrip('\n'))

        if self.lower:
            x = Pipeline(six.text_type.lower)(x)
        if self.sequential and self.use_vocab and self.stop_words is not None:
            x = [w for w in x if w not in self.stop_words]
        if self.preprocessing is not None:
            return self.preprocessing(x)
        else:
            return x
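A quick usage sketch (not part of the example above; it assumes the stock legacy torchtext.data.Field API, which this preprocess override extends): with sequential=True and lower=True, preprocess tokenizes the raw string and then lowercases each token.

from torchtext.data import Field  # torchtext.legacy.data in torchtext >= 0.9

# Hypothetical illustration: whitespace tokenizer, lowercasing enabled.
field = Field(sequential=True, lower=True, tokenize=str.split)
print(field.preprocess("The Quick Brown Fox\n"))
# ['the', 'quick', 'brown', 'fox']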
Example #2
    @classmethod
    def iters(cls,
              batch_size=64,
              device=-1,
              shuffle=True,
              vectors='glove.840B.300d'):
        cls.TEXT = Field(sequential=True,
                         tokenize='spacy',
                         lower=True,
                         batch_first=True)
        cls.LABEL = Field(sequential=False,
                          use_vocab=False,
                          batch_first=True,
                          tensor_type=torch.FloatTensor,
                          postprocessing=Pipeline(get_class_probs))
        cls.ID = RawField()

        train, val, test = cls.splits(cls.TEXT, cls.LABEL, cls.ID)

        cls.TEXT.build_vocab(train, vectors=vectors)

        return BucketIterator.splits((train, val, test),
                                     batch_size=batch_size,
                                     shuffle=shuffle,
                                     repeat=False,
                                     device=device)
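A hedged usage sketch for the classmethod above (MyDataset is a placeholder for whatever class defines iters, and the batch attribute names assume splits() binds the fields as 'text' and 'label'):

# Placeholder class name; attribute names depend on how splits() binds fields.
train_iter, val_iter, test_iter = MyDataset.iters(batch_size=32, device=-1)
for batch in train_iter:
    text = batch.text    # batch-first LongTensor of token ids
    label = batch.label  # FloatTensor of class probabilities (get_class_probs)
    break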
Example #3
    def preprocess(self, x):
        if (six.PY2 and isinstance(x, six.string_types)
                and not isinstance(x, six.text_type)):
            x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
        # Designed for numeric tensors of varying length,
        # so the input string is always tokenized.
        x = self.tokenize(x)
        return x
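A minimal sketch of the idea behind this override (NumericField is a stand-in name for the subclass this method belongs to, and the tokenizer is hypothetical): every input string is tokenized, so variable-length numeric sequences can be padded into a tensor later.

# Hypothetical subclass and tokenizer, for illustration only.
field = NumericField(tokenize=lambda s: [float(t) for t in s.split()],
                     use_vocab=False, batch_first=True)
print(field.preprocess("0.5 1.0 2.5"))
# [0.5, 1.0, 2.5]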
Example #4
    def preprocess(self, x):
        """Load a single example using this field, tokenizing if necessary.

        If the input is a Python 2 `str`, it will be converted to Unicode
        first. If `sequential=True`, it will be tokenized. Then the input
        will be optionally lowercased and passed to the user-provided
        `preprocessing` Pipeline."""
        if (six.PY2 and isinstance(x, six.string_types)
                and not isinstance(x, six.text_type)):
            x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
        # strip the trailing newline, then tokenize
        if self.sequential and isinstance(x, six.text_type):
            x = self.tokenize(x.rstrip('\n'))
        if self.lower:
            x = Pipeline(six.text_type.lower)(x)
        if self.preprocessing is not None:
            return self.preprocessing(x)
        else:
            return x
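The docstring above mentions a user-provided preprocessing Pipeline; here is a small sketch of that hook (assuming the stock legacy torchtext.data API): the Pipeline is applied token by token after tokenization and lowercasing.

from torchtext.data import Field, Pipeline  # torchtext.legacy.data in >= 0.9

# Hypothetical post-tokenization hook that strips punctuation from tokens.
strip_punct = Pipeline(lambda tok: tok.strip('.,!?'))
field = Field(sequential=True, lower=True, preprocessing=strip_punct)
print(field.preprocess("Hello, world!"))
# ['hello', 'world']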
Example #5
    def preprocess(self, x):
        if (six.PY2 and isinstance(x, six.string_types)
                and not isinstance(x, six.text_type)):
            x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
        # Designed for numeric tensors of varying length,
        # so the input string is always tokenized.
        array = self.tokenize1d(x)
        matrix = [self.tokenize2d(a) for a in array]
        tensor = [[self.tokenize3d(t) for t in m] for m in matrix]
        return tensor
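A sketch of how the three tokenizers used above could compose (their names come from the snippet, but their behavior here is assumed): tokenize1d splits a document into sentences, tokenize2d splits a sentence into tokens, and tokenize3d splits a token into characters, yielding a nested 3D structure.

# Assumed tokenizer behavior, for illustration only.
tokenize1d = lambda doc: doc.split('\n')  # document -> sentences
tokenize2d = lambda sent: sent.split()    # sentence -> tokens
tokenize3d = lambda tok: list(tok)        # token -> characters

doc = "ab cd\nef"
array = tokenize1d(doc)
matrix = [tokenize2d(a) for a in array]
tensor = [[tokenize3d(t) for t in m] for m in matrix]
print(tensor)
# [[['a', 'b'], ['c', 'd']], [['e', 'f']]]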
Example #6
class SICK(CastorPairDataset):
    NAME = 'sick'
    NUM_CLASSES = 5
    ID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(
        batch_first=True, tokenize=lambda x: x
    )  # tokenizer is identity since we already tokenized it to compute external features
    EXT_FEATS_FIELD = Field(tensor_type=torch.FloatTensor,
                            use_vocab=False,
                            batch_first=True,
                            tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False,
                        tensor_type=torch.FloatTensor,
                        use_vocab=False,
                        batch_first=True,
                        postprocessing=Pipeline(get_class_probs))
    RAW_TEXT_FIELD = RawField()

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a SICK dataset instance
        """
        super(SICK, self).__init__(path)

    @classmethod
    def splits(cls,
               path,
               train='train',
               validation='dev',
               test='test',
               **kwargs):
        return super(SICK, cls).splits(path,
                                       train=train,
                                       validation=validation,
                                       test=test,
                                       **kwargs)

    @classmethod
    def iters(cls,
              path,
              vectors_name,
              vectors_cache,
              batch_size=64,
              shuffle=True,
              device=0,
              vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name,
                              cache=vectors_cache,
                              unk_init=unk_init)

        train, val, test = cls.splits(path)

        cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)

        return BucketIterator.splits((train, val, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     device=device)
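A hedged usage sketch for SICK.iters (all paths and file names below are placeholders): the word vectors are loaded through torchtext's Vectors class unless custom vectors are passed in.

# Placeholder paths; assumes the directory holds the train/dev/test files
# in the format expected by CastorPairDataset.
train_iter, val_iter, test_iter = SICK.iters(
    path='data/sick',                    # placeholder
    vectors_name='glove.840B.300d.txt',  # placeholder
    vectors_cache='.vector_cache',       # placeholder
    batch_size=64,
    device=0)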
Example #7
class Semeval(Dataset):
    NAME = 'Semeval'
    NUM_CLASSES = 2
    QID_FIELD = Field(sequential=False,
                      tensor_type=torch.FloatTensor,
                      use_vocab=False,
                      batch_first=True)
    QAID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    TEXT_FIELD = Field(
        batch_first=True, tokenize=lambda x: x
    )  # tokenizer is identity since we already tokenized it to compute external features
    EXT_FEATS_FIELD = Field(sequential=False,
                            tensor_type=torch.FloatTensor,
                            use_vocab=False,
                            batch_first=True,
                            tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False,
                        tensor_type=torch.FloatTensor,
                        use_vocab=False,
                        batch_first=True,
                        postprocessing=Pipeline(get_class_probs))
    RAW_TEXT_FIELD = RawField()

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path, **kwargs):
        """
        Create a Semeval dataset instance
        """

        fields = [('qid', self.QID_FIELD), ('qaid', self.QAID_FIELD),
                  ('label', self.LABEL_FIELD), ('sentence_1', self.TEXT_FIELD),
                  ('sentence_2', self.TEXT_FIELD),
                  ('sentence_1_raw', self.RAW_TEXT_FIELD),
                  ('sentence_2_raw', self.RAW_TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD)]

        examples = []

        with open(path) as infile:
            for line in infile:
                content = json.loads(line)

                sent_list_1 = content['question']
                sent_list_2 = content['qaquestion']

                word_to_doc_cnt = get_pairwise_word_to_doc_freq(
                    sent_list_1, sent_list_2)
                overlap_feats = get_pairwise_overlap_features(
                    sent_list_1, sent_list_2, word_to_doc_cnt)
                overlap_feats = []  # NOTE: the computed overlap features are discarded here
                values = [
                    content['qid'], content['qaid'], content['qarel'],
                    content['question'], content['qaquestion'],
                    ' '.join(content['question']),
                    ' '.join(content['qaquestion']), overlap_feats
                ]

                examples.append(Example.fromlist(values, fields))

        super(Semeval, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls,
               path,
               train='train_2016.json',
               validation='dev_2016.json',
               test='test_2017.json',
               **kwargs):
        return super(Semeval, cls).splits(path,
                                          train=train,
                                          validation=validation,
                                          test=test,
                                          **kwargs)

    @classmethod
    def iters(cls,
              path,
              vectors_name,
              vectors_cache,
              batch_size=64,
              shuffle=True,
              device=0,
              vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_dir: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param pt_file: load cached embedding file from disk if it is true
        :param unk_init: function used to generate vector for OOV words
        :return:
        """

        if vectors is None:
            vectors = Vectors(name=vectors_name,
                              cache=vectors_cache,
                              unk_init=unk_init)

        train, validation, test = cls.splits(path)

        cls.LABEL_FIELD.build_vocab(train, validation, test)
        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
        return BucketIterator.splits((train, validation, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     sort_within_batch=True,
                                     device=device)
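A usage sketch for Semeval.iters (placeholder paths; the attribute names follow the field list in __init__): because sort_within_batch=True and sort_key is the length of sentence_1, each batch arrives sorted by first-sentence length, which is convenient for packing padded sequences.

# Placeholder paths, for illustration only.
train_iter, dev_iter, test_iter = Semeval.iters(
    path='data/semeval',                 # placeholder
    vectors_name='glove.840B.300d.txt',  # placeholder
    vectors_cache='.vector_cache')       # placeholder
for batch in train_iter:
    s1, s2 = batch.sentence_1, batch.sentence_2
    probs = batch.label  # class probabilities via get_class_probs
    break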