def preprocess(self, x): """Load a single example using this field, tokenizing if necessary. If the input is a Python 2 `str`, it will be converted to Unicode first. If `sequential=True`, it will be tokenized. Then the input will be optionally lowercased and passed to the user-provided `preprocessing` Pipeline.""" if (six.PY2 and isinstance(x, six.string_types) and not isinstance(x, six.text_type)): x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x) if isinstance(x, list): # cue knowledge is list of sentences x = [self.tokenize(t.rstrip('\n')) for t in x] elif self.sequential and isinstance(x, six.text_type): x = self.tokenize(x.rstrip('\n')) # if self.sequential and isinstance(x, six.text_type): # x = self.tokenize(x.rstrip('\n')) if self.lower: x = Pipeline(six.text_type.lower)(x) if self.sequential and self.use_vocab and self.stop_words is not None: x = [w for w in x if w not in self.stop_words] if self.preprocessing is not None: return self.preprocessing(x) else: return x
@classmethod
def iters(cls, batch_size=64, device=-1, shuffle=True,
          vectors='glove.840B.300d'):
    cls.TEXT = Field(sequential=True, tokenize='spacy', lower=True,
                     batch_first=True)
    cls.LABEL = Field(sequential=False, use_vocab=False, batch_first=True,
                      tensor_type=torch.FloatTensor,
                      postprocessing=Pipeline(get_class_probs))
    cls.ID = RawField()

    train, val, test = cls.splits(cls.TEXT, cls.LABEL, cls.ID)
    cls.TEXT.build_vocab(train, vectors=vectors)

    return BucketIterator.splits((train, val, test), batch_size=batch_size,
                                 shuffle=shuffle, repeat=False, device=device)
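# Hypothetical usage of the iters classmethod above; `MyDataset` stands in
# for the Dataset subclass that defines it, and the batch attribute names
# assume fields registered as 'text', 'label', and 'id' in cls.splits.
# (device=-1 selects the CPU in legacy torchtext.)
train_iter, val_iter, test_iter = MyDataset.iters(batch_size=32, device=-1)

for batch in train_iter:
    text = batch.text      # LongTensor of token ids, shape [batch, seq_len]
    label = batch.label    # FloatTensor of class probabilities
    ids = batch.id         # RawField values are passed through untouched
    break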
def preprocess(self, x):
    if (six.PY2 and isinstance(x, six.string_types)
            and not isinstance(x, six.text_type)):
        x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
    # This field is designed for numerical tensors of varying length,
    # so the input string is always tokenized.
    x = self.tokenize(x)
    return x
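# One plausible pairing for the variant above (an assumption, not from the
# source): a numeric-sequence field that skips the vocabulary, so the
# "tokens" are parsed straight into floats of varying length per example.
import torch
from torchtext.data import Field

field = Field(sequential=True, use_vocab=False, batch_first=True,
              tokenize=lambda s: [float(t) for t in s.split()],
              tensor_type=torch.FloatTensor, pad_token=0.0)
field.preprocess('0.5 1.0 2.5')              # [0.5, 1.0, 2.5]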
def preprocess(self, x): """Load a single example using this field, tokenizing if necessary. If the input is a Python 2 `str`, it will be converted to Unicode first. If `sequential=True`, it will be tokenized. Then the input will be optionally lowercased and passed to the user-provided `preprocessing` Pipeline.""" if (six.PY2 and isinstance(x, six.string_types) and not isinstance(x, six.text_type)): x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x) # will strip and then split here! if self.sequential and isinstance(x, six.text_type): x = self.tokenize(x.rstrip('\n')) if self.lower: x = Pipeline(six.text_type.lower)(x) if self.preprocessing is not None: return self.preprocessing(x) else: return x
def preprocess(self, x):
    if (six.PY2 and isinstance(x, six.string_types)
            and not isinstance(x, six.text_type)):
        x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
    # This field is designed for numerical tensors of varying length, so the
    # string is tokenized level by level into a nested 3-D structure.
    array = self.tokenize1d(x)
    matrix = [self.tokenize2d(a) for a in array]
    tensor = [[self.tokenize3d(t) for t in m] for m in matrix]
    return tensor
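# tokenize1d/2d/3d are not shown; one plausible reading (an assumption, not
# from the source) splits a document into sentences, then words, then
# characters, yielding a ragged 3-D nested list:
tokenize1d = lambda doc: doc.split('\n')     # axis 1: sentences
tokenize2d = lambda sent: sent.split()       # axis 2: words
tokenize3d = lambda word: list(word)         # axis 3: characters

array = tokenize1d('ab cd\nef')              # ['ab cd', 'ef']
matrix = [tokenize2d(a) for a in array]      # [['ab', 'cd'], ['ef']]
tensor = [[tokenize3d(w) for w in m] for m in matrix]
# [[['a', 'b'], ['c', 'd']], [['e', 'f']]]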
class SICK(CastorPairDataset):
    NAME = 'sick'
    NUM_CLASSES = 5
    ID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    # The tokenizer is the identity function since the input is already
    # tokenized to compute the external features.
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)
    EXT_FEATS_FIELD = Field(tensor_type=torch.FloatTensor, use_vocab=False,
                            batch_first=True, tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor,
                        use_vocab=False, batch_first=True,
                        postprocessing=Pipeline(get_class_probs))
    RAW_TEXT_FIELD = RawField()

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """Create a SICK dataset instance."""
        super(SICK, self).__init__(path)

    @classmethod
    def splits(cls, path, train='train', validation='dev', test='test',
               **kwargs):
        return super(SICK, cls).splits(path, train=train,
                                       validation=validation, test=test,
                                       **kwargs)

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64,
              shuffle=True, device=0, vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: directory containing the cached word vectors
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors
            or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return: train, validation, and test BucketIterators
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache,
                              unk_init=unk_init)

        train, val, test = cls.splits(path)
        cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
        return BucketIterator.splits((train, val, test),
                                     batch_size=batch_size, repeat=False,
                                     shuffle=shuffle, device=device)
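# Illustrative usage of SICK.iters; the data directory and vector file names
# below are assumptions. The field names follow sort_key and the
# CastorPairDataset convention ('sentence_1', 'sentence_2', 'label').
train_iter, val_iter, test_iter = SICK.iters(
    'data/sick', 'glove.840B.300d.txt', 'data/embeddings', batch_size=64)

for batch in train_iter:
    s1, s2 = batch.sentence_1, batch.sentence_2   # padded token-id tensors
    labels = batch.label                          # class-probability targets
    break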
class Semeval(Dataset):
    NAME = 'Semeval'
    NUM_CLASSES = 2
    QID_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor,
                      use_vocab=False, batch_first=True)
    QAID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    # The tokenizer is the identity function since the input is already
    # tokenized to compute the external features.
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)
    EXT_FEATS_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor,
                            use_vocab=False, batch_first=True,
                            tokenize=lambda x: x)
    LABEL_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor,
                        use_vocab=False, batch_first=True,
                        postprocessing=Pipeline(get_class_probs))
    RAW_TEXT_FIELD = RawField()

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path, **kwargs):
        """Create a Semeval dataset instance."""
        fields = [('qid', self.QID_FIELD), ('qaid', self.QAID_FIELD),
                  ('label', self.LABEL_FIELD),
                  ('sentence_1', self.TEXT_FIELD),
                  ('sentence_2', self.TEXT_FIELD),
                  ('sentence_1_raw', self.RAW_TEXT_FIELD),
                  ('sentence_2_raw', self.RAW_TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD)]

        examples = []
        with open(path) as infile:
            for line in infile:
                content = json.loads(line)
                sent_list_1 = content['question']
                sent_list_2 = content['qaquestion']
                word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1,
                                                                sent_list_2)
                overlap_feats = get_pairwise_overlap_features(
                    sent_list_1, sent_list_2, word_to_doc_cnt)

                values = [content['qid'], content['qaid'], content['qarel'],
                          content['question'], content['qaquestion'],
                          ' '.join(content['question']),
                          ' '.join(content['qaquestion']), overlap_feats]
                examples.append(Example.fromlist(values, fields))

        super(Semeval, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, path, train='train_2016.json', validation='dev_2016.json',
               test='test_2017.json', **kwargs):
        return super(Semeval, cls).splits(path, train=train,
                                          validation=validation, test=test,
                                          **kwargs)

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64,
              shuffle=True, device=0, vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: directory containing the cached word vectors
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors
            or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return: train, validation, and test BucketIterators
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache,
                              unk_init=unk_init)

        train, validation, test = cls.splits(path)
        cls.LABEL_FIELD.build_vocab(train, validation, test)
        cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
        return BucketIterator.splits((train, validation, test),
                                     batch_size=batch_size, repeat=False,
                                     shuffle=shuffle, sort_within_batch=True,
                                     device=device)
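# The keys read in __init__ imply one JSON object per input line, shaped
# roughly as below (the values are invented for illustration):
#
#   {"qid": 1, "qaid": 101, "qarel": 1,
#    "question": ["how", "do", "i", "boil", "eggs"],
#    "qaquestion": ["best", "way", "to", "boil", "an", "egg"]}
#
# Illustrative usage; the paths and vector file names are assumptions.
train_iter, val_iter, test_iter = Semeval.iters(
    'data/semeval', 'glove.840B.300d.txt', 'data/embeddings', batch_size=64)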