return list(self._read(texts)) def _read(self, texts): self._nb_examples = 0 if not isinstance(texts, (list, tuple)): texts = [texts] for text in texts: self._nb_examples += 1 yield self.make_torchtext_example(text) def make_torchtext_example(self, text): ex = {'words': text} return torchtext.data.Example.fromdict(ex, self.fields_dict) if __name__ == '__main__': from spec.dataset.corpora.test_corpus import quick_test raw_texts = [ 'Lorem ipsum dolor sit amet, consectetur adipisicing elit', 'tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim', 'quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea', 'consequat. Duis aute irure dolor in reprehenderit in voluptate velit', 'cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat', 'proident, sunt in culpa qui officia deserunt mollit anim id est.' ] quick_test( TextCorpus, raw_texts, lazy=True, )
'words': ' '.join(subtree.leaves()), 'target': self._get_label(subtree.label()) } if 'target' not in self.fields_dict.keys(): del ex['target'] assert ex.keys() == self.fields_dict.keys() examples.append( torchtext.data.Example.fromdict(ex, self.fields_dict)) return examples else: ex = { 'words': ' '.join(tree.leaves()), 'target': self._get_label(tree.label()) } if 'target' not in self.fields_dict.keys(): del ex['target'] assert ex.keys() == self.fields_dict.keys() return [torchtext.data.Example.fromdict(ex, self.fields_dict)] def _get_label(self, label): return self.granularity_map[label] if __name__ == '__main__': from spec.dataset.corpora.test_corpus import quick_test quick_test(SSTCorpus, '../../../data/corpus/sst/train.txt', lazy=True, subtrees=False, granularity='2')
self.corpus_path = str(new_file_path) self.open(self.corpus_path) if self.lazy is True: return self else: return list(self) def _read(self, file): for line in file: line = line.strip().split() if line: label = line[0] text = ' '.join(line[1:]) yield self.make_torchtext_example(text, label) def make_torchtext_example(self, text, label=None): ex = {'words': text, 'target': label} if 'target' not in self.fields_dict.keys(): del ex['target'] assert ex.keys() == self.fields_dict.keys() return torchtext.data.Example.fromdict(ex, self.fields_dict) if __name__ == '__main__': from spec.dataset.corpora.test_corpus import quick_test quick_test( IMDBCorpus, '../../../data/corpus/imdb/test/', lazy=True, )
def create_fields_tuples(): tokenizer = nltk.WordPunctTokenizer() fields_tuples = [('words', fields.WordsField(tokenize=tokenizer.tokenize)), ('target', fields.TagsField())] return fields_tuples def _read(self, file): for line in file: data = json.loads(line.strip()) text = data['text'] label = str(int(data['stars'])) example = self.make_torchtext_example(text, label) yield example def make_torchtext_example(self, text, label=None): ex = {'words': text, 'target': label} if 'target' not in self.fields_dict.keys(): del ex['target'] assert ex.keys() == self.fields_dict.keys() return torchtext.data.Example.fromdict(ex, self.fields_dict) if __name__ == '__main__': from spec.dataset.corpora.test_corpus import quick_test quick_test( YelpCorpus, '../../../data/corpus/yelp/review_train.json', lazy=True, )
class TTSBRCorpus(Corpus):
    """Corpus reader for the TTsBR dataset.

    Each data line looks like ``<label> <col1> <token> <token> ...``;
    the second column is skipped (presumably an id column — verify) and
    the remaining tokens form the text.
    """

    @staticmethod
    def create_fields_tuples():
        """Field layout: plain words plus a label tag."""
        fields_tuples = [('words', fields.WordsField()),
                         ('target', fields.TagsField())]
        return fields_tuples

    def _read(self, file):
        """Yield one torchtext Example per non-empty line.

        Robustness fix: blank lines previously raised IndexError on
        ``line[0]``; they are now skipped, matching the other corpus
        readers (e.g. the IMDB reader).
        """
        for line in file:
            line = line.strip().split()
            if not line:
                continue
            label = line[0]
            # line[1] is intentionally skipped.
            text = ' '.join(line[2:])
            yield self.make_torchtext_example(text, label)

    def make_torchtext_example(self, text, label=None):
        """Build a torchtext Example, keeping 'target' only when labeled."""
        ex = {'words': text, 'target': label}
        if 'target' not in self.fields_dict.keys():
            del ex['target']
        assert ex.keys() == self.fields_dict.keys()
        return torchtext.data.Example.fromdict(ex, self.fields_dict)


if __name__ == '__main__':
    from spec.dataset.corpora.test_corpus import quick_test

    quick_test(
        TTSBRCorpus,
        '../../../data/corpus/ttsbr/trainTT.txt',
        lazy=True,
    )
return fields_tuples def _read(self, file): root = ElementTree.parse(file).getroot() categories = [x.text for x in root.iter('category')] descriptions = [x.text for x in root.iter('description')] for text, label in zip(descriptions, categories): if text is None or label is None: continue # business vs world (binary classification) if label not in ['Business', 'World']: continue text = re.sub("\\\\", "", text) # fix escape yield self.make_torchtext_example(text, label) def make_torchtext_example(self, text, label=None): ex = {'words': text, 'target': label} if 'target' not in self.fields_dict.keys(): del ex['target'] assert ex.keys() == self.fields_dict.keys() return torchtext.data.Example.fromdict(ex, self.fields_dict) if __name__ == '__main__': from spec.dataset.corpora.test_corpus import quick_test quick_test( AGNewsCorpus, '../../../data/corpus/agnews/test.xml', lazy=False, )
from spec.dataset.corpora.snli import SNLICorpus


class MNLICorpus(SNLICorpus):
    """MultiNLI corpus: the file format matches SNLI, so the SNLI reader
    is reused unchanged."""


if __name__ == '__main__':
    from spec.dataset.corpora.test_corpus import quick_test

    quick_test(
        MNLICorpus,
        '../../../data/corpus/mnli/multinli_1.0_dev_matched.jsonl',
        lazy=True,
    )
('target', fields.TagsField())] return fields_tuples def _read(self, file): for line in file: data = json.loads(line) label = data['gold_label'] premise = data['sentence1'] hypothesis = data['sentence2'] if label == '-': # These were cases where the annotators disagreed; we'll just # skip them. It's like 800 / 500k examples in the train data continue yield self.make_torchtext_example(premise, hypothesis, label) def make_torchtext_example(self, prem, hyp, label): ex = {'words': prem, 'words_hyp': hyp, 'target': label} if 'target' not in self.fields_dict.keys(): del ex['target'] assert ex.keys() == self.fields_dict.keys() return torchtext.data.Example.fromdict(ex, self.fields_dict) if __name__ == '__main__': from spec.dataset.corpora.test_corpus import quick_test quick_test( SNLICorpus, '../../../data/corpus/snli/snli_1.0_test.jsonl', lazy=True, )