def main():
    parser = argparse.ArgumentParser(description='translate.py')
    parser.add_argument('-data_pkl', required=True,
                        help='Pickle file with vocabulary.')
    parser.add_argument('-trg_data', default='PSLG-PC12/ENG-ASL_Test.en')
    parser.add_argument('-pred_data', default='predictions.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence).""")
    opt = parser.parse_args()

    data = pickle.load(open(opt.data_pkl, 'rb'))
    SRC, TRG = data['vocab']['src'], data['vocab']['trg']
    fields = [('src', SRC)]

    with open(opt.trg_data, 'r') as f:
        trg_loader = Dataset(
            examples=[Example.fromlist([x], fields) for x in f],
            fields={'src': SRC})
    trg_txt = [x.src for x in trg_loader]

    with open(opt.pred_data, 'r') as f:
        pred_loader = Dataset(
            examples=[Example.fromlist([x], fields) for x in f],
            fields={'src': SRC})
    pred_txt = [[x.src] for x in pred_loader]

    score = bleu_score(trg_txt, pred_txt)
    print('Bleu 4 score is {}'.format(score))
    with open('bleu_score.txt', 'w') as f:
        f.write('Bleu 4 score is {}'.format(score))
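If the `bleu_score` here is torchtext's `torchtext.data.metrics.bleu_score`, its first argument is a corpus of tokenized candidates and its second a corpus of reference lists, which matches the shapes of `trg_txt` and `pred_txt` above. A minimal self-contained check of that convention (toy tokens, not the script's data):

from torchtext.data.metrics import bleu_score

# One candidate sentence, one reference list holding one reference.
candidate = [['my', 'full', 'pytorch', 'test']]
references = [[['my', 'full', 'pytorch', 'test']]]
assert bleu_score(candidate, references) == 1.0  # perfect 4-gram match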
class TestSimpleIterator(object):
    TEXT = Field()
    examples = [
        Example.fromlist(['John loves Mary'], [('text', TEXT)]),
        Example.fromlist(['Mary cries'], [('text', TEXT)]),
    ]
    dataset = Dataset(examples, [('text', TEXT)])
    TEXT.build_vocab(dataset)

    def make_iterator(self):
        return SimpleIterator(self.dataset, device=-1)

    def test_init_minimal(self):
        iterator = SimpleIterator(self.dataset)
        assert iterator.dataset is self.dataset
        assert iterator.batch_size == 1
        assert iterator.train
        assert iterator.device is None
        assert iterator.sort_key is None
        assert not iterator.sort
        assert not iterator.repeat
        assert iterator.shuffle == iterator.train
        assert not iterator.sort_within_batch

    def test_init_full(self):
        iterator = SimpleIterator(self.dataset, train=False, device=-1)
        assert not iterator.train
        assert iterator.device == -1

    def test_next(self):
        iterator = self.make_iterator()
        sample = next(iter(iterator))
        assert isinstance(sample.text, Variable)
        assert sample.text.size(1) == 1
def stratified_sampler(train, test, target, text_field, label_field):
    shuffler = StratifiedShuffleSplit(n_splits=1, train_size=0.7,
                                      test_size=0.30)
    X = []
    y = []
    fields = [('text', text_field), (target[0], label_field)]
    for example in train:
        X.append(getattr(example, "text"))
        y.append(getattr(example, target[0]))
    for example in test:
        X.append(getattr(example, "text"))
        y.append(getattr(example, target[0]))
    train_idx, test_idx = list(shuffler.split(X, y))[0]
    trn = Dataset(
        examples=[Example.fromlist([X[i], y[i]], fields) for i in train_idx],
        fields=fields)
    tst = Dataset(
        examples=[Example.fromlist([X[i], y[i]], fields) for i in test_idx],
        fields=fields)
    return trn, tst
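A quick way to sanity-check `stratified_sampler` is to call it on two tiny datasets. This sketch assumes the legacy `torchtext.data` API and scikit-learn; the toy sentences and the 'text'/'label' field names are illustrative only:

from sklearn.model_selection import StratifiedShuffleSplit
from torchtext.data import Dataset, Example, Field

text_field = Field()
label_field = Field(sequential=False)
fields = [('text', text_field), ('label', label_field)]

pairs = ([('good movie %d' % i, 'pos') for i in range(5)]
         + [('bad movie %d' % i, 'neg') for i in range(5)])
examples = [Example.fromlist(list(p), fields) for p in pairs]
train = Dataset(examples[:6], fields)
test = Dataset(examples[6:], fields)

# target is a list whose first element names the label attribute
trn, tst = stratified_sampler(train, test, ['label'], text_field, label_field)
print(len(trn.examples), len(tst.examples))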
def build_examples(self):
    examples = []
    if self.test:
        # For the test set, labels are not loaded
        for text in tqdm(self.data[self.text_field]):
            examples.append(Example.fromlist([text, None], self.fields))
    else:
        for text, label in tqdm(
                zip(self.data[self.text_field], self.data[self.label_field])):
            # Example: defines a single training or test example and
            # stores each column of the example as an attribute.
            examples.append(Example.fromlist([text, label], self.fields))
    return examples
def predict(model, texts, vocabulary, device):
    src_field = TranslationField()
    index_field = RawField()
    examples = [
        Example.fromlist([x, i], [('src', src_field), ('index', index_field)])
        for i, x in enumerate(texts)
    ]
    dataset = Dataset(examples=examples,
                      fields=[('src', src_field), ('index', index_field)])
    src_field.vocab = vocabulary
    iterator = Iterator(dataset=dataset,
                        batch_size=2048,
                        sort=False,
                        sort_within_batch=True,
                        sort_key=lambda x: len(x.src),
                        device=device,
                        repeat=False,
                        shuffle=False)

    texts = []
    indices = []
    for data in tqdm(iterator):
        texts.extend(
            translate(model=model,
                      vocabulary=vocabulary,
                      data=data,
                      max_seq_len=100,
                      device=device))
        indices.extend(data.index)

    prediction = pd.DataFrame([texts, indices]).T.rename(columns={
        0: 'fullname_prediction',
        1: 'index'
    })
    prediction = prediction.sort_values('index')
    return prediction
def __init__(self, sentences: list, s_postags: list, s_lemmas: list,
             labels: list, fields: list):
    super(FrameTargetDataset, self).__init__([
        Example.fromlist([tokens, postags, lemmas, label], fields)
        for tokens, postags, lemmas, label in zip(sentences, s_postags,
                                                  s_lemmas, labels)
    ], fields)
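Since the constructor just zips four parallel lists into `Example`s, instantiating the class is straightforward. A hypothetical call; the Field configuration below is an assumption, not the original project's setup:

from torchtext.data import Field

TOKENS = Field()
POSTAGS = Field()
LEMMAS = Field()
LABELS = Field(sequential=False, unk_token=None)
fields = [('tokens', TOKENS), ('postags', POSTAGS),
          ('lemmas', LEMMAS), ('label', LABELS)]

ds = FrameTargetDataset(sentences=[['He', 'sold', 'the', 'car']],
                        s_postags=[['PRON', 'VERB', 'DET', 'NOUN']],
                        s_lemmas=[['he', 'sell', 'the', 'car']],
                        labels=['Commerce_sell'],
                        fields=fields)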
def prepareTranslationData(src_path, trg_path, proportions, fields):
    if not isinstance(fields[0], (tuple, list)):
        fields = [('src', fields[0]), ('trg', fields[1])]
    src, trg = readFiles(src_path, trg_path)
    examples = [Example.fromlist(data=[src_line, trg_line], fields=fields)
                for src_line, trg_line in zip(src, trg)]
    train, val, test = splits(examples,
                              train=proportions[0],
                              val=proportions[1],
                              test=proportions[2])
    return tuple(MyTranslationDataset(data, fields)
                 for data in (train, val, test) if data is not None)
def lazy_examples(csv_source):
    with open(csv_source) as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for text, title in reader:
            yield Example.fromlist([text, title],
                                   [('text', text_field),
                                    ('title', text_field)])
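The generator is lazy only up to the point where a `Dataset` is built, since torchtext's `Dataset` stores a concrete list of examples. A consumption sketch, assuming a hypothetical two-column CSV with a header row; `text_field` below stands in for the module-level `Field` the snippet refers to:

from torchtext.data import Dataset, Field

text_field = Field()  # stand-in for the module-level field
examples = list(lazy_examples('articles.csv'))
dataset = Dataset(examples, [('text', text_field), ('title', text_field)])
text_field.build_vocab(dataset)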
def __init__(self, path, text_field, visual_field, acoustic_field,
             label_field, **kwargs):
    """Create an MOSI dataset instance given a path and fields.

    Arguments:
        path: Path to the dataset's highest level directory.
        text_field: The field that will be used for text data.
        visual_field: The field that will be used for visual data.
        acoustic_field: The field that will be used for acoustic data.
        label_field: The field that will be used for label data.
        Remaining keyword arguments: Passed to the constructor of
            data.Dataset.
    """
    fields = [('text', text_field), ('visual', visual_field),
              ('acoustic', acoustic_field), ('label', label_field)]

    examples = []
    with open(path, 'rb') as f:
        data = pickle.load(f)
    for ex in data:
        (text, visual, acoustic), label, _ = ex
        examples.append(
            Example.fromlist([text, visual, acoustic, label], fields))

    super(MOSI, self).__init__(examples, fields, **kwargs)
def init_dataloaders(self):
    batch_size = self.config.get('batch_size', 8)
    project_path = self.config['firelab']['project_path']
    data_path = os.path.join(project_path, self.config['data'])

    with open(data_path) as f:
        lines = f.read().splitlines()

    text = Field(init_token='<bos>', eos_token='<eos>', batch_first=True)
    examples = [Example.fromlist([s], [('text', text)]) for s in lines]
    dataset = Dataset(examples, [('text', text)])
    # TODO: torchtext is insane. We pass split ratio for [train, val, test]
    # and it returns splits for [train, test, val]
    splits = dataset.split(split_ratio=[0.999, 0.0009, 0.0001])
    self.train_ds, self.test_ds, self.val_ds = splits
    text.build_vocab(self.train_ds)
    self.vocab = text.vocab

    self.train_dataloader = data.BucketIterator(self.train_ds, batch_size,
                                                repeat=False)
    self.val_dataloader = data.BucketIterator(self.val_ds, batch_size,
                                              train=False, sort=False)
    self.test_dataloader = data.BucketIterator(self.test_ds, batch_size,
                                               train=False, sort=False)
def __init__(self,
             path: str,
             fields: Sequence[Tuple[str, Field]],
             num_samples: Optional[int] = None,
             add_cls: bool = False,
             random_state: int = 162,
             max_len: Optional[int] = None,
             verbose: bool = True,
             **kwargs):
    duplicate_spaces_re = re.compile(r' +')

    with open(path, 'r', encoding='utf-8') as fp:
        all_data = []
        reader = unicode_csv_reader(fp)
        for row in reader:
            cls, text = row[0], row[1]
            if max_len is not None and len(text.split()) > max_len:
                continue
            text = text.replace('\\n\\n', '\\n ')
            text = duplicate_spaces_re.sub(' ', text)
            data = (text, text, cls) if add_cls else (text, text)
            all_data.append(data)

    if num_samples is not None and num_samples < len(all_data):
        random.seed(random_state)
        all_data = random.sample(all_data, num_samples)

    examples = []
    for data in tqdm(all_data, desc='Converting data into examples',
                     disable=not verbose):
        examples.append(Example.fromlist(data=data, fields=fields))

    super().__init__(examples=examples, fields=fields, **kwargs)
def __init__(self, path: str, ext: str, field: Field, **kwargs) -> None:
    """
    Create a monolingual dataset (i.e. sources only) given a path and a field.

    :param path: Prefix of the path to the data file.
    :param ext: Extension appended to the path for this language.
    :param field: Field that will be used for the data.
    :param kwargs: Passed to the constructor of data.Dataset.
    """
    fields = [('src', field)]

    if hasattr(path, "readline"):  # special usage: stdin
        src_file = path
    else:
        src_path = expanduser(path + ext)
        src_file = open(src_path)

    examples = []
    for src_line in src_file:
        src_line = src_line.strip()
        if src_line != '':
            examples.append(Example.fromlist([src_line], fields))

    src_file.close()

    super(MonoDataset, self).__init__(examples, fields, **kwargs)
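The `hasattr(path, "readline")` branch lets the same class read either a file on disk or an already-open stream such as stdin. A hedged usage sketch; the paths and the field configuration are assumptions:

import sys
from torchtext.data import Field

src_field = Field(eos_token='</s>', batch_first=True)
file_ds = MonoDataset('data/test', '.de', src_field)  # reads data/test.de
stdin_ds = MonoDataset(sys.stdin, '', src_field)      # ext is unused for streams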
def build_examples(datas, fields):
    examples = []
    for data in datas:
        example = Example.fromlist(data, fields)
        examples.append(example)
    return examples
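Each element of `datas` must line up positionally with `fields`. A toy call, assuming the legacy torchtext API; the names and sentences below are illustrative:

from torchtext.data import Dataset, Field

TEXT = Field()
LABEL = Field(sequential=False)
fields = [('text', TEXT), ('label', LABEL)]
datas = [('the plot drags', 'neg'), ('a tight thriller', 'pos')]

examples = build_examples(datas, fields)
dataset = Dataset(examples, fields)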
def __init__(self, data_dir):
    self.data_dir = data_dir
    fname = 'corpus.json'

    # fields
    id_field = Field(sequential=False, unk_token=None)
    text_field = Field(include_lengths=True)
    timestep_field = Field(sequential=False, use_vocab=False, unk_token=None)
    fields = [('id', id_field), ('text', text_field),
              ('timestep', timestep_field)]

    # load examples
    fpath = os.path.join(data_dir, fname)
    print('Loading {}...'.format(fpath))
    with open(fpath, 'r') as f:
        corpus = json.load(f)
    examples = [
        Example.fromlist([ex['id'], ex['text'], ex['timestep']], fields)
        for ex in corpus
    ]
    dataset = Dataset(examples, fields)
    id_field.build_vocab(dataset)

    self.examples = examples
    self.fields = OrderedDict(fields)
    self.nts = max([ex.timestep for ex in self.examples]) + 1
def prepare_dataset(dataset):
    context, query, label, start, end = list(zip(*dataset))
    dataset = list(
        zip(context, deepcopy(context), query, deepcopy(query), start, end,
            label))

    TEXT = Field(lower=True, include_lengths=False, batch_first=True)
    CHAR = RawField()
    LABEL = Field(sequential=False, tensor_type=torch.LongTensor)
    fields = [('context', TEXT), ('context_c', CHAR), ('query', TEXT),
              ('query_c', CHAR), ('start', LABEL), ('end', LABEL),
              ('label', TEXT)]

    examples = []
    for i, d in enumerate(dataset):
        if i % 100 == 0:
            print('[%d/%d]' % (i, len(dataset)))
        examples.append(Example.fromlist(d, fields))

    dataset = Dataset(examples, fields)
    TEXT.build_vocab(dataset, min_freq=2)
    # CHAR.build_vocab(dataset)
    return dataset, TEXT, CHAR
def load_examples(cls, src_path, label_path, fields):
    # use context managers so the file handles are closed after reading
    with open(src_path) as src_file, open(label_path) as label_file:
        texts = [line.rstrip('\n') for line in src_file]
        labels = [line.rstrip('\n') for line in label_file]
    examples = []
    for t, l in zip(texts, labels):
        examples.append(Example.fromlist([t, l], fields))
    return examples
def tokenize(self, path):
    assert os.path.exists(path)
    with open(path, "r", encoding="utf-8") as f:
        texts, tags = list(), list()
        examples = list()
        for line in f:
            if len(line.split()) > 0:
                text, tag = line.split()
                texts.append(text)
                tags.append(tag)
            else:
                assert len(texts) == len(tags)
                example = Example.fromlist([texts, tags],
                                           fields=[('texts', self.TEXT),
                                                   ('tags', self.TAG)])
                examples.append(example)
                texts, tags = list(), list()
        dataset = Dataset(examples, fields={
            'texts': self.TEXT,
            'tags': self.TAG
        })
    return dataset
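The parser expects CoNLL-style input: one "token tag" pair per line, with a blank line closing each sentence. Note an `Example` is only emitted when a blank line is seen, so the file should end with one. A sketch of the expected format, with a made-up file name:

# toy_tagged.txt -- one "token tag" pair per line, blank line per sentence
sample = "John B-PER\nlives O\nhere O\n\nMary B-PER\ncries O\n\n"
with open('toy_tagged.txt', 'w', encoding='utf-8') as f:
    f.write(sample)
# dataset = corpus.tokenize('toy_tagged.txt')  # `corpus` provides TEXT/TAG fields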
def __init__(self, path, fields, tokenizer, label2id):
    examples = []
    data = load_data(path)
    for (text, arguments) in data:
        # max_length is expected to be defined at module scope
        input_ids, token_type_ids = tokenizer.encode(text,
                                                     max_length=max_length)
        seq_len = len(input_ids)
        labels = [0] * seq_len
        attention_mask = [1] * seq_len
        for argument in arguments.items():
            # [1:-1] drops the [CLS]/[SEP] ids around the encoded argument
            a_token_ids = tokenizer.encode(argument[0])[0][1:-1]
            start_index = search(a_token_ids, input_ids)
            if start_index != -1:
                labels[start_index] = label2id[argument[1]]
                for i in range(1, len(a_token_ids)):
                    labels[start_index + i] = label2id[argument[1]]
        assert len(input_ids) == len(token_type_ids) == len(labels) == seq_len
        examples.append(
            Example.fromlist([
                input_ids, token_type_ids, attention_mask, labels, seq_len
            ], fields))
    super().__init__(examples, fields)
def transform(self, X, y=None):
    with warnings.catch_warnings(record=True):
        fields = [(name, field) for (name, field) in self.fields
                  if name in X]
        proc = [X[col].apply(f.preprocess) for col, f in fields]
        examples = [Example.fromlist(f, fields) for f in zip(*proc)]
        return Dataset(examples, fields)
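Because `transform` only reads `self.fields`, any object exposing that attribute can drive it. A usage sketch with pandas; `types.SimpleNamespace` stands in for the real transformer class, whose name is not shown in the snippet, and `transform` is called as a plain function for illustration:

import types

import pandas as pd
from torchtext.data import Field

prep = types.SimpleNamespace(fields=[('text', Field()),
                                     ('label', Field(sequential=False))])
df = pd.DataFrame({'text': ['a short sentence', 'another one']})
ds = transform(prep, df)  # only 'text' is kept: df has no 'label' column
print(ds.examples[0].text)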
def read_data(corpus_file, fields, max_len=None):
    train_id_start = 0
    # let the ids for the test examples start after the training example indices
    test_id_start = 76049

    if corpus_file == "wsd_test_blind.txt":
        print("Loading test data...")
        id_start = test_id_start
    else:
        print("Loading train/val data...")
        id_start = train_id_start

    with open(corpus_file, encoding='utf-8') as f:
        examples = []
        for i, line in enumerate(f):
            sense, lemma, word_position, text = line.split('\t')
            # We need to convert from the word position to the token position
            words = text.split()
            pre_word = " ".join(words[:int(word_position)])
            pre_word_tokenized = tokenizer.tokenize(pre_word)
            # + 1 takes into account the later addition of the start token
            token_position = len(pre_word_tokenized) + 1
            example_id = id_start + i
            # ignore examples where the relevant token is cut off due to max_len
            if max_len is None or token_position < max_len - 1:
                if cls_token:
                    token_position = 0
                examples.append(
                    Example.fromlist(
                        [sense, lemma, token_position, text, example_id],
                        fields))
            else:
                print("Example %d is skipped because the relevant token "
                      "was cut off (token pos = %d)"
                      % (example_id, token_position))
                print(text)
    return Dataset(examples, fields)
def __init__(
    self,
    question_path,
    paragraph_path,
    ratio,
    batch_size,
    vocab: Vocab = Ref("model.vocab"),
    batch_first=Ref("model.batch_first", True),
):
    self.vocab = vocab
    question = Field(include_lengths=True, batch_first=batch_first,
                     pad_token=vocab.pad_token)
    question.vocab = vocab
    paragraph = Field(batch_first=batch_first, pad_token=vocab.pad_token)
    paragraph.vocab = vocab
    paragraphs = NestedField(paragraph, include_lengths=True)
    paragraphs.vocab = vocab
    target = Field(sequential=False, use_vocab=False, is_target=True)
    fields = [("question", question), ("paragraphs", paragraphs),
              ("target", target)]

    examples = []
    with open(paragraph_path) as paragraph_file, \
            open(question_path) as question_file:
        for q in question_file:
            q = q.strip()
            ps = [paragraph_file.readline().strip() for _ in range(ratio)]
            examples.append(Example.fromlist([q, ps, 0], fields))

    BaseIRDataset.__init__(self, ratio, batch_size, batch_first)
    TorchTextDataset.__init__(self, examples, fields)
def classify():
    req_body = json.loads(request.data)
    test_1_title = req_body['title']
    test_1_text = req_body['text']
    test_1_titletext = test_1_title + ". " + test_1_text
    example_item = Example.fromlist(
        [test_1_title, test_1_text, test_1_titletext], fields)
    eval_ds = Dataset(examples=[example_item], fields=fields,
                      filter_pred=None)
    eval_iter = Iterator(eval_ds, batch_size=1, device=device, train=False,
                         shuffle=False, sort=False)

    best_model.eval()
    with torch.no_grad():
        for (title, text, titletext), _ in eval_iter:
            titletext = titletext.type(torch.LongTensor)
            titletext = titletext.to(device)
            output = best_model(titletext, None)
            output = output[0]
            prediction = torch.argmax(output, 1).tolist()[0]
    return {"classification_result": bool(prediction)}
def split(self, split_ratio=0.7, stratified=False, strata_field='label',
          random_state=None):
    if stratified or random_state:
        raise NotImplementedError()
    # the corpus is a single long token stream: slice it at the ratio boundary
    text = self.examples[0].text
    train_len = int(len(text) * split_ratio)
    fields = ('text', self.fields['text'])
    train_example = [Example.fromlist([text[0:train_len]], [fields])]
    test_example = [Example.fromlist([text[train_len:]], [fields])]
    return (Dataset(train_example, self.fields),
            Dataset(test_example, self.fields))
def load_source_data(filename, field):
    examples = []
    with open(filename) as src_file:
        for src_line in src_file:
            src_line = src_line.strip()
            examples.append(Example.fromlist([src_line], [field]))
    return examples
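Note that `field` is dropped straight into the fields list of `Example.fromlist`, so the caller must pass a `(name, Field)` pair rather than a bare `Field`. An illustrative, self-contained call (the file name is made up):

from torchtext.data import Field

with open('toy.src', 'w') as f:
    f.write('a first source line\na second one\n')

src = ('src', Field())
examples = load_source_data('toy.src', src)
print(examples[0].src)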
def torch_text_from_memory():
    tokenize = lambda x: x.split()
    SENTENCE_FIELD = Field(sequential=True, tokenize=tokenize,
                           pad_token="<unk>")
    VERB_FORM_FIELD = Field(sequential=False)
    datafields = [("sentence", SENTENCE_FIELD),
                  ("verb_form", VERB_FORM_FIELD)]
    data = [
        ("When the modern Olympics began in 1896, the initiators and "
         "organizers looking for a great popularizing event", "were"),
        ("I king", "am"),
        ("You my friend", "were"),
        ("They my friend", "are"),
        ("We kings", "are"),
        ("I have strong", "been"),
        ("We enemies", "were"),
    ]
    examples = []
    for d in data:
        examples.append(Example.fromlist(d, datafields))
    training_dataset, validation_dataset, test_dataset = Dataset(
        examples, datafields).split([0.33, 0.33, 0.33])
    SENTENCE_FIELD.build_vocab(training_dataset)
    VERB_FORM_FIELD.build_vocab(training_dataset)
    return (SENTENCE_FIELD, VERB_FORM_FIELD, training_dataset,
            validation_dataset, test_dataset)
def __init__(self, dataset_fn, top_k=None, min_len=7, add_init_eos=True,
             **kwargs):
    if add_init_eos:
        fields = [('text', Field(sequential=True, use_vocab=True,
                                 tokenize=tokenize, init_token='<START>',
                                 eos_token='<END>'))]
    else:
        fields = [('text', Field(sequential=True, use_vocab=True,
                                 tokenize=tokenize))]

    examples = []
    counter = 0
    with open(dataset_fn, 'r') as in_file:
        for line in in_file:
            if top_k and counter >= top_k:
                break
            stripped = line.strip()
            if len(stripped) < min_len:
                continue
            examples.append(Example.fromlist([stripped], fields))
            counter += 1

    super(AutoencodingDataset, self).__init__(examples, fields, **kwargs)
    fields[0][1].build_vocab(self)
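A hypothetical construction plus batching; the file name and batch settings are illustrative, and a module-level `tokenize` callable must exist, as the class assumes:

from torchtext.data import BucketIterator

tokenize = str.split  # stand-in for the module-level tokenizer
ds = AutoencodingDataset('sentences.txt', top_k=10000, min_len=7)
it = BucketIterator(ds, batch_size=32, sort_key=lambda ex: len(ex.text))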
def classify_from_strings(self, strings: Union[List[str], str]) -> list:
    """
    Method that can be used for classifying one or multiple examples with a
    trained classifier.

    :param strings: a single string or a list of strings representing the
        pieces of text that should be classified
    :return: list containing the predictions of the model for the input
        pieces of text
    """
    assert self.has_trained
    if isinstance(strings, str):
        strings = [strings]
    if isinstance(strings, list):
        strings = [[string] for string in strings]
    fields = [('text', self._TEXT)]
    list_of_examples = [Example.fromlist(string, fields)
                        for string in strings]
    dataset = torchtext.data.Dataset(list_of_examples, fields)
    data = Iterator(dataset, batch_size=1, device=torch.device("cpu"),
                    sort=False, sort_within_batch=False, repeat=False,
                    shuffle=False)
    predictions = []
    for item in data:
        x = item.text
        self.model.to(self.device)
        self.model = self.model.eval()
        outputs = self.model([x[0].to(self.device), x[1].to(self.device)])
        predictions.extend(outputs.detach().cpu().argmax(1).tolist())
    results = [self._label_names[i] for i in predictions]
    return results
def __init__(self, path, exts, fields, max_len=None, **kwargs):
    assert len(exts) == len(fields), \
        'number of extensions must match number of fields'
    self.N = len(fields)
    if not isinstance(fields[0], (tuple, list)):
        newfields = [('src', fields[0]), ('trg', fields[1])]
        for i in range(len(exts) - 2):
            newfields.append(('extra_{}'.format(i), fields[2 + i]))
        # self.fields = newfields
        fields = newfields

    paths = tuple(os.path.expanduser(path + '.' + x) for x in exts)
    # self.max_len = max_len

    examples = []
    with ExitStack() as stack:
        files = [
            stack.enter_context(open(fname, encoding='utf-8'))
            for fname in paths
        ]
        for i, lines in enumerate(zip(*files)):
            lines = [line.strip() for line in lines]
            if not any(line == '' for line in lines):
                example = Example.fromlist(lines, fields)
                examples.append(example)
                # if max_len is None:
                #     examples.append(example)
                # elif len(example.src) <= max_len and len(example.trg) <= max_len:
                #     examples.append(example)

    super(ParallelDataset, self).__init__(examples, fields, **kwargs)
def make_dataset(path, fields, _log, name='train', use_prefix=False,
                 use_suffix=False):
    assert len(fields) in range(2, 8), \
        'fields should have between 2 and 7 elements'
    _log.info('Creating %s dataset', name)
    reader = read_corpus(path, name=name) if isinstance(path, str) else path
    examples = []
    for id_, tagged_sent in enumerate(reader.tagged_sents()):
        words, tags = zip(*tagged_sent)
        data = [words, tags]
        if use_prefix:
            prefs_2 = [w[:2] for w in words]
            prefs_3 = [w[:3] for w in words]
            data.extend([prefs_2, prefs_3])
        if use_suffix:
            suffs_2 = [w[-2:] for w in words]
            suffs_3 = [w[-3:] for w in words]
            data.extend([suffs_2, suffs_3])
        if len(fields) in (3, 5, 7):
            data.append(id_)
        examples.append(Example.fromlist(data, fields))
    return Dataset(examples, fields)
def convert_to_dataset(data, kor, eng):
    """
    Pre-process the input DataFrame and convert it to a torchtext Dataset.

    Args:
        data: (DataFrame) pandas DataFrame to be converted into a torchtext Dataset
        kor: torchtext Field containing the Korean sentence
        eng: torchtext Field containing the English sentence

    Returns:
        (Dataset) torchtext Dataset containing 'kor' and 'eng' Fields
    """
    # drop rows whose values are not strings (i.e. missing values)
    missing_rows = [
        idx for idx, row in data.iterrows()
        if type(row.korean) != str or type(row.english) != str
    ]
    data = data.drop(missing_rows)

    # convert each row of the DataFrame to a torchtext 'Example'
    # containing 'kor' and 'eng' Fields
    list_of_examples = [
        Example.fromlist(row.apply(lambda x: clean_text(x)).tolist(),
                         fields=[('kor', kor), ('eng', eng)])
        for _, row in data.iterrows()
    ]

    # construct a torchtext 'Dataset' from the list of 'Example's
    dataset = Dataset(examples=list_of_examples,
                      fields=[('kor', kor), ('eng', eng)])

    return dataset
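A hedged example of the expected input: a DataFrame with 'korean' and 'english' columns, as the `row.korean` / `row.english` accesses require. It assumes the module's `clean_text` helper is in scope:

import pandas as pd
from torchtext.data import Field

kor = Field()
eng = Field(lower=True)
df = pd.DataFrame({'korean': ['나는 학생이다'],
                   'english': ['I am a student']})
ds = convert_to_dataset(df, kor, eng)
print(ds.examples[0].kor, ds.examples[0].eng)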
def __init__(self, annFile, text_field, transform=None):
    from pycocotools.coco import COCO
    coco = COCO(annFile)
    ids = list(coco.imgs.keys())
    field = [("text", text_field)]
    examples = []
    max_seq_len = 0
    for i in ids:
        ann_ids = coco.getAnnIds(imgIds=i)
        anns = coco.loadAnns(ann_ids)
        for ann in anns:
            caption = ann['caption']
            if transform is not None:
                caption = transform(caption)
            if len(caption) > max_seq_len:
                max_seq_len = len(caption)
            examples.append(Example.fromlist([caption], field))
    self.max_seq_len = max_seq_len + 2  # one for <sos> and one for <eos>
    super().__init__(examples=examples, fields=field)