def main():
    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-data_pkl',
                        required=True,
                        help='Pickle file with vocabulary.')
    parser.add_argument('-trg_data', default='PSLG-PC12/ENG-ASL_Test.en')
    parser.add_argument('-pred_data',
                        default='predictions.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence""")
    opt = parser.parse_args()

    data = pickle.load(open(opt.data_pkl, 'rb'))
    SRC, TRG = data['vocab']['src'], data['vocab']['trg']

    fields = [('src', SRC)]

    with open(opt.trg_data, 'r') as f:
        trg_loader = Dataset(
            examples=[Example.fromlist([x], fields) for x in f],
            fields={'src': SRC})
    # references_corpus: one list of reference token lists per example
    trg_txt = [[x.src] for x in trg_loader]

    with open(opt.pred_data, 'r') as f:
        pred_loader = Dataset(
            examples=[Example.fromlist([x], fields) for x in f],
            fields={'src': SRC})
    # candidate_corpus: one token list per predicted sentence
    pred_txt = [x.src for x in pred_loader]

    score = bleu_score(pred_txt, trg_txt)
    print('BLEU-4 score is {}'.format(score))

    with open('bleu_score.txt', 'w') as f:
        f.write('BLEU-4 score is {}'.format(score))
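For reference, torchtext's bleu_score takes the candidate corpus first (one token list per prediction) and the references corpus second (one list of reference token lists per example), which is why pred_txt and trg_txt above have different shapes. A minimal sketch with made-up tokens:

# Sketch only; the sentences are toy data, not taken from the example above.
from torchtext.data.metrics import bleu_score

candidates = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
references = [[['the', 'cat', 'sat', 'on', 'the', 'mat'],
               ['a', 'cat', 'was', 'on', 'the', 'mat']]]
print(bleu_score(candidates, references))  # 1.0: the candidate matches the first reference exactly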
Example #2
class TestSimpleIterator(object):
    TEXT = Field()
    examples = [
        Example.fromlist(['John loves Mary'], [('text', TEXT)]),
        Example.fromlist(['Mary cries'], [('text', TEXT)]),
    ]
    dataset = Dataset(examples, [('text', TEXT)])
    TEXT.build_vocab(dataset)

    def make_iterator(self):
        return SimpleIterator(self.dataset, device=-1)

    def test_init_minimal(self):
        iterator = SimpleIterator(self.dataset)
        assert iterator.dataset is self.dataset
        assert iterator.batch_size == 1
        assert iterator.train
        assert iterator.device is None
        assert iterator.sort_key is None
        assert not iterator.sort
        assert not iterator.repeat
        assert iterator.shuffle == iterator.train
        assert not iterator.sort_within_batch

    def test_init_full(self):
        iterator = SimpleIterator(self.dataset, train=False, device=-1)
        assert not iterator.train
        assert iterator.device == -1

    def test_next(self):
        iterator = self.make_iterator()
        sample = next(iter(iterator))

        assert isinstance(sample.text, Variable)
        assert sample.text.size(1) == 1
Example #3
def stratified_sampler(train, test, target, text_field, label_field):
    shuffler = StratifiedShuffleSplit(n_splits=1,
                                      train_size=0.7,
                                      test_size=0.30)
    X = []
    y = []
    fields = [('text', text_field), (target[0], label_field)]

    for example in train:
        X.append(getattr(example, "text"))
        y.append(getattr(example, target[0]))

    for example in test:
        X.append(getattr(example, "text"))
        y.append(getattr(example, target[0]))

    train_idx, test_idx = list(shuffler.split(X, y))[0]

    trn = Dataset(
        examples=[Example.fromlist([X[i], y[i]], fields) for i in train_idx],
        fields=fields)
    tst = Dataset(
        examples=[Example.fromlist([X[i], y[i]], fields) for i in test_idx],
        fields=fields)

    return trn, tst
Example #4
 def build_examples(self):
     examples = []
     if self.test:
         # If this is the test set, do not load labels
         for text in tqdm(self.data[self.text_field]):
             examples.append(Example.fromlist([text, None], self.fields))
     else:
         for text, label in tqdm(
                 zip(self.data[self.text_field],
                     self.data[self.label_field])):
             # Example: Defines a single training or test example.
             # Stores each column of the example as an attribute.
             examples.append(Example.fromlist([text, label], self.fields))
     return examples
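As the comment above notes, Example.fromlist runs each value through its paired field's preprocess() and stores the result as an attribute named after the field. A small illustration (field names and the sample row are arbitrary):

# Sketch only; these legacy classes moved to torchtext.legacy.data in torchtext 0.9+.
from torchtext.data import Example, Field

TEXT = Field()                   # sequential by default: whitespace tokenization
LABEL = Field(sequential=False)  # labels are stored as-is

ex = Example.fromlist(['a short sample sentence', 'positive'],
                      [('text', TEXT), ('label', LABEL)])
print(ex.text)   # ['a', 'short', 'sample', 'sentence']
print(ex.label)  # 'positive'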
Example #5
def predict(model, texts, vocabulary, device):
    src_field = TranslationField()
    index_field = RawField()
    examples = [
        Example.fromlist([x, i], [('src', src_field), ('index', index_field)])
        for i, x in enumerate(texts)
    ]
    dataset = Dataset(examples=examples,
                      fields=[('src', src_field), ('index', index_field)])
    src_field.vocab = vocabulary
    iterator = Iterator(dataset=dataset,
                        batch_size=2048,
                        sort=False,
                        sort_within_batch=True,
                        sort_key=lambda x: len(x.src),
                        device=device,
                        repeat=False,
                        shuffle=False)

    texts = []
    indices = []
    for data in tqdm(iterator):
        texts.extend(
            translate(model=model,
                      vocabulary=vocabulary,
                      data=data,
                      max_seq_len=100,
                      device=device))
        indices.extend(data.index)
    prediction = pd.DataFrame([texts, indices]).T.rename(columns={
        0: 'fullname_prediction',
        1: 'index'
    })
    prediction = prediction.sort_values('index')
    return prediction
Example #6
 def __init__(self, sentences: list, s_postags: list, s_lemmas: list,
              labels: list, fields: list):
     super(FrameTargetDataset, self).__init__([
         Example.fromlist([tokens, postags, lemmas, label], fields)
         for tokens, postags, lemmas, label in zip(sentences, s_postags,
                                                   s_lemmas, labels)
     ], fields)
Example #7
def prepareTranslationData(src_path, trg_path, proportions, fields):
    if not isinstance(fields[0], (tuple, list)):
        fields = [('src', fields[0]), ('trg', fields[1])]
    src, trg = readFiles(src_path, trg_path)
    examples = [
        Example.fromlist(data=[src_line, trg_line], fields=fields)
        for src_line, trg_line in zip(src, trg)
    ]
    train, val, test = splits(examples,
                              train=proportions[0],
                              val=proportions[1],
                              test=proportions[2])
    return tuple(MyTranslationDataset(data, fields)
                 for data in (train, val, test) if data is not None)
Example #8
def lazy_examples(csv_source):
    with open(csv_source) as f:
        reader = csv.reader(f)
        next(reader)
        for text, title in reader:
            yield Example.fromlist([text, title], [('text', text_field),
                                                   ('title', text_field)])
Example #9
    def __init__(self, path, text_field, visual_field, acoustic_field,
                 label_field, **kwargs):
        """Create an MOSI dataset instance given a path and fields.

        Arguments:
            path: Path to the dataset's highest level directory
            text_field: The field that will be used for text data.
            visual_field: The field that will be used for visual data.
            acoustic_field: The field that will be used for acoustic data.
            label_field: The field that will be used for label data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        fields = [('text', text_field), ('visual', visual_field),
                  ('acoustic', acoustic_field), ('label', label_field)]
        examples = []

        with open(path, 'rb') as f:
            data = pickle.load(f)
        for ex in data:
            (text, visual, acoustic), label, _ = ex
            examples.append(
                Example.fromlist([text, visual, acoustic, label], fields))

        super(MOSI, self).__init__(examples, fields, **kwargs)
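A hedged usage sketch for the class above. The field settings and the pickle path are assumptions; the snippet only tells us that the pickle yields ((text, visual, acoustic), label, _) tuples:

# Sketch only; the field configuration and the path are illustrative, not from the original project.
text_field = Field()
visual_field = Field(sequential=False, use_vocab=False)    # precomputed visual features
acoustic_field = Field(sequential=False, use_vocab=False)  # precomputed acoustic features
label_field = Field(sequential=False, use_vocab=False)

mosi = MOSI('mosi.pkl', text_field, visual_field, acoustic_field, label_field)
text_field.build_vocab(mosi)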
Example #10
    def init_dataloaders(self):
        batch_size = self.config.get('batch_size', 8)
        project_path = self.config['firelab']['project_path']
        data_path = os.path.join(project_path, self.config['data'])

        with open(data_path) as f:
            lines = f.read().splitlines()

        text = Field(init_token='<bos>', eos_token='<eos>', batch_first=True)

        examples = [Example.fromlist([s], [('text', text)]) for s in lines]
        dataset = Dataset(examples, [('text', text)])
        # TODO: torchtext is insane. We pass split ratio for [train, val, test]
        # and it returns splits for [train, test, val]
        splits = dataset.split(split_ratio=[0.999, 0.0009, 0.0001])
        self.train_ds, self.test_ds, self.val_ds = splits
        text.build_vocab(self.train_ds)

        self.vocab = text.vocab
        self.train_dataloader = data.BucketIterator(self.train_ds,
                                                    batch_size,
                                                    repeat=False)
        self.val_dataloader = data.BucketIterator(self.val_ds,
                                                  batch_size,
                                                  train=False,
                                                  sort=False)
        self.test_dataloader = data.BucketIterator(self.test_ds,
                                                   batch_size,
                                                   train=False,
                                                   sort=False)
Example #11
 def __init__(self,
              path: str,
              fields: Sequence[Tuple[str, Field]],
              num_samples: Optional[int] = None,
              add_cls: bool = False,
              random_state: int = 162,
              max_len: Optional[int] = None,
              verbose: bool = True,
              **kwargs):
     duplicate_spaces_re = re.compile(r' +')
     with open(path, 'r', encoding='utf-8') as fp:
         all_data = []
         reader = unicode_csv_reader(fp)
         for row in reader:
             cls, text = row[0], row[1]
             if max_len is not None and len(text.split()) > max_len:
                 continue
             text = text.replace('\\n\\n', '\\n ')
             text = duplicate_spaces_re.sub(' ', text)
             data = (text, text, cls) if add_cls else (text, text)
             all_data.append(data)
     if num_samples is not None and num_samples < len(all_data):
         random.seed(random_state)
         all_data = random.sample(all_data, num_samples)
     examples = []
     for data in tqdm(all_data,
                      desc='Converting data into examples',
                      disable=not verbose):
         examples.append(Example.fromlist(data=data, fields=fields))
     super().__init__(examples=examples, fields=fields, **kwargs)
Example #12
    def __init__(self, path: str, ext: str, field: Field, **kwargs) -> None:
        """
        Create a monolingual dataset (=only sources) given path and field.

        :param path: Prefix of path to the data file
        :param ext: Containing the extension to path for this language.
        :param field: Containing the fields that will be used for data.
        :param kwargs: Passed to the constructor of data.Dataset.
        """

        fields = [('src', field)]

        if hasattr(path, "readline"):  # special usage: stdin
            src_file = path
        else:
            src_path = expanduser(path + ext)
            src_file = open(src_path)

        examples = []
        for src_line in src_file:
            src_line = src_line.strip()
            if src_line != '':
                examples.append(Example.fromlist([src_line], fields))

        src_file.close()

        super(MonoDataset, self).__init__(examples, fields, **kwargs)
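A brief usage sketch for MonoDataset. The file prefix and extension are placeholders; the stdin call relies on the hasattr(path, "readline") branch shown above:

# Sketch only; paths and field settings are illustrative.
import sys

src_field = Field(eos_token='<eos>')

# Regular file: the prefix and extension are concatenated inside __init__.
test_data = MonoDataset(path='data/test', ext='.de', field=src_field)

# Standard input: any object with a readline() method is used directly.
stdin_data = MonoDataset(path=sys.stdin, ext='', field=src_field)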
Example #13
def build_examples(datas, fields):
    examples = []
    for data in datas:
        example = Example.fromlist(data, fields)
        examples.append(example)

    return examples
Example #14
 def __init__(self, data_dir):
     self.data_dir = data_dir
     fname = 'corpus.json'
     # fields
     id_field = Field(sequential=False, unk_token=None)
     text_field = Field(include_lengths=True)
     timestep_field = Field(sequential=False,
                            use_vocab=False,
                            unk_token=None)
     fields = [('id', id_field), ('text', text_field),
               ('timestep', timestep_field)]
     # load examples
     fpath = os.path.join(data_dir, fname)
     print('Loading {}...'.format(fpath))
     with open(fpath, 'r') as f:
         corpus = json.load(f)
     examples = [
         Example.fromlist([ex['id'], ex['text'], ex['timestep']], fields)
         for ex in corpus
     ]
     dataset = Dataset(examples, fields)
     id_field.build_vocab(dataset)
     self.examples = examples
     self.fields = OrderedDict(fields)
     self.nts = max([ex.timestep for ex in self.examples]) + 1
Example #15
def prepare_dataset(dataset):
    context, query, label, start, end = list(zip(*dataset))

    dataset = list(
        zip(context, deepcopy(context), query, deepcopy(query), start, end,
            label))
    TEXT = Field(lower=True, include_lengths=False, batch_first=True)
    CHAR = RawField()
    LABEL = Field(sequential=False, tensor_type=torch.LongTensor)

    examples = []
    for i, d in enumerate(dataset):
        if i % 100 == 0: print('[%d/%d]' % (i, len(dataset)))
        examples.append(
            Example.fromlist(d, [('context', TEXT), ('context_c', CHAR),
                                 ('query', TEXT), ('query_c', CHAR),
                                 ('start', LABEL), ('end', LABEL),
                                 ('label', TEXT)]))

    dataset = Dataset(examples, [('context', TEXT), ('context_c', CHAR),
                                 ('query', TEXT), ('query_c', CHAR),
                                 ('start', LABEL), ('end', LABEL),
                                 ('label', TEXT)])
    TEXT.build_vocab(dataset, min_freq=2)
    #CHAR.build_vocab(dataset)

    return dataset, TEXT, CHAR
Example #16
 def load_examples(cls, src_path, label_path, fields):
     texts = [line.rstrip('\n') for line in open(src_path)]
     labels = [line.rstrip('\n') for line in open(label_path)]
     examples = []
     for t, l in zip(texts, labels):
         examples.append(Example.fromlist([t, l], fields))
     return examples
Example #17
    def tokenize(self, path):
        assert os.path.exists(path)

        with open(path, "r", encoding="utf-8") as f:
            texts, tags = list(), list()
            examples = list()

            for line in f:
                if len(line.split()) > 0:
                    text, tag = line.split()
                    texts.append(text)
                    tags.append(tag)

                else:
                    assert len(texts) == len(tags)
                    example = Example.fromlist([texts, tags],
                                               fields=[('texts', self.TEXT),
                                                       ('tags', self.TAG)])
                    examples.append(example)
                    texts, tags = list(), list()

        dataset = Dataset(examples,
                          fields={
                              'texts': self.TEXT,
                              'tags': self.TAG
                          })
        return dataset
Example #18
    def __init__(self, path, fields, tokenizer, label2id):
        examples = []
        data = load_data(path)
        for (text, arguments) in data:
            input_ids, token_type_ids = tokenizer.encode(text,
                                                         max_length=max_length)

            seq_len = len(input_ids)
            labels = [0] * seq_len
            attention_mask = [1] * seq_len
            for argument in arguments.items():
                a_token_ids = tokenizer.encode(argument[0])[0][1:-1]
                start_index = search(a_token_ids, input_ids)
                # if start_index != -1:
                #     for i in range(0, len(a_token_ids)):
                #         labels[start_index + i] = label2id[argument[1]]
                if start_index != -1:
                    labels[start_index] = label2id[argument[1]]
                    for i in range(1, len(a_token_ids)):
                        labels[start_index + i] = label2id[argument[1]]

            assert len(input_ids) == len(token_type_ids) == len(
                labels) == seq_len

            examples.append(
                Example.fromlist([
                    input_ids, token_type_ids, attention_mask, labels, seq_len
                ], fields))
        super().__init__(examples, fields)
Example #19
 def transform(self, X, y=None):
     with warnings.catch_warnings(record=True):
         fields = [(name, field) for (name, field) in self.fields
                   if name in X]
         proc = [X[col].apply(f.preprocess) for col, f in fields]
         examples = [Example.fromlist(f, fields) for f in zip(*proc)]
         return Dataset(examples, fields)
Example #20
 def read_data(corpus_file, fields, max_len=None):
     train_id_start = 0
     test_id_start = 76049  # let the ids for the test examples start after the training example indices
     if corpus_file == "wsd_test_blind.txt":
         print("Loading test data...")
         id_start = test_id_start
     else:
         print("Loading train/val data...")
         id_start = train_id_start
     with open(corpus_file, encoding='utf-8') as f:
         examples = []
         for i, line in enumerate(f):
             sense, lemma, word_position, text = line.split('\t')
             # We need to convert from the word position to the token position
             words = text.split()
             pre_word = " ".join(words[:int(word_position)])
             pre_word_tokenized = tokenizer.tokenize(pre_word)
             token_position = len(
                 pre_word_tokenized
             ) + 1  # taking into account the later addition of the start token
             example_id = id_start + i
             if max_len is None or token_position < max_len - 1:  # ignore examples where the relevant token is cut off due to max_len
                 if cls_token:
                     token_position = 0
                 examples.append(
                     Example.fromlist(
                         [sense, lemma, token_position, text, example_id],
                         fields))
             else:
                 print(
                     "Example %d is skipped because the relevant token was cut off (token pos = %d)"
                     % (example_id, token_position))
                 print(text)
     return Dataset(examples, fields)
Example #21
    def __init__(
            self,
            question_path,
            paragraph_path,
            ratio,
            batch_size,
            vocab: Vocab = Ref("model.vocab"),
            batch_first=Ref("model.batch_first", True),
    ):
        self.vocab = vocab
        question = Field(include_lengths=True,
                         batch_first=batch_first,
                         pad_token=vocab.pad_token)
        question.vocab = vocab
        paragraph = Field(batch_first=batch_first, pad_token=vocab.pad_token)
        paragraph.vocab = vocab
        paragraphs = NestedField(paragraph, include_lengths=True)
        paragraphs.vocab = vocab
        target = Field(sequential=False, use_vocab=False, is_target=True)

        fields = [("question", question), ("paragraphs", paragraphs),
                  ("target", target)]
        examples = []
        with open(paragraph_path) as paragraph_file, open(
                question_path) as question_file:
            for q in question_file:
                q = q.strip()
                ps = [paragraph_file.readline().strip() for _ in range(ratio)]
                examples.append(Example.fromlist([q, ps, 0], fields))

        BaseIRDataset.__init__(self, ratio, batch_size, batch_first)
        TorchTextDataset.__init__(self, examples, fields)
Example #22
def classify():
    req_body = json.loads(request.data)
    test_1_title = req_body['title']
    test_1_text = req_body['text']
    test_1_titletext = test_1_title + ". " + test_1_text

    example_item = Example.fromlist(
        [test_1_title, test_1_text, test_1_titletext], fields)
    eval_ds = Dataset(examples=[example_item], fields=fields, filter_pred=None)
    eval_iter = Iterator(eval_ds,
                         batch_size=1,
                         device=device,
                         train=False,
                         shuffle=False,
                         sort=False)

    best_model.eval()
    with torch.no_grad():
        for (title, text, titletext), _ in eval_iter:
            titletext = titletext.type(torch.LongTensor)
            titletext = titletext.to(device)
            output = best_model(titletext, None)
            output = output[0]

            prediction = torch.argmax(output, 1).tolist()[0]

    return {"classification_result": bool(prediction)}
Example #23
    def split(self,
              split_ratio=0.7,
              stratified=False,
              strata_field='label',
              random_state=None):
        if stratified or random_state:
            raise NotImplementedError()

        text = self.examples[0].text
        train_len = int(len(text) * split_ratio)
        fields = ('text', self.fields['text'])
        train_example = [Example.fromlist([text[0:train_len]], [fields])]
        test_example = [Example.fromlist([text[train_len:]], [fields])]

        return Dataset(train_example,
                       self.fields), Dataset(test_example, self.fields)
Example #24
 def load_source_data(filename, field):
     examples = []
     with open(filename) as src_file:
         for src_line in src_file:
             src_line = src_line.strip()
             examples.append(Example.fromlist([src_line], [field]))
     return examples
Example #25
def torch_text_from_memory():
    tokenize = lambda x: x.split()
    SENTENCE_FIELD = Field(sequential=True,
                           tokenize=tokenize,
                           pad_token="<unk>")
    VERB_FORM_FIELD = Field(sequential=False)
    datafields = [("sentence", SENTENCE_FIELD), ("verb_form", VERB_FORM_FIELD)]

    # data = [{"sentence": "I king", "verb_form": "am"},
    #        {"sentence": "You my friend", "verb_form": "were"},
    #        {"sentence": "They my friend", "verb_form": "are"},
    #        {"sentence": "We kings", "verb_form": "are"},
    #        {"sentence": "I have strong", "verb_form": "been"},
    #        {"sentence": "We enemies", "verb_form": "were"}]
    data = [(
        "When the modern Olympics began in 1896, the initiators and organizers looking for a great popularizing event",
        "were"), ("I king", "am"), ("You my friend", "were"),
            ("They my friend", "are"), ("We kings", "are"),
            ("I have strong", "been"), ("We enemies", "were")]
    examples = []
    for d in data:
        examples.append(Example.fromlist(d, datafields))

    training_dataset, validation_dataset, test_dataset = Dataset(
        examples, datafields).split([0.33, 0.33, 0.33])
    SENTENCE_FIELD.build_vocab(training_dataset)
    VERB_FORM_FIELD.build_vocab(training_dataset)
    return SENTENCE_FIELD, VERB_FORM_FIELD, training_dataset, validation_dataset, test_dataset
Example #26
 def __init__(self,
              dataset_fn,
              top_k=None,
              min_len=7,
              add_init_eos=True,
              **kwargs):
     if add_init_eos:
         fields = [('text',
                    Field(sequential=True,
                          use_vocab=True,
                          tokenize=tokenize,
                          init_token='<START>',
                          eos_token='<END>'))]
     else:
         fields = [('text',
                    Field(sequential=True,
                          use_vocab=True,
                          tokenize=tokenize))]
     examples = []
     counter = 0
     with open(dataset_fn, 'r') as in_file:
         for line in in_file:
             if top_k and counter >= top_k:
                 break
             stripped = line.strip()
             if len(stripped) < min_len:
                 continue
             examples.append(Example.fromlist([stripped], fields))
             counter += 1
     super(AutoencodingDataset, self).__init__(examples, fields, **kwargs)
     fields[0][1].build_vocab(self)
Example #27
    def classify_from_strings(self, strings: Union[List[str], str]) -> list:
        """

        method that can be used for classifying one or multiple examples with a trained classifier

        :param strings: a single string or a list of strings representing the pieces of text that should be classified
        :return: list containing the predictions of the models for the inputted pieces of text
        """
        assert self.has_trained
        if isinstance(strings, str):
            strings = [strings]
        if isinstance(strings, list):
            strings = [[string] for string in strings]

        fields = [('text', self._TEXT)]

        list_of_examples = [Example.fromlist(string, fields) for string in strings]
        dataset = torchtext.data.Dataset(list_of_examples, fields)

        data = Iterator(dataset, batch_size=1, device=torch.device("cpu"), sort=False, sort_within_batch=False,
                        repeat=False, shuffle=False)

        predictions = []

        for item in data:
            x = item.text
            self.model.to(self.device)
            self.model = self.model.eval()
            outputs = self.model([x[0].to(self.device), x[1].to(self.device)])
            predictions.extend(outputs.detach().cpu().argmax(1).tolist())
        results = [self._label_names[i] for i in predictions]
        return results
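Given an already-trained instance of the classifier (called clf below, hypothetical), the method accepts either a single string or a list of strings and returns the predicted label names:

# Sketch only; clf stands for a trained instance of the class above.
single = clf.classify_from_strings("This movie was surprisingly good.")
batch = clf.classify_from_strings([
    "This movie was surprisingly good.",
    "I would not watch it again.",
])
print(single, batch)  # label names depend on the labels the model was trained on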
Example #28
    def __init__(self, path, exts, fields, max_len=None, **kwargs):
        assert len(exts) == len(fields), 'number of exts must match number of fields'
        self.N = len(fields)

        if not isinstance(fields[0], (tuple, list)):
            newfields = [('src', fields[0]), ('trg', fields[1])]
            for i in range(len(exts) - 2):
                newfields.append(('extra_{}'.format(i), fields[2 + i]))
            # self.fields = newfields
            fields = newfields

        paths = tuple(os.path.expanduser(path + '.' + x) for x in exts)
        # self.max_len = max_len
        examples = []

        with ExitStack() as stack:
            files = [
                stack.enter_context(open(fname, encoding='utf-8'))
                for fname in paths
            ]
            for i, lines in enumerate(zip(*files)):
                lines = [line.strip() for line in lines]
                if not any(line == '' for line in lines):
                    example = Example.fromlist(lines, fields)
                    examples.append(example)
                    # if max_len is None:
                    #     examples.append(example)
                    # elif len(example.src) <= max_len and len(example.trg) <= max_len:
                    #     examples.append(example)
        super(ParallelDataset, self).__init__(examples, fields, **kwargs)
Example #29
def make_dataset(path,
                 fields,
                 _log,
                 name='train',
                 use_prefix=False,
                 use_suffix=False):
    assert len(fields) in range(
        2, 8), 'fields should have between 2 and 7 elements'

    _log.info('Creating %s dataset', name)
    reader = read_corpus(path, name=name) if isinstance(path, str) else path
    examples = []
    for id_, tagged_sent in enumerate(reader.tagged_sents()):
        words, tags = zip(*tagged_sent)
        data = [words, tags]
        if use_prefix:
            prefs_2 = [w[:2] for w in words]
            prefs_3 = [w[:3] for w in words]
            data.extend([prefs_2, prefs_3])
        if use_suffix:
            suffs_2 = [w[-2:] for w in words]
            suffs_3 = [w[-3:] for w in words]
            data.extend([suffs_2, suffs_3])
        if len(fields) in (3, 5, 7):
            data.append(id_)
        examples.append(Example.fromlist(data, fields))
    return Dataset(examples, fields)
Example #30
def convert_to_dataset(data, kor, eng):
    """
    Pre-process input DataFrame and convert pandas DataFrame to torchtext Dataset.
    Args:
        data: (DataFrame) pandas DataFrame to be converted into torchtext Dataset
        kor: torchtext Field containing Korean sentence
        eng: torchtext Field containing English sentence

    Returns:
        (Dataset) torchtext Dataset containing 'kor' and 'eng' Fields
    """
    # drop missing values not containing str value from DataFrame
    missing_rows = [
        idx for idx, row in data.iterrows()
        if type(row.korean) != str or type(row.english) != str
    ]
    data = data.drop(missing_rows)

    # convert each row of DataFrame to torchtext 'Example' containing 'kor' and 'eng' Fields
    list_of_examples = [
        Example.fromlist(row.apply(lambda x: clean_text(x)).tolist(),
                         fields=[('kor', kor), ('eng', eng)])
        for _, row in data.iterrows()
    ]

    # construct torchtext 'Dataset' using torchtext 'Example' list
    dataset = Dataset(examples=list_of_examples,
                      fields=[('kor', kor), ('eng', eng)])

    return dataset
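A small usage sketch for convert_to_dataset. The DataFrame contents, the Fields, and the clean_text stub are stand-ins for whatever the surrounding project defines:

# Sketch only; clean_text is a trivial placeholder for the project's real helper.
import pandas as pd

def clean_text(text):
    return text.strip()

kor = Field(tokenize=lambda s: s.split())
eng = Field(tokenize=lambda s: s.split(), lower=True)

df = pd.DataFrame({'korean': ['안녕하세요 세계'], 'english': ['Hello world']})
dataset = convert_to_dataset(df, kor, eng)
print(dataset.examples[0].kor, dataset.examples[0].eng)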
Example #31
	def __init__(self,annFile,text_field,transform=None):
		from pycocotools.coco import COCO
		coco = COCO(annFile)
		ids = list(coco.imgs.keys())
		transform = transform
		field = [("text",text_field)]
		examples = []
		max_seq_len = 0
		for i in ids:
			ann_ids = coco.getAnnIds(imgIds=i)
			anns = coco.loadAnns(ann_ids)
			for ann in anns:
				caption = ann['caption']
				if transform is not None:
					caption = transform(caption)
				if len(caption) > max_seq_len:
					max_seq_len = len(caption)
				examples.append(Example.fromlist([caption],field))
		self.max_seq_len = max_seq_len + 2 # one for <sos> and one for <eos>
		super().__init__(examples=examples,fields=field)