Example #1
    def create_iter(self, batch_size):
        """
        构建迭代器
        :param batch_size: 每批的大小
        :return: iter
        """
        # 定义torchtext中的Field
        fields = [('english', self.english), ('chinese', self.chinese)]
        examples = []
        # build English-Chinese Examples
        for en, ch in zip(self.english_list, self.chinese_list):
            item = [en, ch]
            examples.append(data.Example.fromlist(item, fields))
        # split into training and test sets
        train, test = Dataset(examples=examples,
                              fields=fields).split(split_ratio=0.8)
        self.english.build_vocab(train)
        self.chinese.build_vocab(train)
        self.english_voca_size = len(self.english.vocab)
        self.chinese_voca_size = len(self.chinese.vocab)
        train_iter, test_iter = Iterator.splits(
            (train, test),
            batch_sizes=(batch_size, len(test)),
            sort_key=lambda x: len(x.english),
            sort_within_batch=True,
            device=-1)

        return train_iter, test_iter
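A minimal usage sketch for the iterators returned above; `handler` stands for an instance of the surrounding (unnamed) class, so the constructor call is omitted and the tensor layout assumes the default Field settings:

# `handler` is an instance of the class defining create_iter() above, with
# self.english / self.chinese Fields and the raw sentence lists already set.
train_iter, test_iter = handler.create_iter(batch_size=64)

for batch in train_iter:
    src = batch.english   # token ids, [seq_len, batch] with default Field settings
    trg = batch.chinese   # Chinese side, same layout
    break                 # feed src/trg into a seq2seq model here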
Example #2
def load_dataset(data_path, train_batch_size=4096, dev_batch_size=1, max_len=100):
    """
    This assumes that the data is already pre-processed using Moses Tokenizer
    Returns iterators for the train/dev/test datasets, plus the SRC and TRG fields

    Arguments:
        data_path: path of the dataset
        train_batch_size: batch size of the training data (defined in terms of number of tokens or sentences, depending on the model_type)
        dev_batch_size: batch size of the dev data (usually one)
        max_len: max length of sequences in a batch
    """

    SRC = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)
    TRG = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)

    # create a TranslationDataset for both the train and dev set
    train_data = datasets.TranslationDataset(
        path=data_path, exts=("train.de", "train.en"), fields=(SRC, TRG),
        filter_pred=lambda x: len(vars(x)['src']) <= max_len and len(vars(x)['trg']) <= max_len)

    dev_data = datasets.TranslationDataset(
        exts=("dev.de", "dev.en"), fields=(SRC, TRG), path=data_path)

    # load in the Test Set
    test_examples = []
    with open(data_path + "test.de", "r") as f:
        for test_example in f.readlines():
            example = data.Example()
            setattr(example, "src", test_example.split())
            test_examples.append(example)

    test_data = data.Dataset(test_examples, fields=[("src", SRC)])

    # build the vocab using the training data
    SRC.build_vocab(train_data.src, train_data.trg)
    TRG.build_vocab(train_data.src, train_data.trg)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use the custom DataIterator in order to minimize padding within a
    # sequence and to pack each batch as fully as possible, maximizing
    # GPU utilization
    train_iterator = DataIterator(train_data, batch_size=train_batch_size, device=device,
                                  repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                                  batch_size_fn=batch_size_fn, train=True, sort_within_batch=True, shuffle=True)

    # use a regular Iterator since we want to be able to compare
    # our translations to a gold standard file. If we use a
    # `DataIterator` then we will get our translations in shuffled/random
    # order
    dev_iterator = Iterator(dev_data, batch_size=dev_batch_size,
                            train=False, sort=False, repeat=False, device=device)

    # create Test Iterator for the test data
    test_iterator = Iterator(
        test_data, batch_size=1, train=False, sort=False, repeat=False, device=device)

    print(len(test_iterator))
    return train_iterator, dev_iterator, test_iterator, SRC, TRG
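A usage sketch for the returned iterators (the data path is a placeholder); since both Fields are built with include_lengths=True, every batch attribute is a (ids, lengths) pair:

train_iterator, dev_iterator, test_iterator, SRC, TRG = load_dataset("data/de-en/")  # placeholder path

for batch in train_iterator:
    src, src_lengths = batch.src   # include_lengths=True -> (ids, lengths) tuple
    trg, trg_lengths = batch.trg   # ids are [batch, seq_len] because batch_first=True
    # src_lengths can be fed to pack_padded_sequence in the encoder
    break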
Example #3
def product_dataset(path, text_field, label_field):
    fields = [('text', text_field), ('label', label_field)]
    texts, labels = read_product(path, 1, 1)
    examples = []
    for text, label in zip(texts, labels):
        examples.append(data.Example.fromlist([text, label], fields))
    ret_data = data.Dataset(examples, fields)
    ret_data.sort_key = lambda x: -1 * len(x.text)
    return ret_data
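A sketch of wiring this builder into the usual torchtext pipeline; the Field settings and data path are assumptions, and read_product from the surrounding module must be importable:

from torchtext import data  # torchtext 0.9-0.11: from torchtext.legacy import data

text_field = data.Field(lower=True, batch_first=True)
label_field = data.LabelField()

dataset = product_dataset('data/products/', text_field, label_field)  # placeholder path
text_field.build_vocab(dataset, max_size=25000)
label_field.build_vocab(dataset)

# the sort_key attached above (descending text length) keeps similarly sized
# texts together, which minimizes padding per batch
train_iter = data.BucketIterator(dataset, batch_size=32, sort_within_batch=True)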
Example #4
 def create_iter(self, split_ratio, batch_size=1000, device=-1):
     fields = [("text", self.TEXT), ("label", self.LABEL)]
     examples = []
     for review, label in zip(self.reviews, self.labels):
         item = [review, label]
         examples.append(data.Example.fromlist(item, fields))
     train, valid, test = Dataset(examples=examples, fields=fields).split(split_ratio=split_ratio)
     self.TEXT.build_vocab(train)
     self.LABEL.build_vocab(train)
     voca_size = len(self.TEXT.vocab)
     train_iter, val_iter, test_iter = data.Iterator.splits(
         (train, valid, test), sort_key=lambda x: len(x.text),
         batch_sizes=(batch_size, len(valid), len(test)), device=device)
     return train_iter, val_iter, test_iter, voca_size
Example #5
 def read(self, path, text_field, label_field):
     fields = [('text', text_field), ('label', label_field)]
     texts = self.read_dataset(path)
     examples = []
     for text in texts:
         if len(text) < 3:
             continue
         #_data = [text[:-1], text[1:]]
         _data = [text, text]
         examples.append(data.Example.fromlist(_data, fields))
     ret_data = data.Dataset(examples, fields)
     ret_data.sort_key = lambda x: -1 * len(x.text)
     return ret_data
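The commented-out line hints at a language-model setup (input shifted against target); since this version stores the same sequence in both fields, one way to recover that shift is on the batch tensors. A sketch, assuming default Field settings and that `reader` is an instance of the class defining read() above:

dataset = reader.read('corpus.txt', text_field, label_field)  # placeholder path
text_field.build_vocab(dataset)

for batch in data.BucketIterator(dataset, batch_size=32, sort_within_batch=True):
    tokens = batch.text                              # [seq_len, batch] by default
    inputs, targets = tokens[:-1, :], tokens[1:, :]  # next-token prediction pairs
    break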
Example #6
    def tokenize(self, text, output='conllu'):
        text = [t for t in text.split("\n") if len(t) > 0]
        examples = [data.Example.fromlist([t], fields=[('text', self.TEXT)]) for t in text]
        dataset = data.Dataset(examples, fields=[('text', self.TEXT)])
        data_iter = data.BucketIterator(dataset, 
            batch_size=self.BATCH_SIZE,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
            shuffle=False,
            device=device)
        
        with torch.no_grad():
            preds = []
            for batch in data_iter:
                t, l = batch.text
                predictions = self.model(t, l)           
                predictions = predictions.float()
                _, rounded_preds = torch.max(torch.sigmoid(predictions), 2)
                preds.append(rounded_preds)

        sents = []
        tokens = []
        for item in list(zip(text, preds[::-1])):
            text = item[0]
            tags = item[1]
            token = ''
            for i in tqdm(range(len(tags[0]))):
                if int(tags[0][i]) == 0:
                    token += text[i]
                elif int(tags[0][i]) == 1:
                    token += text[i]
                    if output == 'conllu':
                        space_after = 1 if i + 1 < len(text) and text[i + 1] == ' ' else 0
                        tokens.append((token.strip(), space_after))
                    else:
                        tokens.append(token.strip())
                    token = ''
                else:
                    token += text[i]
                    if output == 'conllu':
                        tokens.append((token.strip(), 0))
                    else:
                        tokens.append(token.strip())
                    token = ''
                    sents.append(tokens)
                    tokens = []
        return sents
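A small usage sketch; `tok` stands for an instance of the class defining tokenize() above (it already carries self.model, self.TEXT and self.BATCH_SIZE), so only the call and the output shape are shown:

raw = "First sentence. Second one.\nA new paragraph."
sents = tok.tokenize(raw, output='conllu')
for sent in sents:
    for form, space_after in sent:   # 'conllu' output: (token, SpaceAfter flag) pairs
        print(form, space_after)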
Example #7
def imdb_dataset(path, text_field, label_field):
    fields = [('text', text_field), ('label', label_field)]
    texts = read_imdb(path, 'pos', 1, 1)
    labels = [1] * len(texts)
    _texts = read_imdb(path, 'neg', 1, 1)
    labels += [0] * len(_texts)
    texts += _texts

    examples = []
    for text, label in zip(texts, labels):
        examples.append(data.Example.fromlist([text, label], fields))
    ret_data = data.Dataset(examples, fields)
    ret_data.sort_key = lambda x: -1 * len(x.text)
    return ret_data
Example #8
 def create_iter(self, split_ratio, batch_size=100):
     fields = [("text", self.TEXT), ("label", self.LABEL)]
     examples = []
     for index, context in enumerate(self.file):
         d = context.split('\t')
         # item = [text, label]
         item = [d[1], d[0].strip()]
         examples.append(data.Example.fromlist(item, fields))
     train, valid, test = Dataset(
         examples=examples, fields=fields).split(split_ratio=split_ratio)
     self.TEXT.build_vocab(train)
     self.LABEL.build_vocab(train)
     voca_size = len(self.TEXT.vocab)
     train_iter, val_iter, test_iter = data.Iterator.splits(
         (train, valid, test),
         sort_key=lambda x: len(x.text),
         batch_sizes=(batch_size, len(valid), len(test)))
     return train_iter, val_iter, test_iter, voca_size
Example #9
def load_pickle(PATH, FIELDNAMES, FIELD):
    dataList = []
    with open(PATH, "rb") as input_file:
        while True:
            try:
                # each pickled record is a dict keyed by the names in FIELDNAMES
                inputInstance = pickle.load(input_file)
                # collect the two field values for this record
                dataInstance = [
                    inputInstance[FIELDNAMES[0]], inputInstance[FIELDNAMES[1]]
                ]
                # convert the record into a torchtext Example
                dataList.append(data.Example.fromlist(dataInstance,
                                                      fields=FIELD))
            except EOFError:
                break

    # finally, wrap the Examples in a torchtext Dataset
    exampleListObject = data.Dataset(dataList, fields=FIELD)
    return exampleListObject
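A usage sketch clarifying the argument shapes: the pickle file must contain one dict per record, dumped back to back (loaded until EOFError), FIELDNAMES holds the two dict keys, and FIELD is the matching list of (name, Field) pairs. The key names and the file name below are assumptions:

from torchtext import data  # torchtext 0.9-0.11: from torchtext.legacy import data

SRC = data.Field(batch_first=True)
TRG = data.Field(batch_first=True)

FIELDNAMES = ["source", "target"]      # keys of each pickled dict (assumed)
FIELD = [("src", SRC), ("trg", TRG)]   # attribute names the Examples will expose

dataset = load_pickle("train.pkl", FIELDNAMES, FIELD)  # placeholder file
SRC.build_vocab(dataset)
TRG.build_vocab(dataset)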
Example #10
    def __init__(self, path, src_field, trg_field, sep='\t', **kwargs):
        """Create an dataset instance given a path and fields.
        Arguments:
            path: Path to the data file.
            src_field: The field that will be used for source data.
            trg_field: The field that will be used for destion data.
            kwargs: Passed to the constructor of data.Dataset.
        """

        fields = [('src', src_field), ('trg', trg_field)]
        examples = []
        with open(path, errors='ignore') as f:
            for line in f:
                s = line.strip().split(sep)
                if len(s) != 2:
                    continue

                src, trg = s[0], s[1]
                e = data.Example()
                setattr(e, "src", src_field.preprocess(src))
                setattr(e, "trg", trg_field.preprocess(trg))
                examples.append(e)

        super(TranslateDataset, self).__init__(examples, fields, **kwargs)
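A construction sketch for this class, assuming a tab-separated file with one source/target pair per line (the file name and Field settings are assumptions):

from torchtext import data  # torchtext 0.9-0.11: from torchtext.legacy import data

SRC = data.Field(tokenize=str.split, init_token="<s>", eos_token="</s>")
TRG = data.Field(tokenize=str.split, init_token="<s>", eos_token="</s>")

dataset = TranslateDataset("train.tsv", SRC, TRG, sep="\t")  # placeholder file
SRC.build_vocab(dataset, min_freq=2)
TRG.build_vocab(dataset, min_freq=2)

train_iter = data.BucketIterator(dataset, batch_size=32,
                                 sort_key=lambda x: len(x.src),
                                 sort_within_batch=True)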