Code Example #1
def load_af(batch_size: int, min_freq: int = 10, nl_ratio: float = .5) -> \
        Tuple[tt.Iterator, tt.Iterator, tt.Dataset, tt.Field, tt.Field]:
    """
    Loads the Afrikaans data, augmented with Dutch data.

    :param batch_size: The size of the mini-batches
    :param min_freq: A word will only be added to the vocabulary if it
        occurs this many times in the data
    :param nl_ratio: The fraction of the training data that will be
        Dutch
    :return: Iterators for the three datasets, along with the Fields
        for words and POS tags. Only the training data will contain
        Dutch examples
    """
    if not 0 <= nl_ratio <= 1:
        raise ValueError("nl_ratio must be between 0 and 1")

    # Prepare fields
    text_field = tt.Field(init_token="<bos>", eos_token="<eos>", lower=True)
    tags_field = tt.Field(init_token="<bos>",
                          eos_token="<eos>",
                          unk_token=None)
    fields = (("text", text_field), ("udtags", tags_field))

    # Load data
    af = list(
        SequenceTaggingDataset.splits(path="data/af",
                                      fields=fields,
                                      train="train.txt",
                                      test="test.txt",
                                      validation="dev.txt"))
    nl_train = SequenceTaggingDataset("data/nl/nl.txt", fields)

    # Add Dutch examples
    max_nl_ratio = len(nl_train) / (len(af[0]) + len(nl_train))
    if nl_ratio <= max_nl_ratio:
        num_nl_examples = int(nl_ratio * len(af[0]) / (1. - nl_ratio))
        af[0].examples += nl_train.examples[:num_nl_examples]
    else:
        num_af_examples = int(len(nl_train) * (1. - nl_ratio) / nl_ratio)
        af[0].examples = af[0].examples[:num_af_examples] + nl_train.examples

    # Build vocab
    text_field.build_vocab(*af, min_freq=min_freq)
    tags_field.build_vocab(*af)

    device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
    iters = tt.BucketIterator.splits(af, batch_size=batch_size, device=device)

    return iters + (text_field, tags_field)
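A minimal usage sketch for load_af (not part of the original example), assuming the data/af and data/nl directories it expects are in place; the attribute names text and udtags come from the fields tuple defined inside the function:

# Sketch only: assumes data/af/{train,dev,test}.txt and data/nl/nl.txt exist.
train_iter, val_iter, test_iter, text_field, tags_field = load_af(batch_size=32,
                                                                  nl_ratio=0.3)
for batch in train_iter:
    words = batch.text     # [sent len, batch size] word indices
    tags = batch.udtags    # [sent len, batch size] POS-tag indices
    break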
Code Example #2
 def load_data(self):
     ''' load data from file using torchtext '''
     if self.test:
         # built-in datasets
         if self.prefix == 'udpos':
             self.train_set, self.valid_set, self.test_set = UDPOS.splits(
                 fields=((('text', 'char'), (self.text_field,
                                             self.char_field)),
                         ('tag', self.tag_field), ('pos', None)),
                 root=self.data_path)
         if self.prefix == 'conll2000':
             self.train_set, self.valid_set, self.test_set = CoNLL2000Chunking.splits(
                 fields=((('text', 'char'), (self.text_field,
                                             self.char_field)),
                         ('pos', None), ('tag', self.tag_field)),
                 root=self.data_path)
     else:
         # load datasets from pre-prepared tsv files
         self.train_set, self.valid_set, self.test_set = SequenceTaggingDataset.splits(
             fields=((('text', 'char'), (self.text_field, self.char_field)),
                     ('tag', self.tag_field)),
             path=self.data_path + '/{}'.format(self.prefix),
             train='train.tsv',
             validation='dev.tsv',
             test='test.tsv')
Code Example #3
File: tool.py Project: ttjjlw/19DG_competition
 def get_dataset(self, path: str, fields=Fields, separator='\t'):
     logger.info('loading dataset from {}'.format(path))
     st_dataset = SequenceTaggingDataset(path,
                                         fields=fields,
                                         separator=separator)
     logger.info('successfully loaded dataset')
     return st_dataset
Code Example #4
def get_dataset(base_path,
                batch_size,
                pretrained_embedding=None,
                is_inference=False):
    sentence = data.Field(lower=False, include_lengths=True, batch_first=True)
    char_nesting = data.Field(lower=False, tokenize=list)
    char_sentence = data.NestedField(char_nesting, include_lengths=True)
    tags = data.Field(batch_first=True)

    train, val, test = SequenceTaggingDataset.splits(
        path=base_path,
        train="train.txt",
        validation="dev.txt",
        test="test.txt",
        fields=[(("sentence", "char_sentence"), (sentence, char_sentence)),
                ("tags", tags)])
    tags.build_vocab(train.tags)
    if not pretrained_embedding:
        sentence.build_vocab(train.sentence, min_freq=5)
    else:
        sentence.build_vocab(train.sentence, vectors=pretrained_embedding)
    char_sentence.build_vocab(train.char_sentence)

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), [batch_size] * 3,
        repeat=False,
        shuffle=True,
        sort_key=lambda x: len(x.sentence),
        sort_within_batch=True)

    return sentence, char_sentence, tags, val_iter, train_iter, test_iter
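A hypothetical call to the function above (not from the original source); note that the returned iterators come back in the order val, train, test, and "data/conll" is only a placeholder path:

# Sketch only: the directory must contain train.txt, dev.txt and test.txt.
sentence, char_sentence, tags, val_iter, train_iter, test_iter = get_dataset(
    "data/conll", batch_size=16)
tag_pad_idx = tags.vocab.stoi[tags.pad_token]  # index to ignore in the loss
for batch in train_iter:
    words, lengths = batch.sentence  # include_lengths=True gives (tensor, lengths)
    chars = batch.char_sentence      # tuple: (char tensor, sentence lengths, word lengths)
    labels = batch.tags
    break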
Code Example #5
 def __init__(self, path='', glove_name='6B', glove_dim=300):
     fields = [
         ('text', Field(include_lengths=True, sequential=True)),
         ('label', Field(is_target=True, postprocessing=lambda X, voc: [x[0] - 2 for x in X]))
     ]
     self.train_set, self.dev_set = SequenceTaggingDataset.splits(
         path=path, train='train.tsv', validation='dev.tsv', fields=fields)
     self.fields = dict(fields)
     self.fields['text'].build_vocab(
         self.train_set,
         self.dev_set,
         vectors=vocab.GloVe(name=glove_name, dim=glove_dim))
     self.fields['label'].build_vocab(self.train_set, specials=[])
Code Example #6
 def __init__(self, input_folder, min_word_freq, batch_size, wv_file=None):
     # list all the fields
     self.word_field = Field(lower=True)  # [sent len, batch_size]
     self.tag_field = Field(unk_token=None)  # [sent len, batch_size]
     ### BEGIN MODIFIED SECTION: CHARACTER EMBEDDING ###
     self.char_nesting_field = Field(tokenize=list)
     self.char_field = NestedField(
         self.char_nesting_field)  # [batch_size, sent len, word len]
     # create dataset using built-in parser from torchtext
     self.train_dataset, self.test_dataset = SequenceTaggingDataset.splits(
         path=input_folder,
         train="train.txt",
         test="test.txt",
         fields=((("word", "char"), (self.word_field, self.char_field)),
                 ("tag", self.tag_field)))
     ### END MODIFIED SECTION ###
     # convert fields to vocabulary list
     if wv_file:
         self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
         self.embedding_dim = self.wv_model.vector_size
         word_freq = {
             word: self.wv_model.wv.vocab[word].count
             for word in self.wv_model.wv.vocab
         }
         word_counter = Counter(word_freq)
         self.word_field.vocab = Vocab(word_counter, min_freq=min_word_freq)
         vectors = []
         for word, idx in self.word_field.vocab.stoi.items():
             if word in self.wv_model.wv.vocab.keys():
                 vectors.append(
                     torch.as_tensor(self.wv_model.wv[word].tolist()))
             else:
                 vectors.append(torch.zeros(self.embedding_dim))
         self.word_field.vocab.set_vectors(stoi=self.word_field.vocab.stoi,
                                           vectors=vectors,
                                           dim=self.embedding_dim)
     else:
         self.word_field.build_vocab(self.train_dataset.word,
                                     min_freq=min_word_freq)
     # build vocab for tag and characters
     self.char_field.build_vocab(self.train_dataset.char)  # NEWLY ADDED
     self.tag_field.build_vocab(self.train_dataset.tag)
     # create iterator for batch input
     self.train_iter, self.test_iter = BucketIterator.splits(
         datasets=(self.train_dataset, self.test_dataset),
         batch_size=batch_size)
     # prepare padding index to be ignored during model training/evaluation
     self.word_pad_idx = self.word_field.vocab.stoi[
         self.word_field.pad_token]
     self.char_pad_idx = self.char_field.vocab.stoi[
         self.char_field.pad_token]  # NEWLY ADDED
     self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
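A hypothetical usage sketch (not from the original project); Corpus is a placeholder class name, since only the __init__ method is shown, and the tensor shapes follow the comments above:

# Sketch only: "Corpus" and the folder path are placeholders.
corpus = Corpus(input_folder="data/ner", min_word_freq=3, batch_size=64, wv_file=None)
for batch in corpus.train_iter:
    words = batch.word   # [sent len, batch size]
    chars = batch.char   # [batch size, sent len, word len]
    tags = batch.tag     # [sent len, batch size]
    break
# corpus.word_pad_idx, corpus.char_pad_idx and corpus.tag_pad_idx can be passed to
# the embedding layers and the loss so that padding positions are ignored.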
Code Example #7
 def __init__(self, input_folder, min_word_freq, batch_size):
     # list all the fields
     self.word_field = Field(lower=True)
     self.tag_field = Field(unk_token=None)
     # create dataset using built-in parser from torchtext
     self.train_dataset, self.test_dataset = SequenceTaggingDataset.splits(
         path=input_folder,
         train="train.txt",
         test="test.txt",
         fields=(("word", self.word_field), ("tag", self.tag_field)))
     # convert fields to vocabulary list
     self.word_field.build_vocab(self.train_dataset.word,
                                 min_freq=min_word_freq)
     self.tag_field.build_vocab(self.train_dataset.tag)
     # create iterator for batch input
     self.train_iter, self.test_iter = BucketIterator.splits(
         datasets=(self.train_dataset, self.test_dataset),
         batch_size=batch_size)
     # prepare padding index to be ignored during model training/evaluation
     self.word_pad_idx = self.word_field.vocab.stoi[
         self.word_field.pad_token]
     self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
Code Example #8
    def __init__(self, config):
        self.batch_size = config.batch_size
        self.num_special_toks = 2  #for '<pad>' and '<unk>'
        self.label_type = config.label_type

        TEXT_WORD = CollField(pad_token='<pad>',
                              unk_token='<unk>',
                              tokenize=(lambda x: x),
                              sequential=True,
                              batch_first=True,
                              lower=True,
                              include_lengths=True)
        CHAR_NESTING = data.Field(pad_token='<c>',
                                  tokenize=list,
                                  batch_first=True)
        TEXT_CHAR = data.NestedField(CHAR_NESTING, include_lengths=True)
        NER_LABELS = data.Field(
            pad_token='<pad>',
            unk_token=None,
            batch_first=True,
            is_target=True,
            postprocessing=lambda arr, _: [[x - 1 for x in ex] for ex in arr])

        fields = ([(('word', 'char'),
                    (TEXT_WORD, TEXT_CHAR))] + [('ner', NER_LABELS)])

        train, val, test = SequenceTaggingDataset.splits(
            path=config.data_dir,
            train=config.train_file,
            validation=config.validation_file,
            test=config.test_file,
            separator=' ',
            fields=fields)
        train.examples = [
            ex for ex in train.examples if ex.word != [u'-DOCSTART-'.lower()]
        ]
        val.examples = [
            ex for ex in val.examples if ex.word != [u'-DOCSTART-'.lower()]
        ]
        test.examples = [
            ex for ex in test.examples if ex.word != [u'-DOCSTART-'.lower()]
        ]

        self.train_ds = train
        self.val_ds = val
        self.test_ds = test

        logging.info('Train size: %d' % (len(train)))
        logging.info('Validation size: %d' % (len(val)))
        logging.info('Test size: %d' % (len(test)))

        TEXT_CHAR.build_vocab(train.char, val.char, test.char)
        TEXT_WORD.build_vocab(train.word,
                              val.word,
                              test.word,
                              max_size=50000,
                              vectors=[GloVe(name='6B', dim='50')])

        NER_LABELS.build_vocab(train.ner)

        self.TEXT_WORD = TEXT_WORD
        self.char_vocab = TEXT_CHAR.vocab
        self.NER_LABELS = NER_LABELS

        self.labels = self.NER_LABELS.vocab.itos[1:]

        logging.info('Input word vocab size:%d' % (len(self.TEXT_WORD.vocab)))
        logging.info('Input char vocab size:%d' % (len(self.char_vocab)))
        logging.info('NER Tagset size: %d' % (len(self.labels)))

        self.sort_key = lambda x: len(x.word)
Code Example #9
File: dataloader.py Project: jnepal/nepali-ner
    def __init__(self, config, k):
        self.root_path = os.path.join(config.root_path, k)
        self.batch_size = config.batch_size
        self.device = config.device
        self.use_pos = config.use_pos

        self.txt_field = data.Field(tokenize=list,
                                    use_vocab=True,
                                    unk_token='<unk>',
                                    batch_first=True)
        self.label_field = data.Field(unk_token=None, batch_first=True)
        self.char_field = data.Field(unk_token='<unk>', sequential=False)
        self.graph_field = data.Field(unk_token='<unk>', sequential=False)

        self.fields = (('TEXT', self.txt_field), ('LABEL', self.label_field))

        if config.use_pos:
            self.pos_field = data.Field(unk_token=None, batch_first=True)
            self.fields = (('TEXT', self.txt_field), ('POS', self.pos_field),
                           ('LABEL', self.label_field))

        self.train_ds, self.val_ds, self.test_ds = SequenceTaggingDataset.splits(
            path=self.root_path,
            fields=self.fields,
            separator='\t',
            train='train.txt',
            validation='val.txt',
            test='test.txt')

        self.char_list = []
        self.graph_list = []
        for each in self.train_ds.examples + self.test_ds.examples + self.val_ds.examples:
            for x in each.TEXT:
                self.char_list += list(x)
                self.graph_list += list(grapheme_clusters(x))
        self.char_list = list(set(self.char_list))
        self.graph_list = list(set(self.graph_list))

        self.graph_list.sort()
        self.char_list.sort()

        self.char_field.build_vocab(self.char_list)
        self.graph_field.build_vocab(self.graph_list)

        self.embedding_dir = config.emb_dir
        self.vec = vocab.Vectors(name=config.emb_file,
                                 cache=self.embedding_dir)

        self.txt_field.build_vocab(self.train_ds,
                                   self.test_ds,
                                   self.val_ds,
                                   max_size=None,
                                   vectors=self.vec)
        self.label_field.build_vocab(self.train_ds.LABEL, self.test_ds.LABEL,
                                     self.val_ds.LABEL)

        if config.char_pretrained:
            self.char_vec = vocab.Vectors(name=config.char_emb_file,
                                          cache=self.embedding_dir)
            self.graph_vec = vocab.Vectors(name=config.graph_emb_file,
                                           cache=self.embedding_dir)

            self.char_field.build_vocab(self.char_list, vectors=self.char_vec)
            self.graph_field.build_vocab(self.graph_list,
                                         vectors=self.graph_vec)
        else:
            self.char_field.build_vocab(self.char_list)
            self.graph_field.build_vocab(self.graph_list)

        self.vocab_size = len(self.txt_field.vocab)
        self.tagset_size = len(self.label_field.vocab)
        self.char_vocab_size = len(self.char_field.vocab)
        self.graph_vocab_size = len(self.graph_field.vocab)

        self.weights = self.txt_field.vocab.vectors
        self.char_weights = self.char_field.vocab.vectors
        self.graph_weights = self.graph_field.vocab.vectors

        if config.use_pos:
            self.pos_field.build_vocab(self.train_ds.POS, self.test_ds.POS,
                                       self.val_ds.POS)
            # Because len(pos) = 56 and len(pos_field.vocab) = 55
            self.pos_size = len(self.pos_field.vocab) + 2
            self.pos_one_hot = np.eye(self.pos_size)
            self.one_hot_weight = torch.from_numpy(self.pos_one_hot).float()

        if config.verbose:
            self.print_stat()
Code Example #10
    def run(self):
        """Preprocess and eval the model.

        """
        # Extract Fields from a CONLL dataset file
        TEXT = torchtext.data.Field(lower=False,
                                    include_lengths=True,
                                    batch_first=True)
        LABEL = torchtext.data.Field(batch_first=True, unk_token=None)
        FIELDS = [("text", TEXT), ("label", LABEL)]
        train_data, eval_data, test_data = NoReCfine.splits(FIELDS)
        data = SequenceTaggingDataset(self.data_path,
                                      FIELDS,
                                      encoding="utf-8",
                                      separator="\t")

        # Build the vocabulary
        VOCAB_SIZE = 1_200_000
        VECTORS = Vectors(name='model.txt',
                          url='http://vectors.nlpl.eu/repository/20/58.zip')
        # Create the vocabulary for words embeddings
        TEXT.build_vocab(train_data,
                         max_size=VOCAB_SIZE,
                         vectors=VECTORS,
                         unk_init=torch.Tensor.normal_)
        LABEL.build_vocab(train_data)

        # General information
        text_length = [len(sentence) for sentence in list(data.text)]
        print(
            f"\nNumber of sentences in {self.data_path}: {len(text_length):,}")
        print(f'Number of words in {self.data_path}: {sum(text_length):,}')

        # Generate an iterator with batches of one example
        BATCH_SIZE = 1
        device = torch.device(self.device)
        iterator = torchtext.data.BucketIterator(data,
                                                 batch_size=BATCH_SIZE,
                                                 sort_within_batch=True,
                                                 device=device)

        # Loss function
        criterion = nn.CrossEntropyLoss(ignore_index=0,
                                        weight=torch.tensor([
                                            1, 0.06771941, 0.97660534,
                                            0.97719714, 0.98922782, 0.98925029
                                        ]))

        # Load the model
        model = torch.load(self.model_path)
        # Make sure the dictionary containing performances / scores is empty before running the eval method
        # model.reset()
        performance = model.evaluate(iterator, criterion, verbose=True)
        print(describe_dict(performance, sep_key=' | ', sep_val=': ',
                            pad=True))
        confusion = ConfusionMatrix(data=performance['confusion'])
        print("confusion matrix:")
        print(
            np.array2string(confusion.normalize(),
                            separator=',  ',
                            precision=3,
                            floatmode='fixed'))
Code Example #11
    def __init__(self, args):
        # list all the fields
        self.word_field = Field(lower=True)
        self.event_field = Field(unk_token=None)
        self.entity_field = Field(unk_token=None)
        self.argument_field = Field(unk_token=None)
        self.trigger_pos_field = Field(unk_token=None)
        self.char_nesting_field = Field(tokenize=list)
        self.char_field = NestedField(self.char_nesting_field)

        self.wv = args.wv_file
        # create dataset using built-in parser from torchtext
        self.train_dataset, self.val_dataset, self.test_dataset = SequenceTaggingDataset.splits(
            path=args.input_folder,
            train="train.txt",
            validation="dev.txt",
            test="test.txt",
            fields=((("word", "char"), (self.word_field, self.char_field)),
                    ("event", self.event_field), ("entity", self.entity_field),
                    ("argument", self.argument_field),
                    ("trigger_pos", self.trigger_pos_field)),
        )
        # convert fields to vocabulary list
        # self.word_field.build_vocab(self.train_dataset.word, min_freq=min_word_freq)
        self.event_field.build_vocab(self.train_dataset.event)
        # create iterator for batch input

        if args.wv_file:
            print("start loading embedding")
            self.wv_model = gensim.models.KeyedVectors.load_word2vec_format(
                args.wv_file, binary=True)
            print("done loading embedding")
            self.embedding_dim = self.wv_model.vector_size
            word_freq = {
                word: self.wv_model.wv.vocab[word].count
                for word in self.wv_model.wv.vocab
            }
            word_counter = Counter(word_freq)
            self.word_field.vocab = Vocab(word_counter,
                                          min_freq=args.min_word_freq)
            # mapping each vector/embedding from word2vec model to word_field vocabs
            vectors = []
            print("start loading vec", len(self.word_field.vocab.stoi))
            for word, idx in self.word_field.vocab.stoi.items():
                if word in self.wv_model.wv.vocab.keys():
                    vectors.append(
                        torch.as_tensor(self.wv_model.wv[word].tolist()))
                else:
                    vectors.append(torch.zeros(self.embedding_dim))
            print("done loading vec")
            del self.wv_model
            self.word_field.vocab.set_vectors(
                stoi=self.word_field.vocab.stoi,
                # list of vector embeddings, ordered according to word_field.vocab
                vectors=vectors,
                dim=self.embedding_dim)

        else:
            self.word_field.build_vocab(self.train_dataset.word,
                                        min_freq=args.min_word_freq)
        self.char_field.build_vocab(self.train_dataset.char)
        self.entity_field.build_vocab(self.train_dataset.entity)
        self.argument_field.build_vocab(self.train_dataset.argument)
        self.trigger_pos_field.build_vocab(self.train_dataset.trigger_pos)

        self.train_iter, self.val_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.val_dataset, self.test_dataset),
            batch_size=args.batch_size,
            shuffle=False,
        )

        # prepare padding index to be ignored during model training/evaluation
        self.word_pad_idx = self.word_field.vocab.stoi[
            self.word_field.pad_token]
        self.event_pad_idx = self.event_field.vocab.stoi[
            self.event_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[
            self.char_field.pad_token]
        self.entity_pad_idx = self.entity_field.vocab.stoi[
            self.entity_field.pad_token]
        self.argument_pad_idx = self.argument_field.vocab.stoi[
            self.argument_field.pad_token]
Code Example #12
def test_inference_performance():
    from sklearn.metrics import f1_score
    from torchtext.datasets import SequenceTaggingDataset
    from torchtext.data import Field, NestedField

    WORD = Field(init_token='<bos>', eos_token='<eos>')
    CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
    CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
    ENTITY = Field(init_token='<bos>', eos_token='<eos>')

    data_file = tempfile.NamedTemporaryFile(delete=True)

    # TODO Need to be decoded in Python 3
    data_file.write(requests.get(CORA_URL).content)

    fields = [(('text', 'char'),
               (WORD, CHAR))] + [(None, None)] * 22 + [('entity', ENTITY)]

    dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")

    model = Model(model_path='models/neuralParsCit')
    model.parameters['pre_emb'] = os.path.join(os.getcwd(),
                                               'vectors_with_unk.kv')
    f = model.build(training=False, **model.parameters)

    model.reload()

    word_to_id = {v: i for i, v in model.id_to_word.items()}
    char_to_id = {v: i for i, v in model.id_to_char.items()}
    tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}

    # text mode so the joined str below can be written under Python 3
    tf = tempfile.NamedTemporaryFile(mode='w', delete=False)
    tf.write("\n\n".join(
        ["\n".join(example.text) for example in dataset.examples]))
    tf.close()

    train_sentences = load_sentences(tf.name, model.parameters['lower'],
                                     model.parameters['zeros'])

    train_inputs = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                   model.parameters['lower'], True)

    preds = []

    for citation in train_inputs:
        inputs = create_input(citation, model.parameters, False)
        y_pred = np.array(f[1](*inputs))[1:-1]

        preds.append([(w, y_pred[i])
                      for i, w in enumerate(citation['str_words'])])

    assert len(preds) == len(dataset.examples)

    results = []

    for P, T in zip(preds, dataset.examples):
        for p, t in zip(P, zip(T.text, T.entity)):
            results.append((p[1], tag_to_id[t[1]]))

    pred, true = zip(*results)

    eval_metrics = {
        'micro_f1': f1_score(true, pred, average='micro'),
        'macro_f1': f1_score(true, pred, average='macro')
    }

    data_file.close()

    assert eval_metrics == pytest.approx({
        'macro_f1': 0.984,
        'micro_f1': 0.993
    },
                                         abs=0.001)
Code Example #13
File: nyt.py Project: yushu-liu/torchnlp
def nyt_ingredients_ner_dataset(batch_size,
                                use_local=False,
                                root='.data/nyt_ingredients_ner',
                                train_file='train.txt',
                                validation_file='valid.txt',
                                test_file='test.txt',
                                convert_digits=True):
    """
    nyt_ingredients_ner: New York Times ingredient tagging dataset
    Extracts the NYT ingredients dataset using torchtext. Applies GloVe 6B.200d and
    Char N-gram pretrained vectors, and also sets up a per-word character Field.
    Parameters:
        batch_size: Batch size to return from the iterators
        use_local: If True, use locally provided files (default False)
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True, digit tokens are converted to '0'

    Returns:
        A dict containing:
            task: 'nyt_ingredients.ner'
            iters: (train iter, validation iter, test iter)
            vocabs: (word vocabulary, character vocabulary, tag vocabulary)
    """

    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>",
        eos_token="<eos>",
        batch_first=True,
        lower=True,
        preprocessing=data.Pipeline(lambda w: '0'
                                    if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list,
                                     init_token="<bos>",
                                     eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>",
                                   eos_token="<eos>")

    labels = data.Field(init_token="<bos>",
                        eos_token="<eos>",
                        batch_first=True)

    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
               ('labels', labels)])

    # Load the data
    if use_local:
        train, val, test = SequenceTaggingDataset.splits(
            path=root,
            train=train_file,
            validation=validation_file,
            test=test_file,
            fields=tuple(fields))
    else:
        train, val, test = Ingredients.splits(fields=tuple(fields))

    logger.info('---------- NYT INGREDIENTS NER ---------')
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word,
                            val.inputs_word,
                            test.inputs_word,
                            max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'),
                                     CharNGram()])

    labels.build_vocab(train.labels)
    logger.info('Input vocab size:%d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'nyt_ingredients.ner',
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
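A hypothetical call to the loader above (not part of the original source); with use_local=False the Ingredients dataset is fetched automatically:

# Sketch only.
nyt = nyt_ingredients_ner_dataset(batch_size=32)
train_iter, val_iter, test_iter = nyt['iters']
word_vocab, char_vocab, tag_vocab = nyt['vocabs']
for batch in train_iter:
    words = batch.inputs_word   # [batch size, sent len] (batch_first=True)
    chars = batch.inputs_char   # [batch size, sent len, word len]
    labels = batch.labels
    break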
Code Example #14
def conll2003_dataset(tag_type, batch_size, root='./conll2003',
                      train_file='eng.train.txt',
                      validation_file='eng.testa.txt',
                      test_file='eng.testb.txt',
                      convert_digits=True):
    """
    conll2003: CoNLL 2003 (parser only; you must place the data files yourself)
    Extracts the CoNLL-2003 dataset using torchtext. Applies GloVe 6B.200d and
    Char N-gram pretrained vectors, and also sets up a per-word character Field.
    Parameters:
        tag_type: Type of tag to pick as the task [pos, chunk, ner]
        batch_size: Batch size to return from the iterators
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True, digit tokens are converted to '0'

    Returns:
        A dict containing:
            task: 'conll2003.' + tag_type
            iters: (train iter, validation iter, test iter)
            vocabs: (word vocabulary, character vocabulary, tag vocabulary)
    """

    # Setup fields with batch dimension first
    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
                             preprocessing=data.Pipeline(
                                 lambda w: '0' if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>", eos_token="<eos>")

    labels = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)

    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))] +
              [('labels', labels) if label == tag_type else (None, None)
               for label in ['pos', 'chunk', 'ner']])

    # Load the data
    train, val, test = SequenceTaggingDataset.splits(
        path=root,
        train=train_file,
        validation=validation_file,
        test=test_file,
        separator=' ',
        fields=tuple(fields))

    logger.info('---------- CONLL 2003 %s ---------' % tag_type)
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char, test.inputs_char)
    inputs_word.build_vocab(train.inputs_word, val.inputs_word, test.inputs_word, max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'), CharNGram()])

    labels.build_vocab(train.labels)
    logger.info('Input vocab size:%d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'conll2003.%s' % tag_type,
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
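A hypothetical call to conll2003_dataset (not part of the original source); the file names used are the defaults and must already exist under ./conll2003:

# Sketch only.
conll = conll2003_dataset('ner', batch_size=32)
train_iter, val_iter, test_iter = conll['iters']
word_vocab, char_vocab, tag_vocab = conll['vocabs']
print(conll['task'])  # 'conll2003.ner'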
Code Example #15
File: conll.py Project: yushu-liu/torchnlp
def conll2000_dataset(batch_size,
                      use_local=False,
                      root='.data/conll2000',
                      train_file='train.txt',
                      test_file='test.txt',
                      validation_frac=0.1,
                      convert_digits=True):
    """
    conll2000: CoNLL 2000 (chunking)
    Extracts the CoNLL-2000 chunking dataset using torchtext. By default the
    data files are fetched from the online repository.
    Applies GloVe 6B.200d and Char N-gram pretrained vectors, and also sets
    up a per-word character Field.
    Parameters:
        batch_size: Batch size to return from the iterators
        use_local: If True, use locally provided files (default False)
        root (optional): Dataset root directory (needed only if use_local is True)
        train_file (optional): Train filename (needed only if use_local is True)
        test_file (optional): Test filename (needed only if use_local is True)
        validation_frac (optional): Fraction of the train set to use for validation
        convert_digits (optional): If True, digit tokens are converted to '0'
    NOTE: Since there is only a train and a test set, a fraction of the train set
        (validation_frac, 10% by default) is held out for validation.
    Returns:
        A dict containing:
            task: 'conll2000.chunk'
            iters: (train iter, validation iter, test iter)
            vocabs: (word vocabulary, character vocabulary, tag vocabulary)
    """

    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>",
        eos_token="<eos>",
        batch_first=True,
        lower=True,
        preprocessing=data.Pipeline(lambda w: '0'
                                    if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list,
                                     init_token="<bos>",
                                     eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>",
                                   eos_token="<eos>")

    labels = data.Field(init_token="<bos>",
                        eos_token="<eos>",
                        batch_first=True)

    fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
              (None, None), ('labels', labels)]

    if use_local:
        # Load the data
        train, test = SequenceTaggingDataset.splits(path=root,
                                                    train=train_file,
                                                    test=test_file,
                                                    fields=tuple(fields))

        # HACK: Saving the sort key function as the split() call removes it
        sort_key = train.sort_key
        # To make the split deterministic
        random.seed(0)
        train, val = train.split(1 - validation_frac,
                                 random_state=random.getstate())
        # Reset the seed
        random.seed()

        # HACK: Set the sort key
        train.sort_key = sort_key
        val.sort_key = sort_key
    else:
        train, val, test = CoNLL2000Chunking.splits(
            fields=tuple(fields), validation_frac=validation_frac)

    logger.info('---------- CONLL 2000 Chunking ---------')
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word,
                            val.inputs_word,
                            test.inputs_word,
                            max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'),
                                     CharNGram()])

    labels.build_vocab(train.labels)
    logger.info('Input vocab size:%d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'conll2000.chunk',
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
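A hypothetical call to the chunking loader above (not part of the original source); with use_local=False the CoNLL-2000 files are downloaded by torchtext:

# Sketch only.
chunking = conll2000_dataset(batch_size=32)
train_iter, val_iter, test_iter = chunking['iters']
word_vocab, char_vocab, tag_vocab = chunking['vocabs']
num_tags = len(tag_vocab)  # includes the special tokens added by the label Field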