def load_af(batch_size: int, min_freq: int = 10, nl_ratio: float = .5) -> \
        Tuple[tt.Iterator, tt.Iterator, tt.Iterator, tt.Field, tt.Field]:
    """
    Loads the Afrikaans data, augmented with Dutch data.

    :param batch_size: The size of the mini-batches
    :param min_freq: A word will only be added to the vocabulary if it occurs
        at least this many times in the data
    :param nl_ratio: The fraction of the training data that will be Dutch
    :return: Iterators for the three datasets, along with the Fields for words
        and POS tags. Only the training data will contain Dutch examples
    """
    if not 0 <= nl_ratio <= 1:
        raise ValueError("nl_ratio must be between 0 and 1")

    # Prepare fields
    text_field = tt.Field(init_token="<bos>", eos_token="<eos>", lower=True)
    tags_field = tt.Field(init_token="<bos>", eos_token="<eos>", unk_token=None)
    fields = (("text", text_field), ("udtags", tags_field))

    # Load data
    af = list(SequenceTaggingDataset.splits(path="data/af", fields=fields,
                                            train="train.txt", test="test.txt",
                                            validation="dev.txt"))
    nl_train = SequenceTaggingDataset("data/nl/nl.txt", fields)

    # Add Dutch examples
    max_nl_ratio = len(nl_train) / (len(af[0]) + len(nl_train))
    if nl_ratio <= max_nl_ratio:
        num_nl_examples = int(nl_ratio * len(af[0]) / (1. - nl_ratio))
        af[0].examples += nl_train.examples[:num_nl_examples]
    else:
        num_af_examples = int(len(nl_train) * (1. - nl_ratio) / nl_ratio)
        af[0].examples = af[0].examples[:num_af_examples] + nl_train.examples

    # Build vocab
    text_field.build_vocab(*af, min_freq=min_freq)
    tags_field.build_vocab(*af)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    iters = tt.BucketIterator.splits(af, batch_size=batch_size, device=device)
    return iters + (text_field, tags_field)
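# --- Usage sketch (not part of the original): assumes the data/af and
# data/nl directories exist in the layout the loader expects, and that `tt`
# aliases torchtext.data as in the function above ---
train_iter, val_iter, test_iter, text_field, tags_field = load_af(
    batch_size=32, min_freq=5, nl_ratio=0.3)
for batch in train_iter:
    # with the default batch_first=False, batch.text and batch.udtags are
    # [sent len, batch size] LongTensors
    words, tags = batch.text, batch.udtags
    break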
def load_data(self):
    """Load data from file using torchtext."""
    if self.test:
        # built-in datasets
        if self.prefix == 'udpos':
            self.train_set, self.valid_set, self.test_set = UDPOS.splits(
                fields=((('text', 'char'), (self.text_field, self.char_field)),
                        ('tag', self.tag_field),
                        ('pos', None)),
                root=self.data_path)
        elif self.prefix == 'conll2000':
            self.train_set, self.valid_set, self.test_set = CoNLL2000Chunking.splits(
                fields=((('text', 'char'), (self.text_field, self.char_field)),
                        ('pos', None),
                        ('tag', self.tag_field)),
                root=self.data_path)
    else:
        # load datasets from pre-prepared tsv files
        self.train_set, self.valid_set, self.test_set = SequenceTaggingDataset.splits(
            fields=((('text', 'char'), (self.text_field, self.char_field)),
                    ('tag', self.tag_field)),
            path=self.data_path + '/{}'.format(self.prefix),
            train='train.tsv', validation='dev.tsv', test='test.tsv')
def get_dataset(self, path: str, fields=Fields, separator='\t'):
    logger.info('loading dataset from {}'.format(path))
    st_dataset = SequenceTaggingDataset(path, fields=fields,
                                        separator=separator)
    logger.info('finished loading dataset')
    return st_dataset
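# --- Usage sketch (assumption: `Fields` is a module-level default defined
# elsewhere in the project; the pair below is only a plausible shape for it,
# and `loader` stands in for an instance of the enclosing class) ---
TEXT = Field(lower=True)
TAGS = Field(unk_token=None)
Fields = [('text', TEXT), ('tags', TAGS)]
dataset = loader.get_dataset('data/train.tsv')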
def get_dataset(base_path, batch_size, pretrained_embedding=None,
                is_inference=False):
    sentence = data.Field(lower=False, include_lengths=True, batch_first=True)
    char_nesting = data.Field(lower=False, tokenize=list)
    char_sentence = data.NestedField(char_nesting, include_lengths=True)
    tags = data.Field(batch_first=True)
    train, val, test = SequenceTaggingDataset.splits(
        path=base_path,
        train="train.txt", validation="dev.txt", test="test.txt",
        fields=[(("sentence", "char_sentence"), (sentence, char_sentence)),
                ("tags", tags)])
    tags.build_vocab(train.tags)
    if not pretrained_embedding:
        sentence.build_vocab(train.sentence, min_freq=5)
    else:
        sentence.build_vocab(train.sentence, vectors=pretrained_embedding)
    char_sentence.build_vocab(train.char_sentence)
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), [batch_size] * 3,
        repeat=False, shuffle=True,
        sort_key=lambda x: len(x.sentence), sort_within_batch=True)
    # NOTE: the iterators are returned in the order (val, train, test)
    return sentence, char_sentence, tags, val_iter, train_iter, test_iter
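# --- Usage sketch (assumption: base_path contains train/dev/test .txt files
# in CoNLL-style columns). Note the unpack order matches the function's
# return order, with val_iter before train_iter ---
sentence, char_sentence, tags, val_iter, train_iter, test_iter = get_dataset(
    "data/conll", batch_size=64)
for batch in train_iter:
    # include_lengths=True makes batch.sentence a (tensor, lengths) pair
    tokens, lengths = batch.sentence
    labels = batch.tags
    break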
def __init__(self, path='', glove_name='6B', glove_dim=300):
    fields = [
        ('text', Field(include_lengths=True, sequential=True)),
        ('label', Field(is_target=True,
                        postprocessing=lambda X, voc: [x[0] - 2 for x in X]))
    ]
    self.train_set, self.dev_set = SequenceTaggingDataset.splits(
        path=path, train='train.tsv', validation='dev.tsv', fields=fields)
    self.fields = dict(fields)
    self.fields['text'].build_vocab(
        self.train_set, self.dev_set,
        vectors=vocab.GloVe(name=glove_name, dim=glove_dim))
    self.fields['label'].build_vocab(self.train_set, specials=[])
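# --- Usage sketch (assumption: the __init__ above belongs to a class named,
# say, SSTCorpus — the name is hypothetical). Shows turning the GloVe
# vectors attached to the text vocab into an embedding layer ---
import torch.nn as nn

corpus = SSTCorpus(path='data/sst')
vectors = corpus.fields['text'].vocab.vectors  # [vocab size, glove_dim]
embedding = nn.Embedding.from_pretrained(vectors, freeze=False)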
def __init__(self, input_folder, min_word_freq, batch_size, wv_file=None):
    # list all the fields
    self.word_field = Field(lower=True)  # [sent len, batch_size]
    self.tag_field = Field(unk_token=None)  # [sent len, batch_size]
    ### BEGIN MODIFIED SECTION: CHARACTER EMBEDDING ###
    self.char_nesting_field = Field(tokenize=list)
    self.char_field = NestedField(
        self.char_nesting_field)  # [batch_size, sent len, word len]
    # create dataset using built-in parser from torchtext
    self.train_dataset, self.test_dataset = SequenceTaggingDataset.splits(
        path=input_folder,
        train="train.txt",
        test="test.txt",
        fields=((("word", "char"), (self.word_field, self.char_field)),
                ("tag", self.tag_field)))
    ### END MODIFIED SECTION ###
    # convert fields to vocabulary list
    if wv_file:
        self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
        self.embedding_dim = self.wv_model.vector_size
        word_freq = {
            word: self.wv_model.wv.vocab[word].count
            for word in self.wv_model.wv.vocab
        }
        word_counter = Counter(word_freq)
        self.word_field.vocab = Vocab(word_counter, min_freq=min_word_freq)
        vectors = []
        for word, idx in self.word_field.vocab.stoi.items():
            if word in self.wv_model.wv.vocab.keys():
                vectors.append(
                    torch.as_tensor(self.wv_model.wv[word].tolist()))
            else:
                vectors.append(torch.zeros(self.embedding_dim))
        self.word_field.vocab.set_vectors(stoi=self.word_field.vocab.stoi,
                                          vectors=vectors,
                                          dim=self.embedding_dim)
    else:
        self.word_field.build_vocab(self.train_dataset.word,
                                    min_freq=min_word_freq)
    # build vocab for tag and characters
    self.char_field.build_vocab(self.train_dataset.char)  # NEWLY ADDED
    self.tag_field.build_vocab(self.train_dataset.tag)
    # create iterator for batch input
    self.train_iter, self.test_iter = BucketIterator.splits(
        datasets=(self.train_dataset, self.test_dataset),
        batch_size=batch_size)
    # prepare padding index to be ignored during model training/evaluation
    self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
    self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]  # NEWLY ADDED
    self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
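# --- Usage sketch (assumption: the __init__ above belongs to a class named
# Corpus — the name is hypothetical) ---
corpus = Corpus(input_folder="data/ner", min_word_freq=3, batch_size=64)
for batch in corpus.train_iter:
    words = batch.word   # [sent len, batch size]
    chars = batch.char   # [batch size, sent len, word len] (NestedField)
    tags = batch.tag     # [sent len, batch size]
    break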
def __init__(self, input_folder, min_word_freq, batch_size):
    # list all the fields
    self.word_field = Field(lower=True)
    self.tag_field = Field(unk_token=None)
    # create dataset using built-in parser from torchtext
    self.train_dataset, self.test_dataset = SequenceTaggingDataset.splits(
        path=input_folder,
        train="train.txt",
        test="test.txt",
        fields=(("word", self.word_field), ("tag", self.tag_field)))
    # convert fields to vocabulary list
    self.word_field.build_vocab(self.train_dataset.word,
                                min_freq=min_word_freq)
    self.tag_field.build_vocab(self.train_dataset.tag)
    # create iterator for batch input
    self.train_iter, self.test_iter = BucketIterator.splits(
        datasets=(self.train_dataset, self.test_dataset),
        batch_size=batch_size)
    # prepare padding index to be ignored during model training/evaluation
    self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
    self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
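# --- Usage sketch: the pad index prepared above is typically handed to the
# loss so that padding positions do not contribute to the gradient (assumes
# `corpus` is an instance of the class this __init__ belongs to) ---
import torch.nn as nn

criterion = nn.CrossEntropyLoss(ignore_index=corpus.tag_pad_idx)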
def __init__(self, config):
    self.batch_size = config.batch_size
    self.num_special_toks = 2  # for '<pad>' and '<unk>'
    self.label_type = config.label_type
    TEXT_WORD = CollField(pad_token='<pad>',
                          unk_token='<unk>',
                          tokenize=(lambda x: x),
                          sequential=True,
                          batch_first=True,
                          lower=True,
                          include_lengths=True)
    CHAR_NESTING = data.Field(pad_token='<c>', tokenize=list, batch_first=True)
    TEXT_CHAR = data.NestedField(CHAR_NESTING, include_lengths=True)
    NER_LABELS = data.Field(
        pad_token='<pad>',
        unk_token=None,
        batch_first=True,
        is_target=True,
        postprocessing=lambda arr, _: [[x - 1 for x in ex] for ex in arr])
    fields = ([(('word', 'char'), (TEXT_WORD, TEXT_CHAR))] +
              [('ner', NER_LABELS)])
    train, val, test = SequenceTaggingDataset.splits(
        path=config.data_dir,
        train=config.train_file,
        validation=config.validation_file,
        test=config.test_file,
        separator=' ',
        fields=fields)
    # drop the CoNLL document-boundary markers
    train.examples = [
        ex for ex in train.examples if ex.word != [u'-DOCSTART-'.lower()]
    ]
    val.examples = [
        ex for ex in val.examples if ex.word != [u'-DOCSTART-'.lower()]
    ]
    test.examples = [
        ex for ex in test.examples if ex.word != [u'-DOCSTART-'.lower()]
    ]
    self.train_ds = train
    self.val_ds = val
    self.test_ds = test
    logging.info('Train size: %d' % (len(train)))
    logging.info('Validation size: %d' % (len(val)))
    logging.info('Test size: %d' % (len(test)))
    TEXT_CHAR.build_vocab(train.char, val.char, test.char)
    TEXT_WORD.build_vocab(train.word, val.word, test.word,
                          max_size=50000,
                          vectors=[GloVe(name='6B', dim='50')])
    NER_LABELS.build_vocab(train.ner)
    self.TEXT_WORD = TEXT_WORD
    self.char_vocab = TEXT_CHAR.vocab
    self.NER_LABELS = NER_LABELS
    self.labels = self.NER_LABELS.vocab.itos[1:]
    logging.info('Input word vocab size: %d' % (len(self.TEXT_WORD.vocab)))
    logging.info('Input char vocab size: %d' % (len(self.char_vocab)))
    logging.info('NER Tagset size: %d' % (len(self.labels)))
    self.sort_key = lambda x: len(x.word)
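# --- Usage sketch (assumption: `loader` is an instance of the class above;
# the class stores the datasets and a sort_key but builds no iterators
# itself, so that is left to the caller) ---
train_iter = data.BucketIterator(loader.train_ds,
                                 batch_size=loader.batch_size,
                                 sort_key=loader.sort_key,
                                 shuffle=True)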
def __init__(self, config, k):
    self.root_path = os.path.join(config.root_path, k)
    self.batch_size = config.batch_size
    self.device = config.device
    self.use_pos = config.use_pos
    self.txt_field = data.Field(tokenize=list,
                                use_vocab=True,
                                unk_token='<unk>',
                                batch_first=True)
    self.label_field = data.Field(unk_token=None, batch_first=True)
    self.char_field = data.Field(unk_token='<unk>', sequential=False)
    self.graph_field = data.Field(unk_token='<unk>', sequential=False)
    self.fields = (('TEXT', self.txt_field), ('LABEL', self.label_field))
    if config.use_pos:
        self.pos_field = data.Field(unk_token=None, batch_first=True)
        self.fields = (('TEXT', self.txt_field), ('POS', self.pos_field),
                       ('LABEL', self.label_field))
    self.train_ds, self.val_ds, self.test_ds = SequenceTaggingDataset.splits(
        path=self.root_path,
        fields=self.fields,
        separator='\t',
        train='train.txt',
        validation='val.txt',
        test='test.txt')
    # collect the character and grapheme-cluster inventories over all splits
    self.char_list = []
    self.graph_list = []
    for each in (self.train_ds.examples + self.test_ds.examples +
                 self.val_ds.examples):
        for x in each.TEXT:
            self.char_list += list(x)
            self.graph_list += list(grapheme_clusters(x))
    self.char_list = sorted(set(self.char_list))
    self.graph_list = sorted(set(self.graph_list))
    self.char_field.build_vocab(self.char_list)
    self.graph_field.build_vocab(self.graph_list)
    self.embedding_dir = config.emb_dir
    self.vec = vocab.Vectors(name=config.emb_file, cache=self.embedding_dir)
    self.txt_field.build_vocab(self.train_ds, self.test_ds, self.val_ds,
                               max_size=None, vectors=self.vec)
    self.label_field.build_vocab(self.train_ds.LABEL, self.test_ds.LABEL,
                                 self.val_ds.LABEL)
    if config.char_pretrained:
        self.char_vec = vocab.Vectors(name=config.char_emb_file,
                                      cache=self.embedding_dir)
        self.graph_vec = vocab.Vectors(name=config.graph_emb_file,
                                       cache=self.embedding_dir)
        self.char_field.build_vocab(self.char_list, vectors=self.char_vec)
        self.graph_field.build_vocab(self.graph_list, vectors=self.graph_vec)
    else:
        self.char_field.build_vocab(self.char_list)
        self.graph_field.build_vocab(self.graph_list)
    self.vocab_size = len(self.txt_field.vocab)
    self.tagset_size = len(self.label_field.vocab)
    self.char_vocab_size = len(self.char_field.vocab)
    self.graph_vocab_size = len(self.graph_field.vocab)
    self.weights = self.txt_field.vocab.vectors
    self.char_weights = self.char_field.vocab.vectors
    self.graph_weights = self.graph_field.vocab.vectors
    if config.use_pos:
        self.pos_field.build_vocab(self.train_ds.POS, self.test_ds.POS,
                                   self.val_ds.POS)
        # Because len(pos) = 56 and len(pos_field.vocab) = 55
        self.pos_size = len(self.pos_field.vocab) + 2
        self.pos_one_hot = np.eye(self.pos_size)
        self.one_hot_weight = torch.from_numpy(self.pos_one_hot).float()
    if config.verbose:
        self.print_stat()
def run(self):
    """Preprocess and eval the model."""
    # Extract Fields from a CONLL dataset file
    TEXT = torchtext.data.Field(lower=False, include_lengths=True,
                                batch_first=True)
    LABEL = torchtext.data.Field(batch_first=True, unk_token=None)
    FIELDS = [("text", TEXT), ("label", LABEL)]
    train_data, eval_data, test_data = NoReCfine.splits(FIELDS)
    data = SequenceTaggingDataset(self.data_path, FIELDS,
                                  encoding="utf-8", separator="\t")

    # Build the vocabulary
    VOCAB_SIZE = 1_200_000
    VECTORS = Vectors(name='model.txt',
                      url='http://vectors.nlpl.eu/repository/20/58.zip')

    # Create the vocabulary for word embeddings
    TEXT.build_vocab(train_data,
                     max_size=VOCAB_SIZE,
                     vectors=VECTORS,
                     unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(train_data)

    # General information
    text_length = [len(sentence) for sentence in list(data.text)]
    print(f"\nNumber of sentences in {self.data_path}: {len(text_length):,}")
    print(f"Number of words in {self.data_path}: {sum(text_length):,}")

    # Generate an iterator of one-example batches
    BATCH_SIZE = 1
    device = torch.device(self.device)
    iterator = torchtext.data.BucketIterator(data,
                                             batch_size=BATCH_SIZE,
                                             sort_within_batch=True,
                                             device=device)

    # Loss function
    criterion = nn.CrossEntropyLoss(
        ignore_index=0,
        weight=torch.tensor([
            1, 0.06771941, 0.97660534, 0.97719714, 0.98922782, 0.98925029
        ]))

    # Load the model
    model = torch.load(self.model_path)

    # Make sure the dictionary containing performances / scores is empty
    # before running the eval method
    # model.reset()

    performance = model.evaluate(iterator, criterion, verbose=True)
    print(describe_dict(performance, sep_key=' | ', sep_val=': ', pad=True))

    confusion = ConfusionMatrix(data=performance['confusion'])
    print("confusion matrix:")
    print(np.array2string(confusion.normalize(),
                          separator=', ',
                          precision=3,
                          floatmode='fixed'))
def __init__(self, args):
    # list all the fields
    self.word_field = Field(lower=True)
    self.event_field = Field(unk_token=None)
    self.entity_field = Field(unk_token=None)
    self.argument_field = Field(unk_token=None)
    self.trigger_pos_field = Field(unk_token=None)
    self.char_nesting_field = Field(tokenize=list)
    self.char_field = NestedField(self.char_nesting_field)
    self.wv = args.wv_file
    # create dataset using built-in parser from torchtext
    self.train_dataset, self.val_dataset, self.test_dataset = SequenceTaggingDataset.splits(
        path=args.input_folder,
        train="train.txt",
        validation="dev.txt",
        test="test.txt",
        fields=((("word", "char"), (self.word_field, self.char_field)),
                ("event", self.event_field),
                ("entity", self.entity_field),
                ("argument", self.argument_field),
                ("trigger_pos", self.trigger_pos_field)),
    )
    # convert fields to vocabulary list
    # self.word_field.build_vocab(self.train_dataset.word, min_freq=min_word_freq)
    self.event_field.build_vocab(self.train_dataset.event)
    if args.wv_file:
        print("start loading embedding")
        self.wv_model = gensim.models.KeyedVectors.load_word2vec_format(
            args.wv_file, binary=True)
        print("done loading embedding")
        self.embedding_dim = self.wv_model.vector_size
        word_freq = {
            word: self.wv_model.wv.vocab[word].count
            for word in self.wv_model.wv.vocab
        }
        word_counter = Counter(word_freq)
        self.word_field.vocab = Vocab(word_counter,
                                      min_freq=args.min_word_freq)
        # map each vector/embedding from the word2vec model to the
        # word_field vocab; words missing from the model get zero vectors
        vectors = []
        print("start loading vec", len(self.word_field.vocab.stoi))
        for word, idx in self.word_field.vocab.stoi.items():
            if word in self.wv_model.wv.vocab.keys():
                vectors.append(
                    torch.as_tensor(self.wv_model.wv[word].tolist()))
            else:
                vectors.append(torch.zeros(self.embedding_dim))
        print("done loading vec")
        del self.wv_model
        self.word_field.vocab.set_vectors(
            stoi=self.word_field.vocab.stoi,
            # list of vector embeddings, ordered according to word_field.vocab
            vectors=vectors,
            dim=self.embedding_dim)
    else:
        self.word_field.build_vocab(self.train_dataset.word,
                                    min_freq=args.min_word_freq)
    self.char_field.build_vocab(self.train_dataset.char)
    self.entity_field.build_vocab(self.train_dataset.entity)
    self.argument_field.build_vocab(self.train_dataset.argument)
    self.trigger_pos_field.build_vocab(self.train_dataset.trigger_pos)
    # create iterator for batch input
    self.train_iter, self.val_iter, self.test_iter = BucketIterator.splits(
        datasets=(self.train_dataset, self.val_dataset, self.test_dataset),
        batch_size=args.batch_size,
        shuffle=False,
    )
    # prepare padding index to be ignored during model training/evaluation
    self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
    self.event_pad_idx = self.event_field.vocab.stoi[self.event_field.pad_token]
    self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]
    self.entity_pad_idx = self.entity_field.vocab.stoi[self.entity_field.pad_token]
    self.argument_pad_idx = self.argument_field.vocab.stoi[self.argument_field.pad_token]
def test_inference_performance():
    from sklearn.metrics import f1_score
    from torchtext.datasets import SequenceTaggingDataset
    from torchtext.data import Field, NestedField

    WORD = Field(init_token='<bos>', eos_token='<eos>')
    CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
    CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
    ENTITY = Field(init_token='<bos>', eos_token='<eos>')

    # the temporary file is opened in binary mode, so the raw response
    # content (bytes) can be written directly
    data_file = tempfile.NamedTemporaryFile(delete=True)
    data_file.write(requests.get(CORA_URL).content)
    data_file.flush()

    fields = [(('text', 'char'), (WORD, CHAR))] + \
             [(None, None)] * 22 + [('entity', ENTITY)]
    dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")

    model = Model(model_path='models/neuralParsCit')
    model.parameters['pre_emb'] = os.path.join(os.getcwd(),
                                               'vectors_with_unk.kv')
    f = model.build(training=False, **model.parameters)
    model.reload()

    word_to_id = {v: i for i, v in model.id_to_word.items()}
    char_to_id = {v: i for i, v in model.id_to_char.items()}
    tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}

    # text mode so the joined string can be written under Python 3
    tf = tempfile.NamedTemporaryFile(mode='w', delete=False)
    tf.write("\n\n".join(
        ["\n".join(example.text) for example in dataset.examples]))
    tf.close()
    train_sentences = load_sentences(tf.name, model.parameters['lower'],
                                     model.parameters['zeros'])
    train_inputs = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                   model.parameters['lower'], True)

    preds = []
    for citation in train_inputs:
        inputs = create_input(citation, model.parameters, False)
        y_pred = np.array(f[1](*inputs))[1:-1]
        preds.append([(w, y_pred[i])
                      for i, w in enumerate(citation['str_words'])])

    assert len(preds) == len(dataset.examples)

    results = []
    for P, T in zip(preds, dataset.examples):
        for p, t in zip(P, zip(T.text, T.entity)):
            results.append((p[1], tag_to_id[t[1]]))

    pred, true = zip(*results)
    eval_metrics = {
        'micro_f1': f1_score(true, pred, average='micro'),
        'macro_f1': f1_score(true, pred, average='macro')
    }
    data_file.close()
    assert eval_metrics == pytest.approx({
        'macro_f1': 0.984,
        'micro_f1': 0.993
    }, abs=0.001)
def nyt_ingredients_ner_dataset(batch_size, use_local=False,
                                root='.data/nyt_ingredients_ner',
                                train_file='train.txt',
                                validation_file='valid.txt',
                                test_file='test.txt',
                                convert_digits=True):
    """
    nyt_ingredients_ner: New York Times ingredient tagging dataset

    Extract the NYT ingredients dataset using torchtext. Applies GloVe
    6B.200d and Char N-gram pretrained vectors. Also sets up a per-word
    character Field.

    Parameters:
        batch_size: Batch size to return from iterator
        use_local: If True use local provided files (default False)
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True will convert numbers to single 0's

    Returns:
        A dict containing:
            task: 'nyt_ingredients.ner'
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary,
                     Tag vocabulary)
    """
    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
        preprocessing=data.Pipeline(
            lambda w: '0' if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list,
                                     init_token="<bos>",
                                     eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>",
                                   eos_token="<eos>")

    labels = data.Field(init_token="<bos>", eos_token="<eos>",
                        batch_first=True)

    fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
              ('labels', labels)]

    # Load the data
    if use_local:
        train, val, test = SequenceTaggingDataset.splits(
            path=root,
            train=train_file,
            validation=validation_file,
            test=test_file,
            fields=tuple(fields))
    else:
        train, val, test = Ingredients.splits(fields=tuple(fields))

    logger.info('---------- NYT INGREDIENTS NER ---------')
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word, val.inputs_word,
                            test.inputs_word, max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'),
                                     CharNGram()])
    labels.build_vocab(train.labels)
    logger.info('Input vocab size: %d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'nyt_ingredients.ner',
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
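# --- Usage sketch of the returned dict (the conll2003_dataset and
# conll2000_dataset loaders below return the same shape) ---
ds = nyt_ingredients_ner_dataset(batch_size=32)
train_iter, val_iter, test_iter = ds['iters']
word_vocab, char_vocab, tag_vocab = ds['vocabs']
for batch in train_iter:
    # batch_first=True: batch.inputs_word is [batch size, sent len] and
    # batch.inputs_char is [batch size, sent len, word len]
    x_word, x_char, y = batch.inputs_word, batch.inputs_char, batch.labels
    break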
def conll2003_dataset(tag_type, batch_size, root='./conll2003',
                      train_file='eng.train.txt',
                      validation_file='eng.testa.txt',
                      test_file='eng.testb.txt',
                      convert_digits=True):
    """
    conll2003: CoNLL 2003 (parser only; you must place the files yourself)

    Extract the CoNLL 2003 dataset using torchtext. Applies GloVe 6B.200d and
    Char N-gram pretrained vectors. Also sets up a per-word character Field.

    Parameters:
        tag_type: Type of tag to pick as task [pos, chunk, ner]
        batch_size: Batch size to return from iterator
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True will convert numbers to single 0's

    Returns:
        A dict containing:
            task: 'conll2003.' + tag_type
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary,
                     Tag vocabulary)
    """
    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
        preprocessing=data.Pipeline(
            lambda w: '0' if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list,
                                     init_token="<bos>",
                                     eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>",
                                   eos_token="<eos>")

    labels = data.Field(init_token="<bos>", eos_token="<eos>",
                        batch_first=True)

    # keep only the label column matching tag_type; the other columns are
    # skipped via (None, None)
    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))] +
              [('labels', labels) if label == tag_type else (None, None)
               for label in ['pos', 'chunk', 'ner']])

    # Load the data
    train, val, test = SequenceTaggingDataset.splits(
        path=root,
        train=train_file,
        validation=validation_file,
        test=test_file,
        separator=' ',
        fields=tuple(fields))

    logger.info('---------- CONLL 2003 %s ---------' % tag_type)
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word, val.inputs_word,
                            test.inputs_word, max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'),
                                     CharNGram()])
    labels.build_vocab(train.labels)
    logger.info('Input vocab size: %d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'conll2003.%s' % tag_type,
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
def conll2000_dataset(batch_size, use_local=False, root='.data/conll2000',
                      train_file='train.txt',
                      test_file='test.txt',
                      validation_frac=0.1,
                      convert_digits=True):
    """
    conll2000: CoNLL 2000 (chunking)

    Extract the CoNLL 2000 chunking dataset using torchtext. By default it
    fetches the data files from the online repository. Applies GloVe 6B.200d
    and Char N-gram pretrained vectors. Also sets up a per-word character
    Field.

    Parameters:
        batch_size: Batch size to return from iterator
        use_local: If True use local provided files (default False)
        root (optional): Dataset root directory (needed only if use_local is True)
        train_file (optional): Train filename (needed only if use_local is True)
        test_file (optional): Test filename (needed only if use_local is True)
        validation_frac (optional): Fraction of train dataset to use for validation
        convert_digits (optional): If True will convert numbers to single 0's

    NOTE: Since there is only a train and a test set, 10% of the train set is
    used for validation by default.

    Returns:
        A dict containing:
            task: 'conll2000.chunk'
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary,
                     Tag vocabulary)
    """
    # Setup fields with batch dimension first
    inputs_word = data.Field(
        init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
        preprocessing=data.Pipeline(
            lambda w: '0' if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list,
                                     init_token="<bos>",
                                     eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>",
                                   eos_token="<eos>")

    labels = data.Field(init_token="<bos>", eos_token="<eos>",
                        batch_first=True)

    fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
              (None, None), ('labels', labels)]

    if use_local:
        # Load the data
        train, test = SequenceTaggingDataset.splits(path=root,
                                                    train=train_file,
                                                    test=test_file,
                                                    fields=tuple(fields))

        # HACK: Saving the sort key function as the split() call removes it
        sort_key = train.sort_key

        # To make the split deterministic
        random.seed(0)
        train, val = train.split(1 - validation_frac,
                                 random_state=random.getstate())
        # Reset the seed
        random.seed()

        # HACK: Set the sort key
        train.sort_key = sort_key
        val.sort_key = sort_key
    else:
        train, val, test = CoNLL2000Chunking.splits(
            fields=tuple(fields), validation_frac=validation_frac)

    logger.info('---------- CONLL 2000 Chunking ---------')
    logger.info('Train size: %d' % (len(train)))
    logger.info('Validation size: %d' % (len(val)))
    logger.info('Test size: %d' % (len(test)))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word, val.inputs_word,
                            test.inputs_word, max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'),
                                     CharNGram()])
    labels.build_vocab(train.labels)
    logger.info('Input vocab size: %d' % (len(inputs_word.vocab)))
    logger.info('Tagset size: %d' % (len(labels.vocab)))

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'conll2000.chunk',
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }