Example #1
def __init__(self) -> None:
    super().__init__(lazy=False)
    # TODO: could become args
    self._token_indexers = {
        'tokens': SingleIdTokenIndexer()
    }  # 'tokens' is the namespace we're using
    self._tokenizer = CharacterTokenizer()
Example #2
    def __init__(
        self,
        namespace: str = "token_characters",
        character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        min_padding_length: int = 0,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        if min_padding_length == 0:
            url = "https://github.com/allenai/allennlp/issues/1954"
            warnings.warn(
                "You are using the default value (0) of `min_padding_length`, "
                f"which can cause some subtle bugs (more info see {url}). "
                "Strongly recommend to set a value, usually the maximum size "
                "of the convolutional layer size when using CnnEncoder.",
                UserWarning,
            )
        self._min_padding_length = min_padding_length
        self._namespace = namespace
        self._character_tokenizer = character_tokenizer

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
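The warning above points at min_padding_length; a minimal sketch (not taken from any of the examples here, and the value 5 is only an assumption) of constructing the indexer with that argument set to the widest CnnEncoder filter size:

from allennlp.data.token_indexers import TokenCharactersIndexer

# 5 is an assumed value; it should match the largest ngram filter size of the
# CnnEncoder that will consume these character ids
char_indexer = TokenCharactersIndexer(namespace="token_characters",
                                      min_padding_length=5)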
Example #3
def __init__(
    self,
    namespace: str = 'token_characters',
    character_tokenizer: CharacterTokenizer = CharacterTokenizer()
) -> None:
    self._namespace = namespace
    self._character_tokenizer = character_tokenizer
Example #4
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy=lazy)
    self.tokenizer = CharacterTokenizer()
    self.token_indexers = token_indexers or {
        'tokens': SingleIdTokenIndexer()
    }
Example #5
    def __init__(
            self,
            namespace: str = 'token_characters',
            character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
            start_tokens: List[str] = None,
            end_tokens: List[str] = None) -> None:
        self._namespace = namespace
        self._character_tokenizer = character_tokenizer

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
Example #6
def classify(text: str, model: LstmClassifier):
    tokenizer = CharacterTokenizer()
    token_indexers = {'tokens': SingleIdTokenIndexer()}

    tokens = tokenizer.tokenize(text)
    instance = Instance({'tokens': TextField(tokens, token_indexers)})
    logits = model.forward_on_instances([instance])[0]['logits']
    label_id = np.argmax(logits)
    label = model.vocab.get_token_from_index(label_id, 'labels')

    print('text: {}, label: {}'.format(text, label))
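For reference, a minimal sketch (assuming a standard AllenNLP install) of what the CharacterTokenizer used above produces: every character of the input string, including spaces and punctuation, becomes its own Token.

from allennlp.data.tokenizers import CharacterTokenizer

tokenizer = CharacterTokenizer()
print([t.text for t in tokenizer.tokenize("Hi there!")])
# ['H', 'i', ' ', 't', 'h', 'e', 'r', 'e', '!']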
Example #7
def main():
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=CharacterTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read('data/mt/tatoeba.eng_cmn.train.tsv')
    validation_dataset = reader.read('data/mt/tatoeba.eng_cmn.dev.tsv')

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 3, 'target_tokens': 3})

    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
    # encoder = PytorchSeq2SeqWrapper(
    #     torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM,
                                          hidden_dim=HIDDEN_DIM,
                                          projection_dim=128,
                                          feedforward_hidden_dim=128,
                                          num_layers=1,
                                          num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

    # attention = LinearAttention(HIDDEN_DIM, HIDDEN_DIM, activation=Activation.by_name('tanh')())
    # attention = BilinearAttention(HIDDEN_DIM, HIDDEN_DIM)
    attention = DotProductAttention()

    max_decoding_steps = 20   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True)
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)

        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
Example #8
    def test_count_vocab_items_respects_casing(self):
        indexer = TokenCharactersIndexer("characters")
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["characters"] == {"h": 1, "H": 1, "e": 2, "l": 4, "o": 2}

        indexer = TokenCharactersIndexer("characters", CharacterTokenizer(lowercase_characters=True))
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["characters"] == {"h": 2, "e": 2, "l": 4, "o": 2}
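The second half of the test relies on the tokenizer's lowercase_characters flag; a minimal sketch of its effect in isolation (not part of the original test):

from allennlp.data.tokenizers import CharacterTokenizer

lower = CharacterTokenizer(lowercase_characters=True)
print([t.text for t in lower.tokenize("Hello")])  # ['h', 'e', 'l', 'l', 'o']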
Example #9
def read_shakespeare(all_chars: Set[str] = None) -> List[List[Token]]:
    tokenizer = CharacterTokenizer()
    sentences = []
    with open('data/shakespeare/hamlet.txt') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            line = re.sub(' +', ' ', line)
            tokens = tokenizer.tokenize(line)
            if all_chars:
                tokens = [token for token in tokens if token.text in all_chars]
            sentences.append(tokens)

    return sentences
Example #10
def get_squad_from_rawfile(squad_file_path, need_char=False):
    '''
    Args:
        squad_file_path -- path to a raw squad.json file
        need_char -- whether to also attach character-level tokens to each item
    Returns:
        dataset -- a list of SquadData objects, each holding the tokenized
                   passage, question, span_start and span_end
    '''
    namespace_word = "word2idx"
    token_indexers = {
            "tokens": SingleIdTokenIndexer(namespace=namespace_word)}
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(squad_file_path)
    dataset = []
    for instance in instances:
        passage = instance.fields['passage'].tokens
        passage = [word.text for word in passage]
        question = instance.fields['question'].tokens
        question = [word.text for word in question]
        span_start = instance.fields['span_start'].sequence_index
        span_end = instance.fields['span_end'].sequence_index
        item = SquadData(passage, question, span_start, span_end)
        meta = instance.fields['metadata'].metadata
        item.question_id = meta['question_id']
        item.passage_text = meta['original_passage']
        item.question_text = meta['question_text']
        item.answer_text = meta['answer_texts'][0]
        item.passage_token_offsets = meta['token_offsets']
        dataset.append(item)
    print ("read {} datas from {}".format(len(dataset), squad_file_path))
    if need_char == True:
        char_tokenizer = CharacterTokenizer()
        # add char tokenizers
        for squad in dataset:
            passage_char = char_tokenizer.batch_tokenize(squad.passage)
            question_char = char_tokenizer.batch_tokenize(squad.question)
            squad.passage_char = [[ch.text for ch in chars] for chars in passage_char]
            squad.question_char = [[ch.text for ch in chars] for chars in question_char]
        print ("add passage_char, question_char for each squad object")
    return dataset
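As a side note, batch_tokenize simply applies tokenize to each string in a list, so the two list comprehensions above unpack a list of lists of Tokens; a minimal sketch (not from the example) of that structure:

from allennlp.data.tokenizers import CharacterTokenizer

chars = CharacterTokenizer().batch_tokenize(["an", "egg"])
print([[t.text for t in word] for word in chars])  # [['a', 'n'], ['e', 'g', 'g']]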
Example #11
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper, StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256

CUDA_DEVICE = 0

#def main():
reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={
        'tokens': SingleIdTokenIndexer(namespace='target_tokens')
    })
train_dataset = reader.read('/.../en_el_train.txt')
validation_dataset = reader.read('/.../en_el_dev.txt')

vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                  min_count={
                                      'tokens': 3,
                                      'target_tokens': 3
                                  })

en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)
Example #12
def main():
	###############################################################################################
	prepare_global_logging(serialization_dir=args.serialization_dir, file_friendly_logging=False)
	#DATA
	reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
	                        target_tokenizer=CharacterTokenizer(),
	                        source_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
	                        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='tokens')},
	                        target=False,
	                        label=True,
	                        lazy=False)
	# train_data = reader.read("../../datasets/math/label-data/train-all")
	# val_data = reader.read("../../datasets/math/label-data/interpolate")
	val_data = reader.read("./generate_files")


	vocab = Vocabulary()
	vocab.add_tokens_to_namespace([START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-', '.', '/',
	                                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?',
	                                    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
	                                    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b',
	                                    'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
	                                    'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}'], namespace='tokens')
	vocab.add_tokens_to_namespace(['algebra', 'arithmetic', 'calculus', 'comparison',
	  								 'measurement', 'numbers', 'polynomials', 'probability'], namespace='labels')



	# MODEL
	embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
	                             embedding_dim=EMBEDDING_DIM)
	source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

	if args.model == 'lstm':
		encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, 
											num_layers=NUM_LAYERS, batch_first=True))
	elif args.model == 'cnn':
		encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS, output_dim=HIDDEN_DIM)
	else:
		raise NotImplementedError("The classifier model should be LSTM or CNN")


	model = TextClassifier(vocab=vocab,
				source_text_embedder=source_embedder,
	            encoder=encoder,
	            )
	model.to(device)


	if not Path(args.serialization_dir).is_dir():
		raise FileNotFoundError("The serialization directory with the trained model does not exist")
	with open(Path(args.serialization_dir) / "best.th", "rb") as model_path:
		model_state = torch.load(model_path, map_location=nn_util.device_mapping(-1))
		model.load_state_dict(model_state)
	model.eval()

	predictor = TextClassifierPredictor(model, dataset_reader=reader)

	# TEST
	correct = 0
	total = 0

	pbar = tqdm(val_data)
	batch_instance = list()
	batch_gt = list()

	for instance in pbar:
		batch_instance.append(instance)
		batch_gt.append(instance.fields["labels"].label)  # str
		if len(batch_instance) == BATCH_SIZE:
			outputs = predictor.predict(batch_instance)
			for i, output in enumerate(outputs):
				if batch_gt[i] == output['predict_labels']:
					correct += 1
				total += 1
			batch_instance = list()
			batch_gt = list()
			pbar.set_description("correct/total %.3f" % (correct / total))

	# evaluate any remaining instances in the final partial batch
	if batch_instance:
		outputs = predictor.predict(batch_instance)
		for i, output in enumerate(outputs):
			if batch_gt[i] == output['predict_labels']:
				correct += 1
			total += 1
		print("correct/total %.3f" % (correct / total))
Example #13
def main():
    ###############################################################################################
    prepare_global_logging(serialization_dir=args.serialization_dir,
                           file_friendly_logging=False)
    #DATA
    reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
                               target_tokenizer=CharacterTokenizer(),
                               source_token_indexers={
                                   'tokens':
                                   SingleIdTokenIndexer(namespace='tokens')
                               },
                               target_token_indexers={
                                   'tokens':
                                   SingleIdTokenIndexer(namespace='tokens')
                               },
                               target=False,
                               label=True,
                               lazy=True)
    train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([
        START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-',
        '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<',
        '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
        'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
        'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
        '}'
    ],
                                  namespace='tokens')
    vocab.add_tokens_to_namespace([
        'algebra', 'arithmetic', 'calculus', 'comparison', 'measurement',
        'numbers', 'polynomials', 'probability'
    ],
                                  namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM,
                          HIDDEN_DIM,
                          num_layers=NUM_LAYERS,
                          batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(
        vocab=vocab,
        source_text_embedder=source_embedder,
        encoder=encoder,
    )
    model.to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=1e-3,
                           betas=(0.9, 0.995),
                           eps=1e-6)

    train_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                    max_instances_in_memory=1024,
                                    sorting_keys=[("source_tokens",
                                                   "num_tokens")])
    train_iterator = MultiprocessIterator(train_iterator, num_workers=16)
    train_iterator.index_with(vocab)

    val_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                  max_instances_in_memory=1024,
                                  sorting_keys=[("source_tokens", "num_tokens")
                                                ])
    val_iterator = MultiprocessIterator(val_iterator, num_workers=16)
    val_iterator.index_with(vocab)
    #pdb.set_trace()

    LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1}
    lr_scheduler = LearningRateScheduler.from_params(optimizer,
                                                     Params(LR_SCHEDULER))

    # TRAIN
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=None,
                      train_dataset=train_data,
                      validation_dataset=None,
                      patience=None,
                      shuffle=True,
                      num_epochs=1,
                      summary_interval=100,
                      learning_rate_scheduler=lr_scheduler,
                      cuda_device=CUDA_DEVICES,
                      grad_norm=5,
                      grad_clipping=5,
                      model_save_interval=600,
                      serialization_dir=args.serialization_dir,
                      keep_serialized_model_every_num_seconds=3600,
                      should_log_parameter_statistics=True,
                      should_log_learning_rate=True)
    trainer.train()
Example #14
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.token_indexers import SingleIdTokenIndexer

data_reader_configs = {
    'debug': None,
    'ud-eng': None,
    'nc_zhen': {
        'source_tokenizer': WordTokenizer(),
        'target_tokenizer': CharacterTokenizer(),
        'source_token_indexers': {
            'tokens': SingleIdTokenIndexer()
        },
        'target_token_indexers': {
            'tokens': SingleIdTokenIndexer(namespace='target_tokens')
        }
    },
    'wikitext': {}
}


def get_datareader_configs(dataset_name):
    return data_reader_configs.get(dataset_name)
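A hypothetical usage sketch (only get_datareader_configs and the 'nc_zhen' key come from this example; Seq2SeqDatasetReader is the reader used in the other examples, assuming the pre-1.0 AllenNLP API): the returned dict can be unpacked straight into the reader's constructor.

from allennlp.data.dataset_readers import Seq2SeqDatasetReader

nc_zhen_config = get_datareader_configs('nc_zhen')
reader = Seq2SeqDatasetReader(**nc_zhen_config)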
Example #15
def __init__(self,
             namespace=u'token_characters',
             character_tokenizer=CharacterTokenizer()):
    self._namespace = namespace
    self._character_tokenizer = character_tokenizer