def __init__(self, vocab, use_postags_only=True, embed_dim=100, hidden_size=200,
             recurrent_dropout_probability=0.3, use_highway=False, maxpool=True):
    super(BLSTMModel, self).__init__()
    self.embeds = Embedding.from_params(
        vocab,
        Params({
            'vocab_namespace': 'pos' if use_postags_only else 'tokens',
            'embedding_dim': embed_dim,
            'trainable': True,
            'padding_index': 0,
            'pretrained_file': None if use_postags_only else
                'https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz',
        }))
    self.binary_feature_embedding = Embedding(2, embed_dim)
    self.fwd_lstm = PytorchSeq2SeqWrapper(
        AugmentedLstm(
            input_size=embed_dim * 2,
            hidden_size=hidden_size,
            go_forward=True,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False,
            use_highway=use_highway),
        stateful=False)
    self.bwd_lstm = PytorchSeq2SeqWrapper(
        AugmentedLstm(
            input_size=embed_dim * 2,
            hidden_size=hidden_size,
            go_forward=False,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False,
            use_highway=use_highway),
        stateful=False)
    self.maxpool = maxpool
    self.fc = nn.Linear(hidden_size * 2, 1, bias=False)
def __init__(self,
             vocab: Vocabulary,
             recurrent_dropout_probability: float = 0.0,
             embedding_dropout_probability: float = 0.0,
             input_size=512,
             hidden_size=512) -> None:
    """
    :param vocab: vocabulary used to size the decoder and look up special tokens
    :param recurrent_dropout_probability: recurrent dropout to add to LSTM layers
    :param embedding_dropout_probability: dropout applied to the embeddings
    :param input_size: input dimension of each stacked LSTM
    :param hidden_size: hidden dimension of each stacked LSTM
    """
    super(SimpleBiLM, self).__init__()
    self.forward_lm = PytorchSeq2SeqWrapper(
        StackedLstm(input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=2,
                    go_forward=True,
                    recurrent_dropout_probability=recurrent_dropout_probability,
                    use_input_projection_bias=False,
                    use_highway=True),
        stateful=True)
    self.reverse_lm = PytorchSeq2SeqWrapper(
        StackedLstm(input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=2,
                    go_forward=False,
                    recurrent_dropout_probability=recurrent_dropout_probability,
                    use_input_projection_bias=False,
                    use_highway=True),
        stateful=True)
    # This will also be the encoder; note the projection assumes hidden_size == 512.
    self.decoder = torch.nn.Linear(512, vocab.get_vocab_size(namespace='tokens'))
    self.vocab = vocab
    self.register_buffer('eos_tokens', torch.LongTensor(
        [vocab.get_token_index(tok) for tok in
         ['.', '!', '?', '@@UNKNOWN@@', '@@PADDING@@', '@@bos@@', '@@eos@@']]))
    self.register_buffer('invalid_tokens', torch.LongTensor(
        [vocab.get_token_index(tok) for tok in
         ['@@UNKNOWN@@', '@@PADDING@@', '@@bos@@', '@@eos@@', '@@NEWLINE@@']]))
    self.embedding_dropout_probability = embedding_dropout_probability
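# Both language models above are wrapped with stateful=True, so the wrapper
# caches final LSTM states between forward calls. A minimal sketch of that
# behavior, with a plain torch.nn.LSTM standing in for the project's
# StackedLstm (whose exact signature is repo-specific):
import torch
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper

lm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(512, 512, num_layers=2, batch_first=True), stateful=True)
inputs = torch.rand(2, 8, 512)
mask = torch.ones(2, 8, dtype=torch.long)
out = lm(inputs, mask)  # final hidden states are now cached inside the wrapper
lm.reset_states()       # clear cached states before encoding unrelated text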
def setUp(self):
    self.reader = ToyReader()
    self.train_instances = self.reader.read(
        "/home/IAIS/nchakrabor/nmt_data/toy_reverse/train/toy_train.txt")
    self.dev_instances = self.reader.read(
        "/home/IAIS/nchakrabor/nmt_data/toy_reverse/dev/toy_dev.txt")
    self.vocab = Vocabulary.from_instances(self.train_instances + self.dev_instances)

    token_embedding = Embedding(
        num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
        embedding_dim=256,
        padding_index=0)

    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    encoder = PytorchSeq2SeqWrapper(
        nn.LSTM(input_size=word_embeddings.get_output_dim(),
                num_layers=2,
                hidden_size=256,
                bidirectional=True,
                dropout=0.4,
                batch_first=True))

    # self.set_up_model(model_params_file_path, dataset_sample_file_path)

    self.model = SimpleSeq2Seq(vocab=self.vocab,
                               source_embedder=word_embeddings,
                               encoder=encoder,
                               target_embedding_dim=256,
                               target_namespace='target_tokens',
                               attention=DotProductAttention(),
                               max_decoding_steps=25,
                               beam_size=5,
                               use_bleu=True)

    self.model.cuda(0)
def __init__(self):
    # CopyNet model initialization parameters
    self.vocabulary = Vocabulary.from_files(
        "C:/Users/Selma/PycharmProjects/ROS2SemanticParser/"
        "CN_model_weights/no_embedds/model.tar.gz")
    self.source_embedder = BasicTextFieldEmbedder(
        token_embedders={
            'tokens': Embedding(
                num_embeddings=self.vocabulary.get_vocab_size('source_tokens'),
                embedding_dim=310)
        })
    self.dataset_reader = CopyNetDatasetReader(target_namespace="target_tokens")
    self.encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(input_size=310, hidden_size=128, num_layers=1, batch_first=True))
    self.attention = BilinearAttention(vector_dim=128, matrix_dim=128)
    self.beam_size = 5
    self.max_decoding_steps = 200
    self.target_embedding_dim = 150
    self.semantic_parser = CopyNetSeq2Seq(
        vocab=self.vocabulary,
        source_embedder=self.source_embedder,
        encoder=self.encoder,
        attention=self.attention,
        beam_size=self.beam_size,
        max_decoding_steps=self.max_decoding_steps,
        target_embedding_dim=self.target_embedding_dim)
def from_params(self, params: Params) -> PytorchSeq2SeqWrapper:
    if not params.pop_bool('batch_first', True):
        raise ConfigurationError("Our encoder semantics assumes batch is always first!")
    if self._module_class in self.PYTORCH_MODELS:
        params['batch_first'] = True
    module = self._module_class(**params.as_dict())
    return PytorchSeq2SeqWrapper(module)
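# A minimal sketch (not part of the factory above) of how the returned wrapper
# is used: it follows AllenNLP's Seq2SeqEncoder contract, taking a
# (batch, timesteps, dim) tensor plus a (batch, timesteps) padding mask and
# returning per-timestep encodings.
import torch
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper

encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(input_size=100, hidden_size=200, batch_first=True))
inputs = torch.rand(4, 10, 100)              # (batch, timesteps, input_size)
mask = torch.ones(4, 10, dtype=torch.long)   # 1 = real token, 0 = padding
                                             # (newer AllenNLP versions use a BoolTensor)
output = encoder(inputs, mask)               # -> (4, 10, 200)
assert output.size(-1) == encoder.get_output_dim()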
def from_params(cls, params: Params) -> 'PytorchSeq2SeqWrapper':
    input_size = params.pop("input_size")
    hidden_size = params.pop("hidden_size")
    cell_params = params.pop("cell")
    cell = Cell.from_params(cell_params)
    return PytorchSeq2SeqWrapper(
        cls(input_size=input_size, hidden_size=hidden_size, cell=cell))
def from_params(self, params: Params, **extras) -> PytorchSeq2SeqWrapper:
    if not params.pop_bool('batch_first', True):
        raise ConfigurationError("Our encoder semantics assumes batch is always first!")
    if self._module_class in self.PYTORCH_MODELS:
        params['batch_first'] = True
    stateful = params.pop_bool('stateful', False)
    weight_dropout = params.pop_float('weight_dropout', 0.0)
    variational = params.pop_bool('variational', True)  # a flag, not a float
    num_layers = params.get('num_layers', 1)
    bidirectional = params.get('bidirectional', False)
    # Names of the recurrent weight matrices to apply weight dropout to.
    all_recurrent_weights = [
        f"weight_hh_l{layer}{suffix}"
        for layer, suffix in product(
            range(num_layers),
            [""] + (["_reverse"] if bidirectional else []))
    ]
    if weight_dropout > 0.0:
        module = weight_drop_factory(self._module_class)(
            module_args=params.as_dict(infer_type_and_cast=True),
            weights=all_recurrent_weights,
            wdrop=weight_dropout,
            variational=variational,
        )
    else:
        module = self._module_class(**params.as_dict(infer_type_and_cast=True))
    return PytorchSeq2SeqWrapper(module, stateful=stateful)
def get_encoder(st_ds_conf: dict):
    emb_sz = st_ds_conf['emb_sz']
    if st_ds_conf['encoder'] == 'lstm':
        encoder = StackedEncoder(
            [PytorchSeq2SeqWrapper(torch.nn.LSTM(emb_sz, emb_sz, batch_first=True))
             for _ in range(st_ds_conf['num_enc_layers'])],
            emb_sz, emb_sz,
            input_dropout=st_ds_conf['intermediate_dropout'])
    elif st_ds_conf['encoder'] == 'bilstm':
        # The first layer maps emb_sz -> 2 * emb_sz; subsequent layers consume
        # the doubled (bidirectional) dimension.
        encoder = StackedEncoder(
            [PytorchSeq2SeqWrapper(
                torch.nn.LSTM(emb_sz, emb_sz, batch_first=True, bidirectional=True))]
            + [PytorchSeq2SeqWrapper(
                torch.nn.LSTM(emb_sz * 2, emb_sz, batch_first=True, bidirectional=True))
               for _ in range(st_ds_conf['num_enc_layers'] - 1)],
            emb_sz, emb_sz * 2,
            input_dropout=st_ds_conf['intermediate_dropout'])
    elif st_ds_conf['encoder'] == 'transformer':
        encoder = StackedEncoder(
            [TransformerEncoder(
                input_dim=emb_sz,
                num_layers=st_ds_conf['num_enc_layers'],
                num_heads=st_ds_conf['num_heads'],
                feedforward_hidden_dim=emb_sz,
                feedforward_dropout=st_ds_conf['feedforward_dropout'],
                residual_dropout=st_ds_conf['residual_dropout'],
                attention_dropout=st_ds_conf['attention_dropout'],
            ) for _ in range(st_ds_conf['num_enc_layers'])],
            emb_sz, emb_sz,
            input_dropout=0.)
    else:
        raise ValueError('unsupported encoder type: {}'.format(st_ds_conf['encoder']))
    return encoder
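# A hypothetical configuration dict for get_encoder above; the key names are
# taken from the function body, but the values are illustrative, not from the
# original source:
st_ds_conf = {
    'emb_sz': 256,
    'encoder': 'bilstm',          # one of 'lstm' | 'bilstm' | 'transformer'
    'num_enc_layers': 2,
    'intermediate_dropout': 0.2,
    # consulted only by the 'transformer' branch:
    'num_heads': 8,
    'feedforward_dropout': 0.1,
    'residual_dropout': 0.1,
    'attention_dropout': 0.1,
}
encoder = get_encoder(st_ds_conf)  # output dim is emb_sz * 2 for 'bilstm'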
def from_params(self, params: Params) -> PytorchSeq2SeqWrapper: if not params.pop_bool("batch_first", True): raise ConfigurationError("Our encoder semantics assumes batch is always first!") if self._module_class in self.PYTORCH_MODELS: params["batch_first"] = True stateful = params.pop_bool("stateful", False) module = self._module_class(**params.as_dict(infer_type_and_cast=True)) return PytorchSeq2SeqWrapper(module, stateful=stateful)
def __init__(self,
             input_dim: int,
             combination: str = "x,y",
             num_width_embeddings: int = None,
             span_width_embedding_dim: int = None,
             bucket_widths: bool = False,
             use_exclusive_start_indices: bool = False) -> None:
    super().__init__()
    self._input_dim = input_dim
    self._combination = combination
    self._encoder = PytorchSeq2SeqWrapper(
        StackedBidirectionalLstm(self._input_dim, int(floor(self._input_dim / 2)), 1))
    self._span_extractor = BidirectionalEndpointSpanExtractor(
        self._input_dim, "y", "y", num_width_embeddings,
        span_width_embedding_dim, bucket_widths)
def __init__(self, input_dropout, pretrained, average_pool, semantic, final_dim,
             backbone, head):
    super().__init__()
    self.detector = SimpleDetector(pretrained=pretrained,
                                   average_pool=average_pool,
                                   semantic=semantic,
                                   final_dim=final_dim)
    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None
    self.encoder_model = TimeDistributed(
        PytorchSeq2SeqWrapper(
            torch.nn.LSTM(1280, 256, batch_first=True, bidirectional=True)))
    self.backbone = build_backbone(backbone)
    # self.combine_model = build_combine_layer(combine_model)  # combine text and image
    self.head = build_head(head)
def __init__(self,
             vocab: Vocabulary,
             span_emb_dim: int,
             tree_prop: int = 1,
             tree_dropout: float = 0.0,
             tree_children: str = 'attention',
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(Tree, self).__init__(vocab, regularizer)
    self._span_emb_dim = span_emb_dim
    assert span_emb_dim % 2 == 0
    self._f_network = FeedForward(input_dim=2 * span_emb_dim,
                                  num_layers=1,
                                  hidden_dims=span_emb_dim,
                                  activations=torch.nn.Sigmoid(),
                                  dropout=0)
    self._tree_prop = tree_prop
    self._tree_children = tree_children
    if self._tree_children == 'attention':
        self._global_attention = TimeDistributed(torch.nn.Linear(span_emb_dim, 1))
    elif self._tree_children == 'pooling':
        pass
    elif self._tree_children == 'conv':
        self._conv = torch.nn.Conv1d(span_emb_dim, span_emb_dim,
                                     kernel_size=3, padding=1)
    elif self._tree_children == 'rnn':
        self._encoder = PytorchSeq2SeqWrapper(
            StackedBidirectionalLstm(span_emb_dim, int(floor(span_emb_dim / 2)), 1))
    else:
        raise RuntimeError('invalid tree_children option: {}'.format(self._tree_children))
    self._dropout = torch.nn.Dropout(p=tree_dropout)
    initializer(self)
def __init__(self, pretrained=True, average_pool=True, semantic=True, final_dim=512,
             input_dropout=0.3, reasoning_use_obj=True, reasoning_use_answer=True,
             reasoning_use_question=True, pool_reasoning=True, pool_answer=True,
             pool_question=True):
    super().__init__()
    # self.detector = SimpleDetector(pretrained=pretrained,
    #                                average_pool=average_pool,
    #                                semantic=semantic,
    #                                final_dim=final_dim)
    self.reasoning_encoder = TimeDistributed(
        PytorchSeq2SeqWrapper(
            torch.nn.LSTM(1536, 256, num_layers=2, batch_first=True,
                          bidirectional=True)))
    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None
    self.span_attention = BilinearMatrixAttention(matrix_1_dim=final_dim,
                                                  matrix_2_dim=final_dim)
    self.obj_attention = BilinearMatrixAttention(matrix_1_dim=final_dim,
                                                 matrix_2_dim=final_dim)
    self.reasoning_use_obj = reasoning_use_obj
    self.reasoning_use_answer = reasoning_use_answer
    self.reasoning_use_question = reasoning_use_question
    self.pool_reasoning = pool_reasoning
    self.pool_answer = pool_answer
    self.pool_question = pool_question
    InitializerApplicator()(self)
def allennlp_seq2seq(c, num_layers, input, hidden, cell, batch, timestep, repeat,
                     cuda, output):
    num_layers = int(num_layers)
    input = int(input)
    hidden = int(hidden)
    cell = int(cell)
    batch = int(batch)
    timestep = int(timestep)
    repeat = int(repeat)

    # Build a stack of stateful unidirectional AugmentedLstm layers.
    lstms = []
    lstm_input = input
    for _ in range(num_layers):
        lstms.append(
            PytorchSeq2SeqWrapper(
                AugmentedLstm(
                    input_size=lstm_input,
                    hidden_size=hidden,
                    use_highway=False,
                    use_input_projection_bias=False,
                ),
                stateful=True))
        lstm_input = hidden

    input_tensor = torch.rand(batch, timestep, input)
    if cuda == 'cuda':
        input_tensor = input_tensor.cuda()
        lstms = [l.cuda() for l in lstms]

    durations = []
    for idx in range(repeat):
        # Random valid lengths per sequence, sorted descending; the first
        # sequence always spans the full timestep range.
        batch_lengths = [timestep]
        batch_lengths.extend(
            [random.randrange(timestep + 1) for _ in range(batch - 1)])
        batch_lengths = sorted(batch_lengths, reverse=True)
        mask = torch.zeros(batch, timestep, dtype=torch.long)
        for mask_idx, length in enumerate(batch_lengths):
            mask[mask_idx, :length] = 1
        if cuda == 'cuda':
            mask = mask.cuda()

        with torch.no_grad():
            time_start = time.time()
            lstm_input = input_tensor
            for lstm in lstms:
                lstm_input = lstm(lstm_input, mask)
            durations.append((idx, time.time() - time_start))

    with open(output, 'w') as fout:
        json.dump(
            {'type': 'allennlp_seq2seq', 'cuda': cuda, 'durations': durations},
            fout,
            ensure_ascii=False,
            indent=2,
        )
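# The mask loop above can also be written with an AllenNLP utility; a sketch,
# assuming allennlp.nn.util is available in the version in use:
import torch
from allennlp.nn.util import get_mask_from_sequence_lengths

lengths = torch.tensor([10, 7, 3])                             # sorted descending
mask = get_mask_from_sequence_lengths(lengths, max_length=10)  # (3, 10) 0/1 mask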
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             char_field_embedder: TextFieldEmbedder,
             # num_highway_layers: int,
             phrase_layer: Seq2SeqEncoder,
             char_rnn: Seq2SeqEncoder,
             hops: int,
             hidden_dim: int,
             dropout: float = 0.2,
             mask_lstms: bool = True,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(BidirectionalAttentionFlow, self).__init__(vocab, regularizer)
    self._text_field_embedder = text_field_embedder
    self._char_field_embedder = char_field_embedder
    self._features_embedder = nn.Embedding(2, 5)
    # self._highway_layer = TimeDistributed(Highway(
    #     text_field_embedder.get_output_dim() + 5 * 3, num_highway_layers))
    self._phrase_layer = phrase_layer
    self._encoding_dim = phrase_layer.get_output_dim()
    # self._stacked_brnn = PytorchSeq2SeqWrapper(
    #     StackedBidirectionalLstm(input_size=self._encoding_dim,
    #                              hidden_size=hidden_dim,
    #                              num_layers=3,
    #                              recurrent_dropout_probability=0.2))
    self._char_rnn = char_rnn

    self.hops = hops
    self.interactive_aligners = nn.ModuleList()
    self.interactive_SFUs = nn.ModuleList()
    self.self_aligners = nn.ModuleList()
    self.self_SFUs = nn.ModuleList()
    self.aggregate_rnns = nn.ModuleList()
    for i in range(hops):
        # interactive aligner
        self.interactive_aligners.append(layers.SeqAttnMatch(self._encoding_dim))
        self.interactive_SFUs.append(
            layers.SFU(self._encoding_dim, 3 * self._encoding_dim))
        # self aligner
        self.self_aligners.append(layers.SelfAttnMatch(self._encoding_dim))
        self.self_SFUs.append(
            layers.SFU(self._encoding_dim, 3 * self._encoding_dim))
        # aggregating
        self.aggregate_rnns.append(
            PytorchSeq2SeqWrapper(
                nn.LSTM(input_size=self._encoding_dim,
                        hidden_size=hidden_dim,
                        num_layers=1,
                        dropout=0.2,
                        bidirectional=True,
                        batch_first=True)))

    # Memory-based answer pointer
    self.mem_ans_ptr = layers.MemoryAnsPointer(x_size=self._encoding_dim,
                                               y_size=self._encoding_dim,
                                               hidden_size=hidden_dim,
                                               hop=hops,
                                               dropout_rate=0.2,
                                               normalize=True)

    self._span_start_accuracy = CategoricalAccuracy()
    self._span_end_accuracy = CategoricalAccuracy()
    self._span_yesno_accuracy = CategoricalAccuracy()
    self._span_accuracy = BooleanAccuracy()
    self._squad_metrics = SquadEmAndF1()
    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    self._mask_lstms = mask_lstms

    initializer(self)
def setUp(self):
    self.sample_only = False
    # self.setupstubexecutor()

    model_params_file_path = self.TEST_DATA_ROOT / "experiment.json"
    self.dataset_sample_file_path = (
        self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.deurified.simple.sample.json")
    self.dataset_train_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.train.json"
    self.dataset_test_file_path = self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v3.test.json"
    predicates_file_path = self.TEST_DATA_ROOT / "properties.txt"
    with codecs.open(predicates_file_path) as fp:
        self.predicates = [i.strip() for i in fp]

    dbo_classes = set(dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper())
    binary_predicates = set(self.predicates) - dbo_classes

    if self.sample_only:
        self.sample_reader = LCQuADReaderSimple(predicates=binary_predicates,
                                                ontology_types=dbo_classes)
    else:
        self.train_reader = LCQuADReaderSimple(predicates=binary_predicates,
                                               ontology_types=dbo_classes)
        # self.test_reader = LCQuADReaderSimple(predicates=binary_predicates,
        #                                       ontology_types=dbo_classes)

    # sample_reader.cache_data("sample_dataset")
    # train_reader.cache_data("train_dataset")
    # test_reader.cache_data("test_dataset")

    if self.sample_only:
        self.sample_instances = list(
            self.sample_reader.read(str(self.dataset_sample_file_path)))
    else:
        self.train_instances = list(
            self.train_reader.read(str(self.dataset_train_file_path)))
        self.test_instances = list(
            self.train_reader.read(str(self.dataset_test_file_path)))

    if self.sample_only:
        self.vocab = Vocabulary.from_instances(self.sample_instances)
    else:
        self.vocab = Vocabulary.from_instances(
            self.train_instances + self.test_instances,
            min_count={'tokens': 3, 'target_tokens': 3})
    # self.vocab = Vocabulary()

    token_embedding = Embedding(
        num_embeddings=self.vocab.get_vocab_size('tokens') + 2,
        embedding_dim=512,
        padding_index=0)

    # options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # the embedder maps the input tokens to the appropriate embedding matrix
    # elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    encoder = PytorchSeq2SeqWrapper(
        nn.LSTM(input_size=word_embeddings.get_output_dim(),
                num_layers=2,
                hidden_size=256,
                bidirectional=True,
                dropout=0.5,
                batch_first=True))

    val_outputs = self.TEST_DATA_ROOT / "val_outputs.seq2seq.json"
    self.val_outputs_fp = codecs.open(val_outputs, 'w')

    # self.set_up_model(model_params_file_path, dataset_sample_file_path)

    self.model = SimpleSeq2Seq(vocab=self.vocab,
                               source_embedder=word_embeddings,
                               encoder=encoder,
                               target_embedding_dim=128,
                               target_namespace='target_tokens',
                               attention=DotProductAttention(),
                               max_decoding_steps=25,
                               beam_size=5,
                               use_bleu=True,
                               scheduled_sampling_ratio=0.3)
    self.model.cuda(0)
def __init__(self, conf: Dict,
             input_batchers: Dict[str, Union[WordBatch, CharacterBatch]],
             n_class: int,
             use_cuda: bool):
    super(SeqLabelModel, self).__init__()
    self.n_class = n_class
    self.use_cuda = use_cuda
    self.input_dropout = torch.nn.Dropout2d(p=conf["dropout"])
    self.dropout = InputVariationalDropout(p=conf['dropout'])

    # Build one input layer (word embeddings, character encoder, or ELMo)
    # per configured field.
    input_layers = {}
    for i, c in enumerate(conf['input']):
        if c['type'] == 'embeddings':
            if 'pretrained' in c:
                embs = load_embedding_txt(c['pretrained'], c['has_header'])
                logger.info('loaded {0} embedding entries.'.format(len(embs[0])))
            else:
                embs = None
            name = c['name']
            mapping = input_batchers[name].mapping
            layer = Embeddings(c['dim'], mapping, fix_emb=c['fixed'], embs=embs,
                               normalize=c.get('normalize', False),
                               input_field_name=name)
            logger.info('embedding for field {0} created with {1} x {2}.'.format(
                c['field'], layer.n_V, layer.n_d))
            input_layers[name] = layer
        elif c['type'] == 'cnn_encoder' or c['type'] == 'lstm_encoder':
            name = c['name']
            mapping = input_batchers[name].mapping
            embeddings = Embeddings(c['dim'], mapping, fix_emb=False, embs=None,
                                    normalize=False,
                                    input_field_name='{0}_ch_emb'.format(name))
            logger.info('character embedding for field {0} created with {1} x {2}.'.format(
                c['field'], embeddings.n_V, embeddings.n_d))
            if c['type'] == 'lstm_encoder':
                layer = LstmTokenEmbedder(c['dim'], embeddings, conf['dropout'],
                                          use_cuda, input_field_name=name)
            elif c['type'] == 'cnn_encoder':
                layer = ConvTokenEmbedder(c['dim'], embeddings, c['filters'],
                                          c.get('n_highway', 1),
                                          c.get('activation', 'relu'),
                                          use_cuda, input_field_name=name)
            else:
                raise ValueError('Unknown type: {}'.format(c['type']))
            input_layers[name] = layer
        elif c['type'] == 'elmo':
            name = c['name']
            layer = ContextualizedWordEmbeddings(name, c['path'], use_cuda)
            input_layers[name] = layer
        else:
            raise ValueError('{} unknown input layer'.format(c['type']))
    self.input_layers = torch.nn.ModuleDict(input_layers)

    # Combine the input layers into encoders (affine / sum / concat).
    input_encoders = []
    input_dim = 0
    for i, c in enumerate(conf['input_encoder']):
        input_info = {name: self.input_layers[name].get_output_dim()
                      for name in c['input']}
        if c['type'] == 'affine':
            input_encoder = AffineTransformInputEncoder(input_info, c['dim'], use_cuda)
        elif c['type'] == 'sum':
            input_encoder = SummationInputEncoder(input_info, use_cuda)
        elif c['type'] == 'concat':
            input_encoder = ConcatenateInputEncoder(input_info, use_cuda)
        else:
            raise ValueError('{} unknown input encoder'.format(c['type']))
        input_dim += input_encoder.get_output_dim()
        input_encoders.append(input_encoder)
    self.input_encoders = torch.nn.ModuleList(input_encoders)

    # Context encoder over the combined inputs.
    encoder_name = conf['encoder']['type'].lower()
    if encoder_name == 'stacked_bidirectional_lstm':
        lstm = StackedBidirectionalLstm(
            input_size=input_dim,
            hidden_size=conf['encoder']['hidden_dim'],
            num_layers=conf['encoder']['n_layers'],
            recurrent_dropout_probability=conf['dropout'],
            layer_dropout_probability=conf['dropout'],
            use_highway=conf['encoder'].get('use_highway', True))
        self.encoder = PytorchSeq2SeqWrapper(lstm, stateful=False)
        encoded_input_dim = self.encoder.get_output_dim()
    elif encoder_name == 'project':
        self.encoder = ProjectedEncoder(input_dim,
                                        conf['encoder']['hidden_dim'],
                                        dropout=conf['dropout'])
        encoded_input_dim = self.encoder.get_output_dim()
    elif encoder_name == 'dummy':
        self.encoder = DummyEncoder()
        encoded_input_dim = input_dim
    else:
        raise ValueError('Unknown input encoder: {}'.format(encoder_name))

    if conf["classifier"]["type"].lower() == 'crf':
        self.classify_layer = CRFLayer(encoded_input_dim, n_class, use_cuda)
    else:
        self.classify_layer = ClassifyLayer(encoded_input_dim, n_class, use_cuda)

    self.encode_time = 0
    self.emb_time = 0
    self.classify_time = 0
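# A hypothetical `conf` skeleton for SeqLabelModel above; every key is taken
# from the constructor's accesses, but the values are illustrative only:
conf = {
    'dropout': 0.3,
    'input': [
        {'type': 'embeddings', 'name': 'word', 'field': 'word', 'dim': 100,
         'fixed': False},  # optional: 'pretrained', 'has_header', 'normalize'
        {'type': 'lstm_encoder', 'name': 'char', 'field': 'char', 'dim': 50},
    ],
    'input_encoder': [
        {'type': 'concat', 'input': ['word', 'char']},
    ],
    'encoder': {'type': 'stacked_bidirectional_lstm',
                'hidden_dim': 200, 'n_layers': 2},  # optional: 'use_highway'
    'classifier': {'type': 'crf'},
}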
def setUp(self):
    self.sample_only = False
    self.setUpExecutor()
    # self.setupstubexecutor()

    model_params_file_path = self.TEST_DATA_ROOT / "experiment.json"
    self.dataset_sample_file_path = (
        self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.sample.json")
    self.dataset_train_file_path = (
        self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.train.json")
    self.dataset_test_file_path = (
        self.TEST_DATA_ROOT / "lcquad.annotated.lisp.v2.deurified.test.json")
    predicates_file_path = self.TEST_DATA_ROOT / "properties.txt"
    with codecs.open(predicates_file_path) as fp:
        self.predicates = [i.strip() for i in fp]

    dbo_classes = set(dbo for dbo in self.predicates if dbo.split("/")[-1][0].isupper())
    binary_predicates = set(self.predicates) - dbo_classes

    token_indexer = None  # {'tokens': ELMoTokenCharactersIndexer()}

    if self.sample_only:
        sample_reader = LCQuADReader(executor=self.executor,
                                     predicates=binary_predicates,
                                     token_indexers=token_indexer,
                                     ontology_types=dbo_classes)
    else:
        train_reader = LCQuADReader(executor=self.executor,
                                    predicates=binary_predicates,
                                    token_indexers=token_indexer,
                                    ontology_types=dbo_classes)
        test_reader = LCQuADReader(executor=self.executor,
                                   predicates=binary_predicates,
                                   token_indexers=token_indexer,
                                   ontology_types=dbo_classes)

    # sample_reader.cache_data("sample_dataset")
    # train_reader.cache_data("train_dataset")
    # test_reader.cache_data("test_dataset")

    if self.sample_only:
        self.sample_instances = list(
            sample_reader.read(str(self.dataset_sample_file_path)))
    else:
        self.train_instances = list(
            train_reader.read(str(self.dataset_train_file_path)))
        self.test_instances = list(
            test_reader.read(str(self.dataset_test_file_path)))

    if self.sample_only:
        self.vocab = Vocabulary.from_instances(self.sample_instances)
    else:
        self.vocab = Vocabulary.from_instances(
            self.train_instances + self.test_instances)
    # self.vocab = Vocabulary()

    token_embedding = Embedding(
        num_embeddings=self.vocab.get_vocab_size() + 2,
        embedding_dim=256,
        padding_index=0)

    # options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # the embedder maps the input tokens to the appropriate embedding matrix
    # elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    encoder = PytorchSeq2SeqWrapper(
        nn.LSTM(input_size=word_embeddings.get_output_dim(),
                num_layers=1,
                hidden_size=128,
                bidirectional=True,
                # dropout=0.4,
                batch_first=True))

    val_outputs = self.TEST_DATA_ROOT / "val_outputs.json"
    self.val_outputs_fp = codecs.open(val_outputs, 'w')

    # self.set_up_model(model_params_file_path, dataset_sample_file_path)

    self.model = LCQuADMmlSemanticParser(
        vocab=self.vocab,
        sentence_embedder=word_embeddings,
        action_embedding_dim=256,
        encoder=encoder,
        attention=DotProductAttention(),
        decoder_beam_search=BeamSearch(beam_size=1),
        max_decoding_steps=50,
        dropout=0.5,
        val_outputs=self.val_outputs_fp)
    self.model.cuda(0)
def get_model(vocab, st_ds_conf):
    emb_sz = st_ds_conf['emb_sz']
    source_embedding = allennlp.modules.Embedding(
        num_embeddings=vocab.get_vocab_size('nltokens'), embedding_dim=emb_sz)
    target_embedding = allennlp.modules.Embedding(
        num_embeddings=vocab.get_vocab_size('lftokens'), embedding_dim=emb_sz)

    if st_ds_conf['encoder'] == 'lstm':
        encoder = StackedEncoder(
            [PytorchSeq2SeqWrapper(torch.nn.LSTM(emb_sz, emb_sz, batch_first=True))
             for _ in range(st_ds_conf['num_enc_layers'])],
            emb_sz, emb_sz,
            input_dropout=st_ds_conf['intermediate_dropout'])
    elif st_ds_conf['encoder'] == 'bilstm':
        encoder = StackedEncoder(
            [PytorchSeq2SeqWrapper(
                torch.nn.LSTM(emb_sz, emb_sz, batch_first=True, bidirectional=True))
             for _ in range(st_ds_conf['num_enc_layers'])],
            emb_sz, emb_sz,
            input_dropout=st_ds_conf['intermediate_dropout'])
    elif st_ds_conf['encoder'] == 'transformer':
        encoder = StackedEncoder(
            [TransformerEncoder(
                input_dim=emb_sz,
                num_layers=st_ds_conf['num_enc_layers'],
                num_heads=st_ds_conf['num_heads'],
                feedforward_hidden_dim=emb_sz,
                feedforward_dropout=st_ds_conf['feedforward_dropout'],
                residual_dropout=st_ds_conf['residual_dropout'],
                attention_dropout=st_ds_conf['attention_dropout'],
            ) for _ in range(st_ds_conf['num_enc_layers'])],
            emb_sz, emb_sz,
            input_dropout=st_ds_conf['intermediate_dropout'])
    else:
        raise ValueError('unsupported encoder type: {}'.format(st_ds_conf['encoder']))

    enc_out_dim = encoder.get_output_dim()
    dec_out_dim = emb_sz

    dec_hist_attn = get_attention(st_ds_conf, st_ds_conf['dec_hist_attn'])
    enc_attn = get_attention(st_ds_conf, st_ds_conf['enc_attn'])
    if st_ds_conf['enc_attn'] == 'dot_product':
        assert enc_out_dim == dec_out_dim, \
            "encoder hidden states must be able to multiply with decoder output"

    def sum_attn_dims(attns, dims):
        return sum(dim for attn, dim in zip(attns, dims) if attn is not None)

    if st_ds_conf['concat_attn_to_dec_input']:
        dec_in_dim = dec_out_dim + sum_attn_dims([enc_attn, dec_hist_attn],
                                                 [enc_out_dim, dec_out_dim])
    else:
        dec_in_dim = dec_out_dim
    rnn_cell = get_rnn_cell(st_ds_conf, dec_in_dim, dec_out_dim)

    if st_ds_conf['concat_attn_to_dec_input']:
        proj_in_dim = dec_out_dim + sum_attn_dims([enc_attn, dec_hist_attn],
                                                  [enc_out_dim, dec_out_dim])
    else:
        proj_in_dim = dec_out_dim
    word_proj = torch.nn.Linear(proj_in_dim, vocab.get_vocab_size('lftokens'))

    model = BaseSeq2Seq(
        vocab=vocab,
        encoder=encoder,
        decoder=rnn_cell,
        word_projection=word_proj,
        source_embedding=source_embedding,
        target_embedding=target_embedding,
        target_namespace='lftokens',
        start_symbol=START_SYMBOL,
        eos_symbol=END_SYMBOL,
        max_decoding_step=st_ds_conf['max_decoding_len'],
        enc_attention=enc_attn,
        dec_hist_attn=dec_hist_attn,
        intermediate_dropout=st_ds_conf['intermediate_dropout'],
        concat_attn_to_dec_input=st_ds_conf['concat_attn_to_dec_input'],
    )
    return model
def __init__(self, n_relations: int, conf: Dict,
             input_batchers: Dict[str, InputBatch],
             use_cuda: bool):
    super(BiaffineParser, self).__init__()
    self.n_relations = n_relations
    self.conf = conf
    self.use_cuda = use_cuda
    self.use_mst_decoding_for_validation = conf['use_mst_decoding_for_validation']

    input_layers = {}
    for i, c in enumerate(conf['input']):
        if c['type'] == 'embeddings':
            if 'pretrained' in c:
                embs = load_embedding_txt(c['pretrained'], c['has_header'])
                logger.info('loaded {0} embedding entries.'.format(len(embs[0])))
            else:
                embs = None
            name = c['name']
            mapping = input_batchers[name].mapping
            layer = Embeddings(name, c['dim'], mapping, fix_emb=c['fixed'],
                               embs=embs, normalize=c.get('normalize', False))
            logger.info('embedding for field {0} created with {1} x {2}.'.format(
                c['field'], layer.n_V, layer.n_d))
            input_layers[name] = layer
        elif c['type'] == 'cnn_encoder' or c['type'] == 'lstm_encoder':
            name = c['name']
            mapping = input_batchers[name].mapping
            embeddings = Embeddings('{0}_ch_emb'.format(name), c['dim'], mapping,
                                    fix_emb=False, embs=None, normalize=False)
            logger.info('character embedding for field {0} created with {1} x {2}.'.format(
                c['field'], embeddings.n_V, embeddings.n_d))
            if c['type'] == 'lstm_encoder':
                layer = LstmTokenEmbedder(name, c['dim'], embeddings,
                                          conf['dropout'], use_cuda)
            elif c['type'] == 'cnn_encoder':
                layer = ConvTokenEmbedder(name, c['dim'], embeddings, c['filters'],
                                          c.get('n_highway', 1),
                                          c.get('activation', 'relu'), use_cuda)
            else:
                raise ValueError('Unknown type: {}'.format(c['type']))
            input_layers[name] = layer
        elif c['type'] == 'elmo':
            name = c['name']
            layer = ContextualizedWordEmbeddings(name, c['path'], use_cuda)
            input_layers[name] = layer
        else:
            raise ValueError('{} unknown input layer'.format(c['type']))
    self.input_layers = torch.nn.ModuleDict(input_layers)

    input_encoders = []
    input_dim = 0
    for i, c in enumerate(conf['input_encoder']):
        input_info = {
            name: [entry['dim'] for entry in conf['input'] if entry['name'] == name][0]
            for name in c['input']
        }
        if c['type'] == 'affine':
            input_encoder = AffineTransformInputEncoder(input_info, c['dim'], use_cuda)
        elif c['type'] == 'sum':
            input_encoder = SummationInputEncoder(input_info, use_cuda)
        elif c['type'] == 'concat':
            input_encoder = ConcatenateInputEncoder(input_info, use_cuda)
        else:
            raise ValueError('{} unknown input encoder'.format(c['type']))
        input_dim += input_encoder.get_output_dim()
        input_encoders.append(input_encoder)
    self.input_encoders = torch.nn.ModuleList(input_encoders)

    c = conf['context_encoder']
    if c['type'] == 'stacked_bidirectional_lstm_dozat':
        self.encoder = PytorchSeq2SeqWrapper(
            InputDropoutedStackedBidirectionalLstm(
                DozatLstmCell,
                num_layers=c['num_layers'],
                input_size=input_dim,
                hidden_size=c['hidden_dim'],
                recurrent_dropout_probability=c['recurrent_dropout_probability'],
                layer_dropout_probability=c['layer_dropout_probability'],
                activation=Activation.by_name("leaky_relu")()),
            stateful=False)
    elif c['type'] == 'stacked_bidirectional_lstm_ma':
        self.encoder = PytorchSeq2SeqWrapper(
            InputDropoutedStackedBidirectionalLstm(
                MaLstmCell,
                num_layers=c['num_layers'],
                input_size=input_dim,
                hidden_size=c['hidden_dim'],
                recurrent_dropout_probability=c['recurrent_dropout_probability'],
                layer_dropout_probability=c['layer_dropout_probability'],
                activation=Activation.by_name("tanh")()),
            stateful=False)
    elif c['type'] == 'stacked_bidirectional_lstm':
        self.encoder = PytorchSeq2SeqWrapper(
            StackedBidirectionalLstm(
                num_layers=c['num_layers'],
                input_size=input_dim,
                hidden_size=c['hidden_dim'],
                recurrent_dropout_probability=c['recurrent_dropout_probability'],
                layer_dropout_probability=c['layer_dropout_probability']),
            stateful=False)
    else:
        self.encoder = DummyContextEncoder()
    encoder_dim = self.encoder.get_output_dim()

    c = conf['biaffine_parser']
    self.arc_representation_dim = arc_representation_dim = c['arc_representation_dim']
    self.tag_representation_dim = tag_representation_dim = c['tag_representation_dim']
    self.head_sentinel_ = torch.nn.Parameter(torch.randn([1, 1, encoder_dim]))
    self.head_arc_feedforward = FeedForward(encoder_dim, 1, arc_representation_dim,
                                            Activation.by_name("elu")())
    self.child_arc_feedforward = FeedForward(encoder_dim, 1, arc_representation_dim,
                                             Activation.by_name("elu")())
    self.head_tag_feedforward = FeedForward(encoder_dim, 1, tag_representation_dim,
                                            Activation.by_name("elu")())
    self.child_tag_feedforward = FeedForward(encoder_dim, 1, tag_representation_dim,
                                             Activation.by_name("elu")())

    arc_attention_version = c.get('arc_attention_version', 'v1')
    if arc_attention_version == 'v2':
        self.arc_attention = BilinearMatrixAttentionV2(arc_representation_dim,
                                                       arc_representation_dim,
                                                       use_input_biases=True)
    else:
        self.arc_attention = BilinearMatrixAttention(arc_representation_dim,
                                                     arc_representation_dim,
                                                     use_input_biases=True)

    self.tag_bilinear = BilinearWithBias(tag_representation_dim,
                                         tag_representation_dim,
                                         n_relations)

    self.input_dropout_ = torch.nn.Dropout2d(p=conf['dropout'])
    self.dropout_ = InputVariationalDropout(p=conf['dropout'])

    self.input_encoding_timer = TimeRecoder()
    self.context_encoding_timer = TimeRecoder()
    self.classification_timer = TimeRecoder()