def __init__(self,
             rnn_type: str = 'lstm',
             dec_hidden_size: int = 100,
             dec_input_size: int = 50,
             dropout: float = 0.1,
             fixed_dec_step: int = -1,
             max_dec_steps: int = 2,
             min_dec_steps: int = 2,
             schedule_ratio_from_ground_truth: float = 0.5,
             dec_avd_trigram_rep: bool = True,
             mult_orac_sample_one: bool = True,
             abs_board_file="/home/cc/exComp/board.txt",
             valid_tmp_path='/scratch/cluster/jcxu/exComp',
             serilization_name: str = ""):
    super().__init__()
    self.device = get_device()
    self._rnn_type = rnn_type
    self._dec_input_size = dec_input_size
    self._dec_hidden_size = dec_hidden_size
    self.fixed_dec_step = fixed_dec_step
    if fixed_dec_step == -1:
        self.min_dec_steps = min_dec_steps
        self.max_dec_steps = max_dec_steps
    else:
        self.min_dec_steps, self.max_dec_steps = fixed_dec_step, fixed_dec_step
    self.schedule_ratio_from_ground_truth = schedule_ratio_from_ground_truth
    self.mult_orac_sample_one_as_gt = mult_orac_sample_one
    self._dropout = nn.Dropout(dropout)
    self.rnn = self.build_rnn(
        self._rnn_type,
        self._dec_input_size,
        self._dec_hidden_size,
    )
    self.rnn_init_state_h = torch.nn.Linear(dec_hidden_size, dec_hidden_size)
    self.rnn_init_state_c = torch.nn.Linear(dec_hidden_size, dec_hidden_size)
    self.attn = NewAttention(enc_dim=dec_input_size, dec_dim=dec_hidden_size)
    self.CELoss = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='none')  # TODO
    self.rouge_metrics_sent = RougeStrEvaluation(name='sent',
                                                 path_to_valid=valid_tmp_path,
                                                 writting_address=valid_tmp_path,
                                                 serilization_name=serilization_name)
    self.dec_avd_trigram_rep = dec_avd_trigram_rep
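# Usage sketch (illustrative, not part of the training pipeline): the decoder
# is sized from the document encoder's output dim, as Seq2IdxSum does below.
# fixed_dec_step=3 forces exactly three extraction steps, while the default
# of -1 lets decoding run between min_dec_steps and max_dec_steps.
#
#   dec = SentRNNDecoder(rnn_type='lstm',
#                        dec_hidden_size=200, dec_input_size=200,
#                        fixed_dec_step=3)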
def __init__(self, inp_dim, hid_dim, dropout, nenc_lay=1, gather='sum'):
    super().__init__()
    self.hidden_dim = hid_dim
    self.enc_blstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(inp_dim, hid_dim, batch_first=True,
                      bidirectional=True, num_layers=nenc_lay))
    # self._span_encoder = select_gather(gather)
    self._span_encoder = GatherCNN(input_dim=self.enc_blstm.get_output_dim(),
                                   num_filters=5,
                                   output_dim=self.enc_blstm.get_output_dim())
    self._dropout = torch.nn.Dropout(p=dropout)
    self.device = get_device()
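# Shape note (informal): for an input of (batch, seq_len, inp_dim), the
# bidirectional LSTM emits (batch, seq_len, 2 * hid_dim), so
# self.enc_blstm.get_output_dim() == 2 * hid_dim; GatherCNN then pools each
# span of encoder states down to a single vector of that same width.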
def __init__(self,
             context_dim,
             dec_state_dim,
             enc_hid_dim,
             text_field_embedder,
             aggressive_compression: int = -1,
             keep_threshold: float = 0.5,
             abs_board_file="/home/cc/exComp/board.txt",
             gather='mean',
             dropout=0.5,
             dropout_emb=0.2,
             valid_tmp_path='/scratch/cluster/jcxu/exComp',
             serilization_name: str = "",
             vocab=None,
             elmo: bool = False,
             elmo_weight: str = "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"):
    super().__init__()
    self.use_elmo = elmo
    self.serilization_name = serilization_name
    if elmo:
        from allennlp.modules.elmo import Elmo, batch_to_ids
        from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
        self.vocab = vocab
        options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        weight_file = elmo_weight
        self.elmo = Elmo(options_file, weight_file, 1, dropout=dropout_emb)
        # print(self.elmo.get_output_dim())
        # self.word_emb_dim = text_field_embedder.get_output_dim()
        # self._context_layer = PytorchSeq2SeqWrapper(
        #     torch.nn.LSTM(self.word_emb_dim + self.elmo.get_output_dim(), self.word_emb_dim,
        #                   batch_first=True, bidirectional=True))
        self.word_emb_dim = self.elmo.get_output_dim()
    else:
        self._text_field_embedder = text_field_embedder
        self.word_emb_dim = text_field_embedder.get_output_dim()
    self.XEloss = torch.nn.CrossEntropyLoss(reduction='none')
    self.device = get_device()
    # self.rouge_metrics_compression = RougeStrEvaluation(name='cp', path_to_valid=valid_tmp_path,
    #                                                     writting_address=valid_tmp_path,
    #                                                     serilization_name=serilization_name)
    # self.rouge_metrics_compression_best_possible = RougeStrEvaluation(name='cp_ub', path_to_valid=valid_tmp_path,
    #                                                                   writting_address=valid_tmp_path,
    #                                                                   serilization_name=serilization_name)
    self.enc = EncCompression(inp_dim=self.word_emb_dim,
                              hid_dim=enc_hid_dim,
                              gather=gather)  # TODO dropout
    self.aggressive_compression = aggressive_compression
    self.relu = torch.nn.ReLU()
    self.attn = NewAttention(enc_dim=self.enc.get_output_dim(),
                             dec_dim=self.enc.get_output_dim_unit() * 2 + dec_state_dim)
    self.concat_size = self.enc.get_output_dim() + self.enc.get_output_dim_unit() * 2 + dec_state_dim
    self.valid_tmp_path = valid_tmp_path
    if self.aggressive_compression < 0:
        self.XELoss = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=-1)
        # self.nn_lin = torch.nn.Linear(self.concat_size, self.concat_size)
        # self.nn_lin2 = torch.nn.Linear(self.concat_size, 2)
        self.ff = FeedForward(input_dim=self.concat_size,
                              num_layers=3,
                              hidden_dims=[self.concat_size, self.concat_size, 2],
                              activations=[torch.nn.Tanh(), torch.nn.Tanh(), lambda x: x],
                              dropout=dropout)
        # Keep thresholds to sweep during validation.
        # self.keep_thres = list(np.arange(start=0.2, stop=0.6, step=0.075))
        self.keep_thres = [0.0, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55,
                           0.6, 0.65, 0.7, 0.75, 0.8, 1.0]
        self.rouge_metrics_compression_dict = OrderedDict()
        for thres in self.keep_thres:
            self.rouge_metrics_compression_dict["{}".format(thres)] = RougeStrEvaluation(
                name='cp_{}'.format(thres),
                path_to_valid=valid_tmp_path,
                writting_address=valid_tmp_path,
                serilization_name=serilization_name)
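# Scoring sketch (illustrative; the keep-logit index is an assumption): when
# aggressive_compression < 0, self.ff maps each concat_size-dim feature
# [span context; unit states; decoder state] to two logits (keep vs. delete).
# Validation sweeps every threshold in self.keep_thres against the keep
# probability, tracking one RougeStrEvaluation per threshold, e.g.:
#
#   probs = torch.softmax(self.ff(feats), dim=-1)   # (num_spans, 2)
#   kept = probs[:, 1] > thres                      # index 1 assumed "keep"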
def build_model(vocab,
                embed_dim: int = 100,
                hid_dim: int = 100,
                min_dec_step: int = 2,
                max_decoding_steps: int = 3,
                fix_edu_num: int = -1,
                use_elmo: bool = False,
                dropout=0.5,
                dropout_emb=0.2,
                span_encoder_type='self_attentive',
                attn_type='dot',
                schedule_ratio_from_ground_truth=0.7,
                pretrain_embedding=None,
                nenc_lay: int = 1,
                mult_orac_sampling: bool = True,
                compression: bool = True,
                word_token_indexers=None,
                alpha: float = 1.0,
                dbg: bool = False,
                dec_avd_trigram_rep: bool = True,
                aggressive_compression: int = -1,
                keep_threshold: float = 0.5,
                weight_alpha=0.0,
                bias_alpha=0.0,
                abs_board_file: str = "/home/cc/exComp/board.txt",
                compress_leadn=-1,
                gather='mean',
                abs_dir_root: str = "/scratch/cluster/jcxu",
                serilization_name="",
                load_save_model: str = None):
    model = Seq2IdxSum(
        vocab=vocab,
        word_embedding_dim=embed_dim,
        hidden_dim=hid_dim,
        min_dec_step=min_dec_step,
        max_decoding_steps=max_decoding_steps,
        fix_edu_num=fix_edu_num,
        use_elmo=use_elmo,
        span_encoder_type=span_encoder_type,
        dropout=dropout,
        dropout_emb=dropout_emb,
        attn_type=attn_type,
        schedule_ratio_from_ground_truth=schedule_ratio_from_ground_truth,
        pretrain_embedding_file=pretrain_embedding,
        nenc_lay=nenc_lay,
        mult_orac_sampling=mult_orac_sampling,
        word_token_indexers=word_token_indexers,
        compression=compression,
        alpha=alpha,
        dbg=dbg,
        dec_avd_trigram_rep=dec_avd_trigram_rep,
        aggressive_compression=aggressive_compression,
        keep_threshold=keep_threshold,
        regularizer=RegularizerApplicator([("weight", L2Regularizer(weight_alpha)),
                                           ("bias", L1Regularizer(bias_alpha))]),
        abs_board_file=abs_board_file,
        gather=gather,
        compress_leadn=compress_leadn,
        abs_dir_root=abs_dir_root,
        serilization_name=serilization_name)
    if load_save_model:
        # e.g. model.load_state_dict(torch.load("/path/to/model/weights.th"))
        model.load_state_dict(torch.load(load_save_model, map_location=get_device()))
    # model = torch.nn.DataParallel(model)
    device = get_device()
    model = model.to(device)
    return model
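# Typical call (illustrative; `vocab` and `indexers` come from the data
# pipeline, and the checkpoint path is hypothetical):
#
#   model = build_model(vocab, embed_dim=100, hid_dim=100,
#                       word_token_indexers=indexers,
#                       load_save_model="/path/to/model/weights.th")
#   model.eval()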
def __init__(self,
             vocab: Vocabulary,
             # text_field_embedder and encoder default to None so that
             # build_model (which never passes them) can construct the model;
             # an embedder is built from `vocab` below, and `encoder` is unused here.
             text_field_embedder: TextFieldEmbedder = None,
             encoder: Seq2SeqEncoder = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             word_embedding_dim: int = 200,
             hidden_dim: int = 200,
             dropout_emb: float = 0.5,
             min_dec_step: int = 2,
             max_decoding_steps=3,
             fix_edu_num=-1,
             dropout: float = 0.5,
             alpha: float = 0.5,
             span_encoder_type='self_attentive',
             use_elmo: bool = True,
             attn_type: str = 'general',
             schedule_ratio_from_ground_truth: float = 0.8,
             pretrain_embedding_file=None,
             nenc_lay: int = 2,
             mult_orac_sampling: bool = False,
             word_token_indexers=None,
             compression: bool = True,
             dbg: bool = False,
             dec_avd_trigram_rep: bool = True,
             aggressive_compression: int = -1,
             compress_leadn: int = -1,
             subsentence: bool = False,
             gather='mean',
             keep_threshold: float = 0.5,
             abs_board_file: str = "/home/cc/exComp/board.txt",
             abs_dir_root: str = "/scratch/cluster/jcxu",
             serilization_name: str = "") -> None:
    super(Seq2IdxSum, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    elmo_weight = os.path.join(
        abs_dir_root, "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
    # if not os.path.isfile(elmo_weight):
    #     import subprocess
    #     x = "wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5 -P {}".format(abs_dir_root)
    #     subprocess.run(x.split(" "))
    self.device = get_device()
    self.vocab = vocab
    self.dbg = dbg
    self.loss_thres = keep_threshold
    self.compression = compression
    self.comp_leadn = compress_leadn

    # Encode the whole document without looking at compression options.
    self.enc_doc = EncDoc(inp_dim=word_embedding_dim, hid_dim=hidden_dim,
                          vocab=vocab, dropout=dropout, dropout_emb=dropout_emb,
                          pretrain_embedding_file=pretrain_embedding_file,
                          gather=gather)
    self.sent_dec = SentRNNDecoder(rnn_type='lstm',
                                   dec_hidden_size=self.enc_doc.get_output_dim(),
                                   dec_input_size=self.enc_doc.get_output_dim(),
                                   dropout=dropout,
                                   fixed_dec_step=fix_edu_num,
                                   max_dec_steps=max_decoding_steps,
                                   min_dec_steps=min_dec_step,
                                   schedule_ratio_from_ground_truth=schedule_ratio_from_ground_truth,
                                   dec_avd_trigram_rep=dec_avd_trigram_rep,
                                   mult_orac_sample_one=mult_orac_sampling,
                                   abs_board_file=abs_board_file,
                                   valid_tmp_path=abs_dir_root,
                                   serilization_name=serilization_name)
    if compression:
        self.compression_dec = CompressDecoder(context_dim=hidden_dim * 2,
                                               dec_state_dim=hidden_dim * 2,
                                               enc_hid_dim=hidden_dim,
                                               text_field_embedder=self.enc_doc._text_field_embedder,
                                               aggressive_compression=aggressive_compression,
                                               keep_threshold=keep_threshold,
                                               abs_board_file=abs_board_file,
                                               gather=gather,
                                               dropout=dropout,
                                               dropout_emb=dropout_emb,
                                               valid_tmp_path=abs_dir_root,
                                               serilization_name=serilization_name,
                                               vocab=vocab,
                                               elmo=use_elmo,
                                               elmo_weight=elmo_weight)
    self.aggressive_compression = aggressive_compression
    self.use_elmo = use_elmo
    if use_elmo:
        options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
        # print(self.elmo.get_output_dim())
        self._context_layer = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(word_embedding_dim + self.elmo.get_output_dim(),
                          hidden_dim, batch_first=True, bidirectional=True))
    else:
        self._context_layer = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(word_embedding_dim, hidden_dim,
                          batch_first=True, bidirectional=True))

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=word_embedding_dim)
    if pretrain_embedding_file is not None:
        logger = logging.getLogger()
        logger.info("Loading word embedding: {}".format(pretrain_embedding_file))
        # Embedding.from_params is a classmethod that returns a new module, so
        # rebind token_embedding; the original call discarded the result and
        # the pretrained weights were never used.
        token_embedding = Embedding.from_params(
            vocab=vocab,
            params=Params({"pretrained_file": pretrain_embedding_file,
                           "embedding_dim": word_embedding_dim}))
    self._text_field_embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    # if span_encoder_type == 'self_attentive':
    #     self._span_encoder = SelfAttentiveSpanExtractor(
    #         self._context_layer.get_output_dim())
    # else:
    #     raise NotImplementedError

    self._dropout = torch.nn.Dropout(p=dropout)
    self._max_decoding_steps = max_decoding_steps
    self._fix_edu_num = fix_edu_num
    if compression:
        pass
        # self.rouge_metrics_compression = self.compression_dec.rouge_metrics_compression
        # self.rouge_metrics_compression_upper_bound = self.compression_dec.rouge_metrics_compression_best_possible
    self.rouge_metrics_sent = self.sent_dec.rouge_metrics_sent
    self.mult_orac_sampling = mult_orac_sampling
    self.alpha = alpha
    initializer(self)
    if regularizer is not None:
        regularizer(self)
    self.counter = 0  # used for controlling compression and extraction
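# Loss mixing note (an assumption from the signature, not verified against the
# forward pass): `alpha` presumably weights the sentence-extraction loss
# against the compression loss, e.g.
#
#   loss = alpha * extraction_loss + (1 - alpha) * compression_loss
#
# while `self.counter` alternates which sub-task drives a given update.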