def __init__(self, words, args):
    super(Model, self).__init__()
    self.args = args
    # Embedding size: use args.n_e when given; otherwise cap it at the
    # smaller of the vocabulary size and the hidden size.
    if args.n_e:
        self.n_e = args.n_e
    else:
        self.n_e = len(words) if len(words) < args.n_d else args.n_d
    self.n_d = args.n_d
    self.depth = args.depth
    self.drop = nn.Dropout(args.dropout)
    self.embedding_layer = nn.Embedding(len(words), self.n_e)
    self.n_V = len(words)
    # First SRU layer maps embeddings into the hidden dimension; the
    # remaining layers use factorized ProjectedLinear modules so their
    # weights can be pruned with FLOP.
    custom_m_list = [CustomLinear(self.n_e, self.n_d * 4, bias=False)]
    for i in range(self.depth - 1):
        custom_m_list.append(
            flop.ProjectedLinear(
                self.n_d,
                self.n_d * 3,
                proj_features=args.n_proj,
                bias=False,
            )
        )
    self.rnn = sru.SRU(
        self.n_e,
        self.n_d,
        self.depth,
        dropout=args.dropout,
        highway_bias=args.bias,
        layer_norm=args.layer_norm,
        rescale=args.rescale,
        custom_m=custom_m_list,
    )
    self.output_layer = nn.Linear(self.n_d, self.n_V)
    self.init_weights()
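# Usage sketch (an assumption, not from the original script): constructing
# the character-level model from an argparse-style namespace. The attribute
# names mirror the ones read in the constructor above; the values are
# illustrative only, and this assumes `Model` is the enclosing nn.Module
# class of that __init__.
from types import SimpleNamespace

char_args = SimpleNamespace(
    n_e=0,            # 0 falls through to the vocabulary/hidden-size cap
    n_d=1024,
    depth=6,
    dropout=0.1,
    n_proj=128,
    bias=-3.0,
    layer_norm=False,
    rescale=True,
)
char_vocab = list("abcdefgh")  # stand-in vocabulary; real scripts build this from data
# model = Model(char_vocab, char_args)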
def __init__(self, args):
    super(Model, self).__init__()
    self.args = args
    # Cutoffs for the adaptive embedding/softmax vocabulary partitions.
    # self.cutoffs = [20000, 60000]
    self.cutoffs = [10000, 20000, 40000, 60000, 100000]
    self.n_V = args.n_token
    self.n_e = args.n_e or args.n_proj
    self.n_d = args.n_d
    self.depth = args.depth
    self.drop = nn.Dropout(args.dropout)
    self.embedding_layer = AdaptiveEmbedding(
        self.n_V,
        self.n_e,
        self.n_d,
        self.cutoffs,
        div_val=args.div_val,
        div_freq=2,
        dropout=args.dropout_e,
    )
    self.rnn = sru.SRU(
        self.n_d,
        self.n_d,
        self.depth,
        projection_size=args.n_proj,
        dropout=args.dropout,
        highway_bias=args.bias,
        layer_norm=args.layer_norm,
        rescale=args.rescale,
        custom_m=flop.ProjectedLinear(
            self.n_d,
            self.n_d * 3,
            proj_features=args.n_proj,
            bias=False,
        ),
    )
    self.output_layer = AdaptiveLogSoftmax(
        self.n_V,
        self.n_e,
        self.n_d,
        self.cutoffs,
        div_val=args.div_val,
        div_freq=2,
        dropout=args.dropout_e,
        keep_order=False,
    )
    self.init_weights()
    # Tie input embedding and output softmax weights unless disabled.
    if not args.not_tie:
        self.tie_weights()
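# Usage sketch (an assumption): constructing the word-level model. The
# vocabulary size of 267,735 matches WikiText-103 but is illustrative here;
# all other values are placeholders chosen to satisfy the attributes read
# in the constructor above.
word_args = SimpleNamespace(
    n_token=267735,
    n_e=0,            # falls back to n_proj via `args.n_e or args.n_proj`
    n_proj=512,
    n_d=2048,
    depth=6,
    dropout=0.1,
    dropout_e=0.1,
    div_val=4,
    bias=-3.0,
    layer_norm=False,
    rescale=True,
    not_tie=False,    # tie embedding and softmax weights
)
# model = Model(word_args)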