Example #1
 def _get_embtype(self, emb_type):
     # set up preinitialized embeddings
     try:
         import torchtext.vocab as vocab
     except ImportError as ex:
         print('Please install torch text with `pip install torchtext`')
         raise ex
     pretrained_dim = 300
     if emb_type.startswith('glove'):
         if 'twitter' in emb_type:
             init = 'glove-twitter'
             name = 'twitter.27B'
             pretrained_dim = 200
         else:
             init = 'glove'
             name = '840B'
         embs = vocab.GloVe(name=name,
                            dim=pretrained_dim,
                            cache=modelzoo_path(self.opt.get('datapath'),
                                                'models:glove_vectors'))
     elif emb_type.startswith('fasttext'):
         init = 'fasttext'
         embs = vocab.FastText(language='en',
                               cache=modelzoo_path(
                                   self.opt.get('datapath'),
                                   'models:fasttext_vectors'))
     else:
         raise RuntimeError('embedding type {} not implemented. check arg, '
                            'submit PR to this function, or override it.'
                            ''.format(emb_type))
     return embs, init
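
The (embs, init) pair returned above is typically consumed by copying pretrained rows into a model's embedding matrix. A minimal, self-contained sketch of that step (the toy tok2ind dictionary and the small 'simple'-language fastText vectors are illustrative assumptions, not part of the example):

import torch
import torch.nn as nn
import torchtext.vocab as vocab

tok2ind = {'__null__': 0, 'hello': 1, 'world': 2}   # hypothetical toy dictionary
embs = vocab.FastText(language='simple')            # small pretrained vectors for the demo
emb_layer = nn.Embedding(len(tok2ind), embs.dim, padding_idx=0)

with torch.no_grad():
    for w, i in tok2ind.items():
        if w in embs.stoi:                          # copy only tokens the vectors know
            emb_layer.weight[i] = embs.vectors[embs.stoi[w]]
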
Example #2
 def _get_embtype(self, emb_type):
     # set up preinitialized embeddings
     try:
         import torchtext.vocab as vocab
     except ImportError as ex:
         print('Please install torch text with `pip install torchtext`')
         raise ex
     pretrained_dim = 300
     if emb_type.startswith('glove'):
         if 'twitter' in emb_type:
             init = 'glove-twitter'
             name = 'twitter.27B'
             pretrained_dim = 200
         else:
             init = 'glove'
             name = '840B'
         embs = vocab.GloVe(name=name,
                            dim=pretrained_dim,
                            cache=modelzoo_path(self.opt.get('datapath'),
                                                'models:glove_vectors'))
     elif emb_type.startswith('fasttext_cc'):
         init = 'fasttext_cc'
         embs = vocab.FastText(language='en',
                               cache=modelzoo_path(
                                   self.opt.get('datapath'),
                                   'models:fasttext_cc_vectors'))
     elif emb_type.startswith('fasttext'):
         init = 'fasttext'
         embs = vocab.FastText(language='en',
                               cache=modelzoo_path(
                                   self.opt.get('datapath'),
                                   'models:fasttext_vectors'))
     else:
         # emb_type does not match any embedding type listed above,
         # so assume it is a file path to an embedding file;
         # if it is not, raise an error
         assert os.path.isfile(emb_type), \
             'emb_type: {} does not match any known embedding type '.format(emb_type) + \
             'and is not a path to an embedding file!'
         init = os.path.basename(emb_type)
         cache = '.vector_cache'
         if not os.path.exists(cache):
             os.makedirs(cache)
         embs = vocab.Vectors(emb_type, cache=cache)
     return embs, init
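
The else-branch above treats emb_type as a path to a local vector file. A small sketch of what torchtext's vocab.Vectors expects in that case, using a throwaway toy_vectors.txt written purely for illustration (one "token v1 v2 ... vd" entry per line):

import os
import torchtext.vocab as vocab

os.makedirs('.vector_cache', exist_ok=True)
with open('toy_vectors.txt', 'w') as f:
    f.write('hello 0.1 0.2 0.3\n')
    f.write('world 0.4 0.5 0.6\n')

embs = vocab.Vectors('toy_vectors.txt', cache='.vector_cache')
print(embs.dim, embs.vectors[embs.stoi['world']])   # 3  tensor([0.4000, 0.5000, 0.6000])
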
    def get_embedding(self, name, embedding_dim):
        if name == 'glove':
            pretrained_type = vocab.GloVe(name='42B', dim=embedding_dim)
        elif name == 'fasttext':
            if embedding_dim != 300:
                raise ValueError("Got embedding dim {}, expected size 300".format(embedding_dim))
            pretrained_type = vocab.FastText('en')
        else:
            raise ValueError("Unknown embedding name: {}, expected 'glove' or 'fasttext'".format(name))

        embedding_len = len(self)
        weights = np.zeros((embedding_len, embedding_dim))
        words_found = 0

        for word, index in self.word2idx.items():
            try:
                # torchtext's Vectors.__getitem__ returns a zero vector for unknown
                # words, so index stoi directly to raise KeyError for OOV words instead
                weights[index] = pretrained_type.vectors[pretrained_type.stoi[word]]
                words_found += 1
            except KeyError:
                if index == 0:
                    continue
                weights[index] = np.random.normal(scale=0.6, size=(embedding_dim))

        print(embedding_len - words_found, "words missing from pretrained")
        return torch.from_numpy(weights).float()
Example #4
    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init

        # all instances may need some params
        self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
        self.history = {}
        self.states = {}

        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()

        if shared:
            # set up shared properties
            self.dict = shared['dict']
            self.START_IDX = shared['START_IDX']
            self.END_IDX = shared['END_IDX']
            self.NULL_IDX = shared['NULL_IDX']
            # answers contains a batch_size list of the last answer produced
            self.answers = shared['answers']

            if 'model' in shared:
                # model is shared during hogwild
                self.model = shared['model']
                self.states = shared['states']
        else:
            # this is not a shared instance of this class, so do full init
            # answers contains a batch_size list of the last answer produced
            self.answers = [None] * opt['batchsize']

            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, self.states = self.load(opt['model_file'])
                # override model-specific options with stored ones
                opt = self.override_opt(new_opt)

            if opt['dict_file'] is None and opt.get('model_file'):
                # set default dict-file if not set
                opt['dict_file'] = opt['model_file'] + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'Seq2Seq'
            # we use START markers to start our output
            self.START_IDX = self.dict[self.dict.start_token]
            # we use END markers to end our output
            self.END_IDX = self.dict[self.dict.end_token]
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict[self.dict.null_token]

            self.model = Seq2seq(opt,
                                 len(self.dict),
                                 padding_idx=self.NULL_IDX,
                                 start_idx=self.START_IDX,
                                 end_idx=self.END_IDX,
                                 longest_label=self.states.get(
                                     'longest_label', 1))

            if opt['embedding_type'] != 'random':
                # set up preinitialized embeddings
                try:
                    import torchtext.vocab as vocab
                except ModuleNotFoundError as ex:
                    print(
                        'Please install torch text with `pip install torchtext`'
                    )
                    raise ex
                if opt['embedding_type'].startswith('glove'):
                    init = 'glove'
                    embs = vocab.GloVe(name='840B', dim=300)
                elif opt['embedding_type'].startswith('fasttext'):
                    init = 'fasttext'
                    embs = vocab.FastText(language='en')
                else:
                    raise RuntimeError('embedding type not implemented')

                if opt['embeddingsize'] != 300:
                    rp = torch.Tensor(300, opt['embeddingsize']).normal_()
                    t = lambda x: torch.mm(x.unsqueeze(0), rp)
                else:
                    t = lambda x: x
                cnt = 0
                for w, i in self.dict.tok2ind.items():
                    if w in embs.stoi:
                        vec = t(embs.vectors[embs.stoi[w]])
                        self.model.decoder.lt.weight.data[i] = vec
                        cnt += 1
                        if opt['lookuptable'] in ['unique', 'dec_out']:
                            # also set encoder lt, since it's not shared
                            self.model.encoder.lt.weight.data[i] = vec
                print('Seq2seq: initialized embeddings for {} tokens from {}.'
                      ''.format(cnt, init))

            if self.states:
                # set loaded states if applicable
                self.model.load_state_dict(self.states['model'])

            if self.use_cuda:
                self.model.cuda()

        if hasattr(self, 'model'):
            # if model was built, do more setup
            self.clip = opt.get('gradient_clip', 0.2)
            self.rank = opt['rank_candidates']

            # set up tensors once
            self.xs = torch.LongTensor(1, 1)
            self.ys = torch.LongTensor(1, 1)
            if self.rank:
                self.cands = torch.LongTensor(1, 1, 1)

            # set up criteria
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)

            if self.use_cuda:
                # push to cuda
                # `async` became a reserved word in Python 3.7; the kwarg is now non_blocking
                self.xs = self.xs.cuda(non_blocking=True)
                self.ys = self.ys.cuda(non_blocking=True)
                if self.rank:
                    self.cands = self.cands.cuda(non_blocking=True)
                self.criterion.cuda()

            # set up optimizer
            lr = opt['learningrate']
            optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']]
            kwargs = {'lr': lr}
            if opt['optimizer'] == 'sgd':
                kwargs['momentum'] = 0.95
                kwargs['nesterov'] = True

            if opt['embedding_type'].endswith('fixed'):
                print('Seq2seq: fixing embedding weights.')
                self.model.decoder.lt.weight.requires_grad = False
                self.model.encoder.lt.weight.requires_grad = False
                if opt['lookuptable'] in ['dec_out', 'all']:
                    self.model.decoder.e2s.weight.requires_grad = False
            self.optimizer = optim_class(
                [p for p in self.model.parameters() if p.requires_grad],
                **kwargs)
            if self.states:
                if self.states['optimizer_type'] != opt['optimizer']:
                    print('WARNING: not loading optim state since optim class '
                          'changed.')
                else:
                    self.optimizer.load_state_dict(self.states['optimizer'])

        self.reset()
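
When the model's embedding size differs from the pretrained dimension, the example above pushes each 300-d pretrained vector through a fixed random projection. A tiny self-contained sketch of that trick (the target size of 128 is a made-up value):

import torch

embeddingsize = 128                         # hypothetical model embedding size
rp = torch.Tensor(300, embeddingsize).normal_()
pretrained_vec = torch.randn(300)           # stands in for embs.vectors[embs.stoi[w]]
projected = torch.mm(pretrained_vec.unsqueeze(0), rp)
print(projected.shape)                      # torch.Size([1, 128])
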
Example #5
    def __init__(self, opt, shared=None):
        """Set up model."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init
        #self.opt = opt
        # all instances may need some params
        opt['label_smoothing'] = False
        opt['src_tgt_weight_share'] = False
        opt['tgt_prj_weight_share'] = False
        self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
        self.metrics = {
            'loss': 0.0,
            'num_tokens': 0,
            'correct_tokens': 0,
            'total_skipped_batches': 0
        }
        self.history = {}
        self.report_freq = opt.get('report_freq', 0.001)
        self.use_person_tokens = opt.get('person_tokens', False)
        self.batch_idx = shared and shared.get('batchindex') or 0
        self.rank = opt['rank_candidates']
        self.beam_size = opt.get('beam_size', 1)
        self.topk = opt.get('topk', 1)
        states = {}

        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
        if opt.get('numthreads', 1) > 1:
            torch.set_num_threads(1)

        if shared:
            # set up shared properties
            self.opt = shared['opt']
            opt = self.opt
            self.dict = shared['dict']
            self.START_IDX = shared['START_IDX']
            self.END_IDX = shared['END_IDX']
            self.NULL_IDX = shared['NULL_IDX']
            # answers contains a batch_size list of the last answer produced
            self.answers = shared['answers']
            self.model = shared['model']
            self.metrics = shared['metrics']
            states = shared.get('states', {})

        else:
            # this is not a shared instance of this class, so do full init
            # answers contains a batch_size list of the last answer produced
            self.answers = [None] * opt['batchsize']

            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            init_model = None
            # check first for 'init_model' for loading model from file
            if opt.get('init_model') and os.path.isfile(opt['init_model']):
                init_model = opt['init_model']
            # next check for 'model_file', this would override init_model
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                init_model = opt['model_file']

            if init_model is not None:
                # load model parameters if available
                print('[ Loading existing model params from {} ]'.format(
                    init_model))
                states = self.load(init_model)

                if os.path.isfile(init_model +
                                  '.dict') or opt['dict_file'] is None:
                    opt['dict_file'] = init_model + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'Transformer'
            # we use START markers to start our output
            self.START_IDX = self.dict[self.dict.start_token]
            # we use END markers to end our output
            self.END_IDX = self.dict[self.dict.end_token]
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict[self.dict.null_token]

            if not hasattr(self, 'model_class'):
                # this allows child classes to override this but inherit init
                self.model_class = Transformer
            # self.model = self.model_class(
            #     opt, len(self.dict), padding_idx=self.NULL_IDX,
            #     start_idx=self.START_IDX, end_idx=self.END_IDX,
            #     longest_label=states.get('longest_label', 1))
            self.model = self.model_class(len(self.dict), opt)

            if opt.get('dict_tokenizer'
                       ) == 'bpe' and opt['embedding_type'] != 'random':
                print('skipping preinitialization of embeddings for bpe')
            elif not states and opt['embedding_type'] != 'random':
                # set up preinitialized embeddings
                try:
                    import torchtext.vocab as vocab
                except ImportError as ex:
                    print(
                        'Please install torch text with `pip install torchtext`'
                    )
                    raise ex
                pretrained_dim = 300
                if opt['embedding_type'].startswith('glove'):
                    if 'twitter' in opt['embedding_type']:
                        init = 'glove-twitter'
                        name = 'twitter.27B'
                        pretrained_dim = 200
                    else:
                        init = 'glove'
                        name = '840B'
                    embs = vocab.GloVe(name=name,
                                       dim=pretrained_dim,
                                       cache=modelzoo_path(
                                           self.opt.get('datapath'),
                                           'models:glove_vectors'))
                elif opt['embedding_type'].startswith('fasttext'):
                    init = 'fasttext'
                    embs = vocab.FastText(language='en',
                                          cache=modelzoo_path(
                                              self.opt.get('datapath'),
                                              'models:fasttext_vectors'))
                else:
                    raise RuntimeError('embedding type not implemented')

                if opt['embeddingsize'] != pretrained_dim:
                    rp = torch.Tensor(pretrained_dim,
                                      opt['embeddingsize']).normal_()
                    t = lambda x: torch.mm(x.unsqueeze(0), rp)
                else:
                    t = lambda x: x
                cnt = 0
                for w, i in self.dict.tok2ind.items():
                    if w in embs.stoi:
                        vec = t(embs.vectors[embs.stoi[w]])
                        self.model.decoder.tgt_word_emb.weight.data[i] = vec
                        cnt += 1
                        if opt['lookuptable'] in ['unique', 'dec_out']:
                            # also set encoder lt, since it's not shared
                            self.model.encoder.src_word_emb.weight.data[
                                i] = vec
                print(
                    'Transformer: initialized embeddings for {} tokens from {}.'
                    ''.format(cnt, init))

            if states:
                # set loaded states if applicable
                self.model.load_state_dict(states['model'])

            if self.use_cuda:
                self.model.cuda()

        # set up criteria
        if opt.get('numsoftmax', 1) > 1:
            self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX,
                                        size_average=False)
        else:
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX,
                                                 size_average=False)

        if self.use_cuda:
            self.criterion.cuda()

        if 'train' in opt.get('datatype', ''):
            # we only set up optimizers when training
            # we only set this up for the original instance or hogwild ones
            self.clip = opt.get('gradient_clip', -1)

            # set up optimizer
            lr = opt['learningrate']
            optim_class = TransformerAgent.OPTIM_OPTS[opt['optimizer']]
            kwargs = {'lr': lr}
            if opt.get('momentum') > 0 and opt['optimizer'] in [
                    'sgd', 'rmsprop'
            ]:
                kwargs['momentum'] = opt['momentum']
                if opt['optimizer'] == 'sgd':
                    kwargs['nesterov'] = True
            if opt['optimizer'] == 'adam':
                # https://openreview.net/forum?id=ryQu7f-RZ
                kwargs['amsgrad'] = True

            if opt['embedding_type'].endswith('fixed'):
                print('Transformer: fixing embedding weights.')
                self.model.decoder.tgt_word_emb.weight.requires_grad = False
                self.model.encoder.src_word_emb.weight.requires_grad = False
                if opt['lookuptable'] in ['dec_out', 'all']:
                    # self.model.decoder.e2s.weight.requires_grad = False
                    self.model.tgt_word_prj.weight.requires_grad = False
            self.optimizer = optim_class(
                [p for p in self.model.parameters() if p.requires_grad],
                **kwargs)
            if states.get('optimizer'):
                if states['optimizer_type'] != opt['optimizer']:
                    print('WARNING: not loading optim state since optim class '
                          'changed.')
                else:
                    try:
                        self.optimizer.load_state_dict(states['optimizer'])
                    except ValueError:
                        print('WARNING: not loading optim state since model '
                              'params changed.')
                    if self.use_cuda:
                        for state in self.optimizer.state.values():
                            for k, v in state.items():
                                if isinstance(v, torch.Tensor):
                                    state[k] = v.cuda()
            self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, 'min', factor=0.5, patience=3, verbose=True)

        self.reset()
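
The ReduceLROnPlateau scheduler created above is never stepped inside this excerpt; it is normally driven with a validation metric once per epoch. A minimal, self-contained sketch with placeholder names (the toy model and constant loss are assumptions for illustration):

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 2)                     # toy stand-in for the real model
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                 factor=0.5, patience=3)
for epoch in range(10):
    val_loss = 1.0                          # stand-in for a real validation loss
    scheduler.step(val_loss)                # lr is halved after `patience` flat epochs
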
Example #6
    if not args.no_cuda:
        zero, diag_margin = zero.cuda(), diag_margin.cuda()
    zero, diag_margin = Variable(zero), Variable(diag_margin)

    x = x / torch.norm(x, 2, 1, keepdim=True)
    v = v / torch.norm(v, 2, 1, keepdim=True)
    prod = torch.matmul(x, v.transpose(0, 1))
    diag = torch.diag(prod)
    for_x = torch.max(zero, margin - torch.unsqueeze(diag, 1) + prod) - diag_margin
    for_v = torch.max(zero, margin - torch.unsqueeze(diag, 0) + prod) - diag_margin
    return (torch.sum(for_x) + torch.sum(for_v)) / x.size(0)


if __name__ == '__main__':
    print('Loading a pretrained fastText model...')
    word_embedding = vocab.FastText(language="en")
    #word_embedding =fasttext.load_model(args.fasttext_model)

    print('Loading a dataset...')
    train_data = ReedICML2016(args.img_root,
                              args.caption_root,
                              args.trainclasses_file,
                              word_embedding,
                              args.max_nwords,
                              transforms.Compose([
                                  transforms.Scale(256),
                                  transforms.RandomCrop(224),
                                  transforms.RandomHorizontalFlip(),
                                  transforms.ToTensor(),
                                  transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                       std=[0.229, 0.224, 0.225])
Example #7
def download(datapath):
    embs = vocab.FastText(language='en',
                          cache=datapath + '/models/fasttext_vectors')
    def __init__(self, opt, shared=None):
        """Set up model."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init

        # all instances may need some params
        self.encode_max_seq_len = opt[
            'encode_max_seq_len'] if opt['encode_max_seq_len'] > 0 else None
        self.decode_max_seq_len = opt[
            'decode_max_seq_len'] if opt['decode_max_seq_len'] > 0 else None

        self.metrics = {
            'loss': 0.0,
            'num_tokens': 0,
            'correct_tokens': 0,
            'total_skipped_batches': 0,
            'correct_pred': 0,
            'pred_count': 0
        }

        self.history = {}
        # batch share the same persona information
        self.use_person_tokens = opt.get('use_persona_token', False)
        self.use_talk_tokens = opt.get('use_talk_token', False)
        self.use_history_reply = opt.get('history_replies', 'label_else_model')
        self.add_default_persona = opt.get('add_default_persona', True)
        self.persona_append_strategy = opt.get('persona_append_strategy',
                                               'concat')
        self.history_append_strategy = opt.get('history_append_strategy', -1)

        self.report_freq = opt.get('report_freq', 0.001)
        self.batch_idx = shared and shared.get('batchindex') or 0
        self.rank = opt['rank_candidates']
        self.beam_size = opt.get('beam_size', 1)
        self.topk = opt.get('topk', 1)
        states = {}

        # if gpt2
        if 'gpt' in ARCH_CHOICE:
            num_optim_steps = opt['train_size'] * opt[
                'num_train_epochs'] // opt['batchsize']
            # override optimizer_step
            opt['optimizer_step'] = num_optim_steps

        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()

        if shared:
            # set up shared properties
            self.opt = shared['opt']
            opt = self.opt
            self.dict = shared['dict']
            self.START_IDX = shared['START_IDX']
            self.END_IDX = shared['END_IDX']
            self.NULL_IDX = shared['NULL_IDX']
            # answers contains a batchsize list of the last answer produced
            self.answers = shared['answers']
            self.model = shared['model']
            self.metrics = shared['metrics']
            self.receiver = shared['receiver']
            self.receiver_dict = shared['receiver_dict']
            states = shared.get('states', {})
        else:
            # this is not a shared instance of this class, so do full init
            # answers contains a batchsize list of the last answer produced
            self.answers = [None] * opt['batchsize']

            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            init_model = None
            # check first for 'init_model' for loading model from file
            if opt.get('init_model') and os.path.isfile(opt['init_model']):
                init_model = opt['init_model']
            # next check for 'model_file', this would override init_model
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                init_model = opt['model_file']

            if init_model is not None:
                # load model parameters if available
                print('[ Loading existing model params from {} ]'.format(
                    init_model))
                states = self.load(init_model)

                if os.path.isfile(init_model +
                                  '.dict') or opt['dict_file'] is None:
                    opt['dict_file'] = init_model + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = self.dictionary_class()(opt)
            self.id = 'Transformer'
            # we use START markers to start our output
            self.START_IDX = self.dict[self.dict.start_token]
            # we use END markers to end our output
            self.END_IDX = self.dict[self.dict.end_token]
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict[self.dict.null_token]

            # get vocab size
            vocab_size = len(self.dict.tok2ind.items())

            if ARCH_CHOICE == 'lstm':
                self.model = Seq2seqModel(opt=opt,
                                          num_features=len(self.dict),
                                          padding_idx=self.NULL_IDX,
                                          start_idx=self.START_IDX,
                                          end_idx=self.END_IDX,
                                          longest_label=states.get(
                                              'longest_label', 1))
            elif ARCH_CHOICE == 'gpt':
                assert isinstance(self.dict, GPTDictionaryAgent)
                self.model = Gpt2SeqModel(
                    opt=opt,
                    vocab_size=len(self.dict),
                    pad_idx=self.NULL_IDX,
                    start_idx=self.START_IDX,
                    end_idx=self.END_IDX,
                    dict=self.dict,
                    special_token_len=len(self.dict.special_tokens),
                    longest_label=states.get('longest_label', 1))

            if opt.get('display_model', False):
                print_model(self.model)

            if opt.get('dict_tokenizer'
                       ) == 'bpe' and opt['embedding_type'] != 'random':
                print('skipping preinitialization of embeddings for bpe')

            elif not states and opt[
                    'embedding_type'] != 'random' and ARCH_CHOICE == 'lstm':
                # set up preinitialized embeddings
                try:
                    import torchtext.vocab as vocab
                except ImportError as ex:
                    print(
                        'Please install torch text with `pip install torchtext`'
                    )
                    raise ex
                pretrained_dim = 300
                if opt['embedding_type'].startswith('glove'):
                    if 'twitter' in opt['embedding_type']:
                        init = 'glove-twitter'
                        name = 'twitter.27B'
                        pretrained_dim = 200
                    else:
                        init = 'glove'
                        name = '840B'
                    embs = vocab.GloVe(name=name,
                                       dim=pretrained_dim,
                                       cache=modelzoo_path(
                                           self.opt.get('datapath'),
                                           'models:glove_vectors'))
                elif opt['embedding_type'].startswith('fasttext'):
                    init = 'fasttext'
                    embs = vocab.FastText(language='en',
                                          cache=modelzoo_path(
                                              self.opt.get('datapath'),
                                              'models:fasttext_vectors'))
                else:
                    raise RuntimeError('embedding type not implemented')

                if opt['encoder_embed_dim'] != pretrained_dim:
                    rp = torch.Tensor(pretrained_dim,
                                      opt['encoder_embed_dim']).normal_()
                    t = lambda x: torch.mm(x.unsqueeze(0), rp)
                else:
                    t = lambda x: x
                cnt = 0
                for w, i in self.dict.tok2ind.items():
                    if w in embs.stoi:
                        vec = t(embs.vectors[embs.stoi[w]])
                        self.model.decoder.tgt_word_emb.weight.data[i] = vec
                        cnt += 1
                        if opt['lookuptable'] in ['unique', 'dec_out']:
                            # also set encoder lt, since it's not shared
                            self.model.encoder.src_word_emb.weight.data[
                                i] = vec
                print('Seq2seq: initialized embeddings for {} tokens from {}.'
                      ''.format(cnt, init))

            if states:
                # set loaded states if applicable
                self.model.load_state_dict(states['model'])

            if self.use_cuda:
                self.model.cuda()

            # if select persona
            if opt['select_persona']:
                self.receiver, self.receiver_dict = self.load_receiver(
                    opt['receiver_model'])
                self.receiver.eval()
                # move to cuda
                self.receiver.cuda()
            else:
                self.receiver = None
                self.receiver_dict = None

        vocab_size = len(self.dict.tok2ind.items())

        if opt['smoothing'] > 0.0:
            self.criterion = LabelSmoothingLoss(
                vocabulary_size=40516,
                label_smoothing=opt['smoothing'],
                pad_index=self.NULL_IDX)
        else:
            self.criterion = TokenCrossEntropyLoss(pad_index=self.NULL_IDX)

        self.class_criter = nn.CrossEntropyLoss()
        self.eval_criterion = TokenCrossEntropyLoss(pad_index=self.NULL_IDX)
        # whether shuffle persona
        self.shuffle_persona = opt['shuffle_persona']

        if self.use_cuda:
            self.criterion.cuda()

        if 'train' in opt.get('datatype', ''):
            # we only set up optimizers when training
            # we only set this up for the original instance or hogwild ones
            self.clip = opt.get('gradient_clip', -1)

            # set up optimizer
            lr = opt['lr']
            optim_class = TransformerAgent.OPTIM_OPTS[opt['optimizer']]
            if ARCH_CHOICE == 'lstm':
                kwargs = {'lr': lr}
                if opt.get('momentum') > 0 and opt['optimizer'] in [
                        'sgd', 'rmsprop'
                ]:
                    kwargs['momentum'] = opt['momentum']
                    if opt['optimizer'] == 'sgd':
                        kwargs['nesterov'] = True
                if opt['optimizer'] == 'adam':
                    kwargs['amsgrad'] = True

                if opt['embedding_type'].endswith('fixed'):
                    print('Transformer: fixing embedding weights.')
                    self.model.decoder.tgt_word_emb.weight.requires_grad = False
                    self.model.encoder.src_word_emb.weight.requires_grad = False

                    if opt['lookuptable'] in ['dec_out', 'all']:
                        self.model.decoder.e2s.weight.requires_grad = False
                self.optimizer = optim_class(
                    [p for p in self.model.parameters() if p.requires_grad],
                    **kwargs)
            elif ARCH_CHOICE == 'gpt':
                self.optimizer = GPTOptimizer(self.model, opt)

            if states.get('optimizer'):
                if states['optimizer_type'] != opt['optimizer']:
                    print('WARNING: not loading optim state since optim class '
                          'changed.')
                else:
                    try:
                        self.optimizer.load_state_dict(states['optimizer'])
                    except ValueError:
                        print('WARNING: not loading optim state since model '
                              'params changed.')
                    # if self.use_cuda:
                    #     for state in self.optimizer.state.values():
                    #         for k, v in state.items():
                    #             if isinstance(v, torch.Tensor):
                    #                 state[k] = v.cuda()
            if ARCH_CHOICE == 'lstm':
                self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                    self.optimizer,
                    'min',
                    factor=0.5,
                    patience=3,
                    verbose=True)

        self.step = torch.zeros(1)
        self.reset()
Example #9
    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init

        # all instances may need some params
        self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
        self.metrics = {'loss': 0.0, 'num_tokens': 0}
        self.history = {}
        self.report_freq = opt.get('report_freq', 0.001)
        states = {}

        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
        if opt.get('numthreads', 1) > 1:
            torch.set_num_threads(1)

        if shared:
            # set up shared properties
            self.opt = shared['opt']
            opt = self.opt
            self.dict = shared['dict']
            self.START_IDX = shared['START_IDX']
            self.END_IDX = shared['END_IDX']
            self.NULL_IDX = shared['NULL_IDX']
            # answers contains a batch_size list of the last answer produced
            self.answers = shared['answers']

            if 'model' in shared:
                # model is shared during hogwild
                self.model = shared['model']
                self.metrics = shared['metrics']
                states = shared['states']
        else:
            # this is not a shared instance of this class, so do full init
            # answers contains a batch_size list of the last answer produced
            self.answers = [None] * opt['batchsize']

            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            init_model = None
            # check first for 'init_model' for loading model from file
            if opt.get('init_model') and os.path.isfile(opt['init_model']):
                init_model = opt['init_model']
            # next check for 'model_file', this would override init_model
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                init_model = opt['model_file']

            if init_model is not None:
                # load model parameters if available
                print('[ Loading existing model params from {} ]'.format(
                    init_model))
                states = self.load(init_model)

            if ((init_model is not None
                 and os.path.isfile(init_model + '.dict'))
                    or opt['dict_file'] is None):
                opt['dict_file'] = init_model + '.dict'
            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'Seq2Seq'
            # we use START markers to start our output
            self.START_IDX = self.dict[self.dict.start_token]
            # we use END markers to end our output
            self.END_IDX = self.dict[self.dict.end_token]
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict[self.dict.null_token]

            if not hasattr(self, 'model_class'):
                # this allows child classes to override this but inherit init
                self.model_class = Seq2seq
            self.model = self.model_class(opt,
                                          len(self.dict),
                                          padding_idx=self.NULL_IDX,
                                          start_idx=self.START_IDX,
                                          end_idx=self.END_IDX,
                                          longest_label=states.get(
                                              'longest_label', 1))

            if opt['embedding_type'] != 'random':
                # set up preinitialized embeddings
                try:
                    import torchtext.vocab as vocab
                except ModuleNotFoundError as ex:
                    print(
                        'Please install torch text with `pip install torchtext`'
                    )
                    raise ex
                if opt['embedding_type'].startswith('glove'):
                    init = 'glove'
                    embs = vocab.GloVe(name='840B',
                                       dim=300,
                                       cache=os.path.join(
                                           opt['parlai_home'], 'data',
                                           'models', 'glove_vectors'))
                elif opt['embedding_type'].startswith('fasttext'):
                    init = 'fasttext'
                    embs = vocab.FastText(language='en',
                                          cache=os.path.join(
                                              opt['parlai_home'], 'data',
                                              'models', 'fasttext_vectors'))
                else:
                    raise RuntimeError('embedding type not implemented')

                if opt['embeddingsize'] != 300:
                    rp = torch.Tensor(300, opt['embeddingsize']).normal_()
                    t = lambda x: torch.mm(x.unsqueeze(0), rp)
                else:
                    t = lambda x: x
                cnt = 0
                for w, i in self.dict.tok2ind.items():
                    if w in embs.stoi:
                        vec = t(embs.vectors[embs.stoi[w]])
                        self.model.decoder.lt.weight.data[i] = vec
                        cnt += 1
                        if opt['lookuptable'] in ['unique', 'dec_out']:
                            # also set encoder lt, since it's not shared
                            self.model.encoder.lt.weight.data[i] = vec
                print('Seq2seq: initialized embeddings for {} tokens from {}.'
                      ''.format(cnt, init))

            if states:
                # set loaded states if applicable
                self.model.load_state_dict(states['model'])

            if self.use_cuda:
                self.model.cuda()

        if hasattr(self, 'model'):
            # if model was built, do more setup
            self.clip = opt.get('gradient_clip', -1)
            self.rank = opt['rank_candidates']

            # set up tensors once
            self.xs = torch.LongTensor(1, 1)
            self.ys = torch.LongTensor(1, 1)
            if self.rank:
                self.cands = torch.LongTensor(1, 1, 1)

            # set up criteria
            if opt.get('numsoftmax', 1) > 1:
                self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX,
                                            size_average=False)
            else:
                self.criterion = nn.CrossEntropyLoss(
                    ignore_index=self.NULL_IDX, size_average=False)

            if self.use_cuda:
                # push to cuda
                self.xs = self.xs.cuda()
                self.ys = self.ys.cuda()
                if self.rank:
                    self.cands = self.cands.cuda()
                self.criterion.cuda()

            # set up optimizer
            lr = opt['learningrate']
            optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']]
            kwargs = {'lr': lr}
            if opt.get('momentum') > 0 and opt['optimizer'] in [
                    'sgd', 'rmsprop'
            ]:
                kwargs['momentum'] = opt['momentum']
                if opt['optimizer'] == 'sgd':
                    kwargs['nesterov'] = True
            if opt['optimizer'] == 'adam':
                # https://openreview.net/forum?id=ryQu7f-RZ
                kwargs['amsgrad'] = True

            if opt['embedding_type'].endswith('fixed'):
                print('Seq2seq: fixing embedding weights.')
                self.model.decoder.lt.weight.requires_grad = False
                self.model.encoder.lt.weight.requires_grad = False
                if opt['lookuptable'] in ['dec_out', 'all']:
                    self.model.decoder.e2s.weight.requires_grad = False
            self.optimizer = optim_class(
                [p for p in self.model.parameters() if p.requires_grad],
                **kwargs)
            if states.get('optimizer'):
                if states['optimizer_type'] != opt['optimizer']:
                    print('WARNING: not loading optim state since optim class '
                          'changed.')
                else:
                    self.optimizer.load_state_dict(states['optimizer'])
                    if self.use_cuda:
                        for state in self.optimizer.state.values():
                            for k, v in state.items():
                                if isinstance(v, torch.Tensor):
                                    state[k] = v.cuda()
            self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, 'min', factor=0.5, patience=3, verbose=True)

        self.reset()
def load_dataset(path, args, train=True, build_vocab=True):
    def tokenize(x):
        # Characters to exclude
        exclude = u'"%\'()*+,-./:;<=>[\\]^_`{|}~'

        # Remove punctuation
        x = x.translate(str.maketrans('', '', exclude))

        return x.split()

    # Create new fields on vocab
    if build_vocab:
        args.TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
        args.TOPIC = Field(sequential=False, use_vocab=True, lower=True)
        args.LABEL = Field(sequential=False, use_vocab=False)

    if args.dataset == 'haha':
        datafields = [('id', None), ('text', args.TEXT),
                      ('is_humor', args.LABEL), ('votes_no', None),
                      ('votes_1', None), ('votes_2', None), ('votes_3', None),
                      ('votes_4', None), ('votes_5', None),
                      ('funniness_average', None)]
    else:
        datafields = [('id', None), ('topic', args.TOPIC),
                      ('is_ironic', args.LABEL), ('message', args.TEXT)]

    dataset = TabularDataset(path=path,
                             format='CSV',
                             skip_header=True,
                             fields=datafields)

    # build vocabulary
    if build_vocab:
        if args.fasttext:
            args.TEXT.build_vocab(dataset,
                                  min_freq=2,
                                  vectors=vocab.FastText('es'))
        else:
            args.TEXT.build_vocab(dataset, min_freq=2)

        args.TOPIC.build_vocab(dataset)

        print('vocabulary length : ', len(args.TEXT.vocab))
        print('number of topics : ', len(args.TOPIC.vocab))

    if train:
        # Split dataset
        trn, vld = dataset.split(args.train_percentage)

        target = 'is_humor' if args.dataset == 'haha' else 'is_ironic'

        # study dataset
        vld_num_target = trn_num_target = 0
        for element in trn:
            trn_num_target += int(element.__dict__[target])
        trn_num_not_target = len(trn) - trn_num_target
        for element in vld:
            vld_num_target += int(element.__dict__[target])
        vld_num_not_target = len(vld) - vld_num_target

        # Dataset information
        print('train dataset : {} elements'.format(len(trn)))
        print('train dataset ({}): {} elements. {:.2f}%'.format(
            target, trn_num_target, 100 * trn_num_target / len(trn)))
        print('train dataset (not {}): {} elements. {:.2f}%'.format(
            target, trn_num_not_target, 100 * trn_num_not_target / len(trn)))
        print('validate dataset : {} elements'.format(len(vld)))
        print('validate dataset ({}): {} elements. {:.2f}%'.format(
            target, vld_num_target, 100 * vld_num_target / len(vld)))
        print('validate dataset (not {}): {} elements. {:.2f}%'.format(
            target, vld_num_not_target, 100 * vld_num_not_target / len(vld)))

        return (trn, vld)

    else:  # Return test dataset
        tst = dataset
        target = 'is_humor' if args.dataset == 'haha' else 'is_ironic'

        # study dataset
        tst_num_target = 0
        for element in tst:
            tst_num_target += int(element.__dict__[target])
        tst_num_not_target = len(tst) - tst_num_target

        # Dataset information
        print('test dataset : {} elements'.format(len(tst)))
        print('test dataset ({}): {} elements. {:.2f}%'.format(
            target, tst_num_target, 100 * tst_num_target / len(tst)))
        print('test dataset (not {}): {} elements. {:.2f}%'.format(
            target, tst_num_not_target, 100 * tst_num_not_target / len(tst)))

        return tst
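
Once build_vocab has attached the fastText vectors as above, they are commonly copied into an nn.Embedding layer. A short sketch that assumes args.TEXT.build_vocab(..., vectors=vocab.FastText('es')) has already run as in load_dataset:

import torch.nn as nn

# args.TEXT.vocab.vectors is the (vocab_size x 300) matrix attached by build_vocab
embedding = nn.Embedding.from_pretrained(args.TEXT.vocab.vectors, freeze=False)
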
Example #11
def load_data(config, path_transcripts='/vol/work2/galmant/transcripts/'):
    type_sentence_embedding = config['type_sentence_embedding']
    dev_set_list = config['dev_set_list']
    test_set_list = config['test_set_list']

    punctuations_end_sentence = ['.', '?', '!']
    punctuations = string.punctuation  # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

    we = None
    if type_sentence_embedding == 'lstm':
        we = vocab.FastText(language='en')
        '''
        pretrained_aliases = {
            "charngram.100d": partial(CharNGram),
            "fasttext.en.300d": partial(FastText, language="en"),
            "fasttext.simple.300d": partial(FastText, language="simple"),
            "glove.42B.300d": partial(GloVe, name="42B", dim="300"),
            "glove.840B.300d": partial(GloVe, name="840B", dim="300"),
            "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"),
            "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"),
            "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"),
            "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"),
            "glove.6B.50d": partial(GloVe, name="6B", dim="50"),
            "glove.6B.100d": partial(GloVe, name="6B", dim="100"),
            "glove.6B.200d": partial(GloVe, name="6B", dim="200"),
            "glove.6B.300d": partial(GloVe, name="6B", dim="300")
        }
        '''

    #X_all = []
    #Y_all = []
    X_train = []
    Y_train = []
    X_dev = []
    Y_dev = []
    X_test = []
    Y_test = []
    words_set = set()
    for file in sorted(glob.glob(path_transcripts + '*')):
        #TEST
        #for file in [sorted(glob.glob(path_transcripts+'*'))[0]]:
        with open(file, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
            X_ = []
            Y_ = []
            for row in reader:
                sentence = row[2]
                old_word = row[2]
                for word in row[3:]:
                    if any(punctuation in old_word
                           for punctuation in punctuations_end_sentence
                           ) and word and word[0].isupper():
                        sentence = sentence.strip()
                        n = 0
                        for i, s in enumerate(sentence):
                            if s in punctuations:
                                sentence_ = list(sentence)
                                sentence_.insert(i + n + 1, ' ')
                                sentence_.insert(i + n, ' ')
                                sentence = ''.join(sentence_)
                                n += 2
                        #print(sentence)
                        X_.append(sentence)
                        Y_.append(row[1])
                        sentence = word
                    else:
                        sentence += ' ' + word
                    old_word = word
                if sentence and row[1]:
                    sentence = sentence.strip()
                    n = 0
                    for i, s in enumerate(sentence):
                        if s in punctuations:
                            sentence_ = list(sentence)
                            sentence_.insert(i + n + 1, ' ')
                            sentence_.insert(i + n, ' ')
                            sentence = ''.join(sentence_)
                            n += 2
                    #print(sentence)
                    X_.append(sentence)
                    Y_.append(row[1])
            Y = [s.lower() for s in Y_]
            if type_sentence_embedding == 'lstm':
                X = [s.lower().split() for s in X_]
                #Y = [s.lower() for s in Y_]
                to_del = []
                for s in X:
                    for w in s:
                        if w not in we.stoi:
                            to_del.append(w)
                X = [[w.strip() for w in s if w not in to_del] for s in X]
                for words_per_sentence in X:
                    words_set = words_set.union(set(words_per_sentence))
            else:
                X = X_
                Y = Y  #_
            if len(X) > 0 and len(Y) > 0:
                names_episode = file.split('/')[-1]
                names_season = '.'.join(names_episode.split('.')[:-1])
                names_serie = '.'.join(names_episode.split('.')[0])
                if names_episode in dev_set_list or names_season in dev_set_list or names_serie in dev_set_list:
                    X_dev.append(X)
                    Y_dev.append(Y)
                elif names_episode in test_set_list or names_season in test_set_list or names_serie in test_set_list:
                    X_test.append(X)
                    Y_test.append(Y)
                else:
                    X_train.append(X)
                    Y_train.append(Y)
            assert len(X) == len(Y)
    '''threshold_train_dev = int(len(X_all)*0.8)
    threshold_dev_test = threshold_train_dev + int(len(X_all)*0.1)
    X_train = X_all[:threshold_train_dev]
    Y_train = Y_all[:threshold_train_dev]
    X_dev = X_all[threshold_train_dev:threshold_dev_test]
    Y_dev = Y_all[threshold_train_dev:threshold_dev_test]
    X_test = X_all[threshold_dev_test:]
    Y_test = Y_all[threshold_dev_test:]'''
    #TEST
    #X_train = X_test
    #Y_train = Y_test
    #X_dev = X_test
    #Y_dev = Y_test
    #print('X_train',X_train[-1])
    #time.sleep(60)
    return X_train, Y_train, X_dev, Y_dev, X_test, Y_test, words_set, we
Example #12
def load_data_new(config, path_transcripts='/vol/work3/maurice/Transcripts/'):
    type_sentence_embedding = config['type_sentence_embedding']
    dev_set_list = config['dev_set_list']
    test_set_list = config['test_set_list']

    punctuations_end_sentence = ['.', '?', '!']
    punctuations = string.punctuation  # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

    we = None
    if type_sentence_embedding == 'lstm':
        we = vocab.FastText(language='en')
        '''
        pretrained_aliases = {
            "charngram.100d": partial(CharNGram),
            "fasttext.en.300d": partial(FastText, language="en"),
            "fasttext.simple.300d": partial(FastText, language="simple"),
            "glove.42B.300d": partial(GloVe, name="42B", dim="300"),
            "glove.840B.300d": partial(GloVe, name="840B", dim="300"),
            "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"),
            "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"),
            "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"),
            "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"),
            "glove.6B.50d": partial(GloVe, name="6B", dim="50"),
            "glove.6B.100d": partial(GloVe, name="6B", dim="100"),
            "glove.6B.200d": partial(GloVe, name="6B", dim="200"),
            "glove.6B.300d": partial(GloVe, name="6B", dim="300")
        }
        '''

    #X_all = []
    #Y_all = []
    X_train = []
    Y_train = []
    X_dev = []
    Y_dev = []
    X_test = []
    Y_test = []
    words_set = set()
    threads = []
    for file in sorted(glob.glob(path_transcripts + '*/*')):
        #TEST
        #for file in [sorted(glob.glob(path_transcripts+'*'))[0]]:
        process = Thread(target=read_file,
                         args=[
                             file, punctuations_end_sentence, words_set,
                             dev_set_list, test_set_list, X_train, Y_train,
                             X_dev, Y_dev, X_test, Y_test,
                             type_sentence_embedding, we
                         ])
        process.start()
        threads.append(process)

    for process in threads:
        process.join()

    print(words_set)
    '''threshold_train_dev = int(len(X_all)*0.8)
    threshold_dev_test = threshold_train_dev + int(len(X_all)*0.1)
    X_train = X_all[:threshold_train_dev]
    Y_train = Y_all[:threshold_train_dev]
    X_dev = X_all[threshold_train_dev:threshold_dev_test]
    Y_dev = Y_all[threshold_train_dev:threshold_dev_test]
    X_test = X_all[threshold_dev_test:]
    Y_test = Y_all[threshold_dev_test:]'''
    #TEST
    #X_train = X_test
    #Y_train = Y_test
    #X_dev = X_test
    #Y_dev = Y_test
    #print('X_train',X_train[-1])
    #time.sleep(60)
    return X_train, Y_train, X_dev, Y_dev, X_test, Y_test, words_set, we
Example #13
    def load_data(self,
                  loader,
                  custom_preprocessing: data.Pipeline = DEFAULT_DATA_PIPELINE,
                  verbose=True):

        self.verbose = verbose

        if self.verbose:
            # create an image folder
            self.img_stats_folder = os.path.join(self.data_path, 'stats')
            create_dir_if_necessary(self.img_stats_folder)

        self.logger.info(
            f'Getting {self.pretrained_word_embeddings} with dimension {self.pretrained_word_embeddings_dim}'
        )
        word_vectors = None  # will hold a torchtext vocab.Vectors instance (GloVe or FastText)
        if self.pretrained_word_embeddings == 'glove':
            word_vectors = vocab.GloVe(
                name=self.pretrained_word_embeddings_name,
                dim=self.pretrained_word_embeddings_dim)
        elif self.pretrained_word_embeddings == 'fasttext':
            word_vectors = vocab.FastText(language=self.language)
        self.logger.info('Word vectors successfully loaded.')

        self.logger.debug('Start loading dataset')
        self.dataset = loader(self.name, word_vectors, self.configuration,
                              self.batch_size, self.data_path, self.train_file,
                              self.valid_file, self.test_file, self.use_cuda,
                              self.verbose)

        self.vocabs = self.dataset['vocabs']
        self.task = self.dataset['task']
        self.ds_stats = self.dataset['stats']
        self.split_length = self.dataset['split_length']
        self.train_iter, self.valid_iter, self.test_iter = self.dataset[
            'iters']
        self.fields = self.dataset['fields']
        self.target = self.dataset['target']
        self.target_names = [n for n, _ in self.target]
        self.examples = self.dataset['examples']
        self.embedding = self.dataset['embeddings']
        self.dummy_input = self.dataset['dummy_input']
        self.source_field_name = self.dataset['source_field_name']
        self.target_field_name = self.dataset['target_field_name']
        self.padding_field_name = self.dataset['padding_field_name']
        self.baselines = self.dataset['baselines']

        self.target_size = len(self.vocabs[self.target_vocab_index])
        self.source_embedding = self.embedding[self.source_index]
        self.class_labels = list(self.vocabs[self.target_vocab_index].itos)

        self.source_reverser = self.dataset['source_field']
        self.target_reverser = self.target[0]
        self.log_parameters()

        if verbose:
            # sns.set(style="whitegrid")
            sns.set_style("white")
            sns.despine()

            sns.set_color_codes()
            # sns.set_context("paper")
            sns.set(rc={"font.size": 18, "axes.labelsize": 22})
            # sns.set(font_scale=1.7)
            self.show_stats()
        else:
            self._calculate_dataset_stats()

        self.logger.info('Dataset loaded. Ready for training')
Example #14
from data import vocab
import torchtext.vocab as Vocab
import os
from data import DATA_ROOT
from data import train_iter, test_iter
import time

# Decide whether a GPU can be used
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set the word-embedding size, the number of hidden units, and the number of hidden layers
embed_size, num_hiddens, num_layers = 300, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
# Load Wikipedia pretrained word vectors (fastText); cache is the save directory
fasttext_vocab = Vocab.FastText(cache=os.path.join(DATA_ROOT, "fasttext"))


def load_pretrained_embedding(words, pretrained_vocab):
    """从预训练好的vocab中提取出words对应的词向量"""
    # 初始化为0
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
Example #15
    parser.add_argument('--optim',
                        help='optimizer, Adadelta, Adam or SGD',
                        default='adadelta')
    parser.add_argument(
        '--debug',
        help='debugging mode, only use dev set, not enabled if set 0.',
        default=1,
        type=int)

    args = parser.parse_args()

    # load pre-trained word embeddings
    if args.embedding.lower() == 'glove':
        pretrained_embeddings = vocab.GloVe(name='42B')
    elif args.embedding.lower() == 'fasttext':
        pretrained_embeddings = vocab.FastText(max_vectors=500000)
    else:
        if not os.path.exists('model/GoogleNews-vectors-negative300.bin.gz'):
            os.system(
                'wget https://drive.google.com/uc?export=download&confirm=irnl&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM'
            )
        pretrained_embeddings = load_w2v_vectors(
            'model/GoogleNews-vectors-negative300.bin.gz')

    # prepare dataset
    logger.info('Preparing dataset...')
    train_data = data.Dataset(read_data('data/topicclass/topicclass_train.txt',
                                        FIELDS),
                              fields=FIELDS)
    valid_data = data.Dataset(read_data('data/topicclass/topicclass_valid.txt',
                                        FIELDS),