def use_glove():
    """Demo: load frozen GloVe vectors via AllenNLP's ``Embedding`` and inspect them.

    Reads the SST train/dev trees, builds a vocabulary, loads GloVe-840B-300d
    vectors for it, prints token indices for two example sentences, and prints
    the cosine distance between two example word vectors.
    """
    embedding_dim = 300
    train_reader = StanfordSentimentTreeBankDatasetReader()
    dev_reader = StanfordSentimentTreeBankDatasetReader(use_subtrees=False)
    train_dataset = train_reader.read('~/nlp/dataset/sst/trees/train.txt')
    dev_dataset = dev_reader.read('~/nlp/dataset/sst/trees/dev.txt')
    print(
        f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}"
    )

    # Build the vocabulary from both splits so dev tokens are not all OOV.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    glove_embeddings_file = '~/nlp/pretrainedEmbeddings/glove/glove.840B.300d.txt'
    # If you want to actually load a pretrained embedding file,
    # you currently need to do that by calling Embedding.from_params()
    # see https://github.com/allenai/allennlp/issues/2694
    token_embedding = Embedding.from_params(
        vocab=vocab,
        params=Params({
            'pretrained_file': glove_embeddings_file,
            'embedding_dim': embedding_dim,
            'trainable': False
        }))
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    print(word_embeddings.get_output_dim())

    # use batch_to_ids to convert sentences to character ids
    sentence_lists = [["I", 'have', 'a', "dog"],
                      ["How", 'are', 'you', ',', 'today', 'is', "Monday"]]
    sentence_ids = batch_to_ids(sentence_lists, vocab)
    embeddings = token_embedding(sentence_ids)
    for sentence in sentence_lists:
        for text in sentence:
            indice = vocab.get_token_index(text)
            print(f"text: {text}, indice: {indice}")

    # Calculate distance based on the embedding vectors.
    # BUG FIX: ``import scipy`` alone does not import the ``scipy.spatial``
    # subpackage, so ``scipy.spatial.distance.cosine`` raised AttributeError.
    import scipy.spatial.distance

    tokens = [["dog", "ate", "an", "apple", "for", "breakfast"]]
    tokens2 = [["cat", "ate", "an", "carrot", "for", "breakfast"]]
    token_ids = batch_to_ids(tokens, vocab)
    token_ids2 = batch_to_ids(tokens2, vocab)
    vectors = token_embedding(token_ids)
    vectors2 = token_embedding(token_ids2)
    print('embedding shape ', vectors.shape)
    print('\nvector ', vectors[0][0], vectors2[0][0])
    # detach() guards the torch->numpy conversion inside scipy against a
    # grad-requiring tensor.
    distance = scipy.spatial.distance.cosine(vectors[0][0].detach().numpy(),
                                             vectors2[0][0].detach().numpy())
    print(f"embedding distance: {distance}")
def from_params(cls, vocab: Vocabulary, params: Params) -> 'HierarchicalCRF':
    """Construct a ``HierarchicalCRF`` from a ``Params`` configuration block.

    Every configuration key is consumed via ``pop``; leftover keys trigger
    an error through ``assert_empty`` before the model is instantiated.
    """
    kwargs = dict(
        vocab=vocab,
        text_field_embedder=TextFieldEmbedder.from_params(
            vocab, params.pop('text_field_embedder')),
        # Duration embedding is vocabulary-independent, hence vocab=None.
        duration_embedder=Embedding.from_params(
            None, params.pop('duration_embedder')),
        label_embedder=Embedding.from_params(vocab,
                                             params.pop('label_embedder')),
        inner_encoder=Seq2VecEncoder.from_params(params.pop('inner_encoder')),
        outer_encoder=Seq2SeqEncoder.from_params(params.pop('outer_encoder')),
        segment_embedder=SegmentEmbedder.from_params(
            params.pop('segment_embedder')),
        weight_function=WeightFunction.from_params(
            params.pop('weight_function')),
        label_namespace=params.pop('label_namespace', 'labels'),
        max_length=params.pop_int('max_length', None),
        dropout=params.pop_float('dropout', None),
        initializer=InitializerApplicator.from_params(
            params.pop('initializer', [])),
        regularizer=RegularizerApplicator.from_params(
            params.pop('regularizer', [])),
    )
    # Fail loudly on any unconsumed configuration keys.
    params.assert_empty(cls.__name__)
    return cls(**kwargs)
def __init__(self,
             vocab: Vocabulary,
             params: Params,
             regularizer: RegularizerApplicator = None):
    """Joint DSP + SRL model built from two ``BiaffineParser`` taggers.

    The two taggers share the text-field embedder, the sequence encoder,
    and the arc attention / head / child feedforward modules (the SRL
    tagger's arc modules are overwritten with the DSP tagger's after
    construction).

    Args:
        vocab: Vocabulary the taggers index into.
        params: Configuration; keys are consumed via ``pop`` as the model
            is built.
        regularizer: Optional parameter regularizer forwarded to the base
            model class.
    """
    super(JointBiaffine, self).__init__(vocab=vocab, regularizer=regularizer)
    # Base text field embedder, shared by both task taggers.
    text_field_embedder_params = params.pop("text_field_embedder")
    text_field_embedder = BasicTextFieldEmbedder.from_params(
        vocab=vocab, params=text_field_embedder_params)
    self._text_field_embedder = text_field_embedder
    # Shared sequence encoder.
    encoder_params = params.pop("encoder")
    encoder = Seq2SeqEncoder.from_params(encoder_params)
    self._encoder = encoder
    # NOTE(review): empty tensor, never filled in this constructor —
    # presumably reserved for group-sharing logic elsewhere; confirm it is
    # still needed.
    self._group_shared_matrix = torch.FloatTensor()
    self._tag_representation_dim = params.pop('tag_representation_dim')
    self._arc_representation_dim = params.pop('arc_representation_dim')
    self._dropout = params.pop('dropout')
    self._input_dropout = params.pop('input_dropout')

    ############
    # DSP Stuffs
    ############
    dsp_params = params.pop("dsp")
    init_params = dsp_params.pop("initializer", None)
    self._initializer = (InitializerApplicator.from_params(init_params)
                         if init_params is not None else
                         InitializerApplicator())
    pos_params = dsp_params.pop("pos_tag_embedding")
    self._pos_tag_embedding = Embedding.from_params(vocab, pos_params)
    # Tagger DSP - Biaffine Tagger
    tagger_dsp = BiaffineParser(
        vocab=vocab,
        task_type='dsp',
        text_field_embedder=self._text_field_embedder,
        encoder=self._encoder,
        tag_representation_dim=self._tag_representation_dim,
        arc_representation_dim=self._arc_representation_dim,
        pos_tag_embedding=self._pos_tag_embedding,
        dropout=self._dropout,
        input_dropout=self._input_dropout,
        initializer=self._initializer)
    self._tagger_dsp = tagger_dsp
    # Arc modules kept as attributes so the SRL tagger below can share them.
    self._arc_attention = tagger_dsp.arc_attention
    self._head_arc_feedforward = tagger_dsp.head_arc_feedforward
    self._child_arc_feedforward = tagger_dsp.child_arc_feedforward

    ############
    # SRL Stuffs
    ############
    srl_params = params.pop("srl")
    init_params = srl_params.pop("initializer", None)
    # NOTE(review): this overwrites the DSP initializer / pos-tag-embedding
    # attributes set above; each tagger keeps its own reference, so only
    # later reads of self._initializer / self._pos_tag_embedding see the
    # SRL versions.
    self._initializer = (InitializerApplicator.from_params(init_params)
                         if init_params is not None else
                         InitializerApplicator())
    pos_params = srl_params.pop("pos_tag_embedding")
    self._pos_tag_embedding = Embedding.from_params(vocab, pos_params)
    # Tagger SRL - Biaffine Tagger (original comment said "EMD - CRF
    # Tagger", which does not match the class used here).
    tagger_srl = BiaffineParser(
        vocab=vocab,
        task_type='srl',
        text_field_embedder=self._text_field_embedder,
        encoder=self._encoder,
        tag_representation_dim=self._tag_representation_dim,
        arc_representation_dim=self._arc_representation_dim,
        pos_tag_embedding=self._pos_tag_embedding,
        dropout=self._dropout,
        input_dropout=self._input_dropout,
        initializer=self._initializer)
    # Share the DSP arc modules so both tasks score arcs with the same
    # parameters.
    tagger_srl.arc_attention = self._arc_attention
    tagger_srl.head_arc_feedforward = self._head_arc_feedforward
    tagger_srl.child_arc_feedforward = self._child_arc_feedforward
    self._tagger_srl = tagger_srl
    logger.info("Multi-Task Learning Model has been instantiated.")
def probe(self):
    """Probes an embeddings file and returns its metrics.

    Trains a linear model using embeddings from
    _get_embeddings_from_model() as a pretrained embeddings layer and
    intrinsic data for the current probing task.

    Returns:
        A dict containing metrics from allennlp.training.util.evaluate.
    """
    train, dev, test = self._get_intrinsic_data()
    # Add test data to vocabulary else evaluation will be unstable
    vocab = Vocabulary.from_instances(train + dev + test)
    for callback in self._callbacks:
        # Add small progress margin to indicate something is happening
        callback(0.02)
    self.embeddings_file = self._get_embeddings_from_model()
    try:
        if os.path.splitext(self.embeddings_file)[-1] == '.vec':
            params = Params({
                'embedding_dim': self._get_embedding_dim(self.embeddings_file),
                'pretrained_file': self.embeddings_file,
                'trainable': False
            })
            word_embeddings = Embedding.from_params(vocab, params=params)
        else:
            word_embeddings = SentenceEmbedding(self.embeddings_file, vocab)
        if self.probing_task.contrastive:
            model = ContrastiveLinear(word_embeddings, vocab)
        else:
            model = LinspectorLinear(word_embeddings, vocab,
                                     self.probing_type)
        if torch.cuda.is_available():
            cuda_device = 0
            model = model.cuda(cuda_device)
        else:
            cuda_device = -1
        optimizer = optim.Adam(model.parameters())
        iterator = BasicIterator(batch_size=16)
        iterator.index_with(vocab)
        # Use a serialization_dir otherwise evaluation uses last weights
        # instead of best
        with TemporaryDirectory() as serialization_dir:
            trainer = LinspectorTrainer(model=model,
                                        optimizer=optimizer,
                                        iterator=iterator,
                                        train_dataset=train,
                                        validation_dataset=dev,
                                        patience=5,
                                        validation_metric='+accuracy',
                                        num_epochs=20,
                                        serialization_dir=serialization_dir,
                                        cuda_device=cuda_device,
                                        grad_clipping=5.0)

            def trainer_callback(progress):
                for callback in self._callbacks:
                    # Fill second half of progress with trainer callback
                    callback(0.51 + 0.49 * progress)

            trainer.subscribe(trainer_callback)
            trainer.train()
            metrics = evaluate(trainer.model, test, iterator, cuda_device,
                               batch_weight_key='')
    finally:
        # BUG FIX: previously the temporary embeddings file leaked whenever
        # training or evaluation raised; always delete it and clear the
        # attribute so repeated probes start clean.
        os.unlink(self.embeddings_file)
        self.embeddings_file = None
    return metrics
def main():
    """Train a sentence-polarity classifier on the MR (rt-polaritydata)
    movie-review dataset using frozen GloVe embeddings and a CNN encoder.

    Cleanup: removed large blocks of dead commented-out code (SST/Jigsaw
    reader variants and a forward-pass demo) and the unused
    ``token_indexer`` local that only the commented code referenced.
    """
    reader = MovieReviewDatasetReader(max_seq_len=200)
    dataset_root = Path("/home/lirui/nlp/dataset/MRv1.0/rt-polaritydata")
    train_dataset, dev_dataset = (reader.read(dataset_root / fname)
                                  for fname in ["train.pkl", "test.pkl"])
    print(f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}")

    # Build the vocabulary from both splits.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    embedding_dim = 300
    # Unlike a randomly-initialised Embedding, loading pretrained GloVe
    # vectors currently requires Embedding.from_params() (no constructor
    # support in this allennlp version).
    glove_embeddings_file = '~/nlp/pretrainedEmbeddings/glove/glove.840B.300d.txt'
    token_embedding = Embedding.from_params(
        vocab=vocab,
        params=Params({'pretrained_file': glove_embeddings_file,
                       'embedding_dim': embedding_dim,
                       'trainable': False}))
    text_embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    encoder = CnnEncoder(embedding_dim=embedding_dim,
                         num_filters=200,
                         ngram_filter_sizes=(3, 4, 5),
                         )
    model = SSTClassifier(text_embedder,
                          0.2,
                          encoder,
                          0.2,
                          vocab.get_vocab_size('labels'),
                          vocab,
                          verbose=False)

    # Training setup; GPU 1 is used when CUDA is available.
    gpu_id = 1 if torch.cuda.is_available() else -1
    if gpu_id > -1:
        model.cuda(gpu_id)

    # Length-bucketing iterator, bound to the vocabulary.
    iterator = BucketIterator(batch_size=128,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      grad_norm=5.0,
                      cuda_device=gpu_id,
                      patience=5,
                      num_epochs=20)
    trainer.train()
def main():
    """Train a multi-label classifier on the Kaggle Jigsaw toxic-comment data.

    CLI args select the embedding ('random' | 'glove' | 'elmo'), the
    encoder ('lstm' | 'cnn' | other -> None) and the network (default
    multi-label classifier, or 'bcn' for a Biattentive Classification
    Network built from a raw config dict).
    """
    args = get_args()
    # TODO: add char n-gram embeddings
    if args.embedding == 'elmo':
        token_indexer = ELMoTokenCharactersIndexer()
    else:
        token_indexer = SingleIdTokenIndexer()

    # Kaggle multi-label "Toxic Comment Classification Challenge" data.
    reader = JigsawDatasetReader(tokenizer=None,
                                 token_indexers={"tokens": token_indexer},
                                 max_seq_len=200)
    dataset_root = Path('../../data/jigsaw')
    train_dataset, dev_dataset = (reader.read(
        dataset_root / fname) for fname in ["train.csv", "test_proced.csv"])
    print(
        f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}"
    )

    # Build the vocabulary from both splits.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    # Token embedding layer.
    token_embedding = None
    print(f"embedding dim: {args.embedding_dim}")
    if args.embedding == 'random':
        token_embedding = Embedding(num_embeddings=vocab_dim,
                                    embedding_dim=args.embedding_dim)
    elif args.embedding == 'glove':
        glove_embeddings_file = '~/nlp/pretrainedEmbeddings/glove/glove.6B.100d.txt'
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({
                                                    'pretrained_file': glove_embeddings_file,
                                                    'embedding_dim': args.embedding_dim,
                                                    'trainable': False
                                                }))
    elif args.embedding == 'elmo':
        # pretrained elmo LM model, transformed from bilm-tf with
        # dump_weights in bin/training.py
        options_file = '~/nlp/pretrainedEmbeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json'
        weight_file = '~/nlp/pretrainedEmbeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
        token_embedding = ElmoTokenEmbedder(options_file,
                                            weight_file,
                                            requires_grad=True,
                                            do_layer_norm=False)
    else:
        # BUG FIX: previously an unknown value fell through with
        # token_embedding = None and failed later with a confusing error
        # inside BasicTextFieldEmbedder.
        raise ValueError(f"unsupported embedding: {args.embedding!r}")

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    if args.embedding == 'elmo':
        # ELMo fixes its own output size; propagate it to the encoder config.
        args.embedding_dim = word_embeddings.get_output_dim()

    # Seq2Vec encoder.
    if args.encoder == 'lstm':
        hidden_dim = 256
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(args.embedding_dim,
                          hidden_dim,
                          bidirectional=True,
                          batch_first=True))
    elif args.encoder == 'cnn':
        encoder = CnnEncoder(
            embedding_dim=args.embedding_dim,
            num_filters=128,
            ngram_filter_sizes=(2, 3, 4, 5, 6, 7),
        )
    else:
        # encoder=None is passed through below; presumably
        # MultiLabelClassifier handles a missing encoder — confirm.
        encoder = None

    # Main classification network.
    if args.network is None:
        model = MultiLabelClassifier(
            word_embeddings,
            0.5,
            encoder,
            0.2,
            vocab=vocab,
            out_dim=6,
        )
    elif args.network == 'bcn':
        # TODO: build this configuration in code instead of a raw dict.
        bcn_params = {
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "pretrained_file":
                        "/home/lirui/nlp/document-qa/data/glove/glove.840B.300d.txt",
                        "type": "embedding",
                        "embedding_dim": 300,
                        "trainable": False
                    }
                }
            },
            "embedding_dropout": 0.5,
            "pre_encode_feedforward": {
                "input_dim": 300,
                "num_layers": 1,
                "hidden_dims": [300],
                "activations": ["relu"],
                "dropout": [0.25]
            },
            "encoder": {
                "type": "lstm",
                "input_size": 300,
                "hidden_size": 300,
                "num_layers": 1,
                "bidirectional": True
            },
            "integrator": {
                "type": "lstm",
                "input_size": 1800,
                "hidden_size": 300,
                "num_layers": 1,
                "bidirectional": True
            },
            "integrator_dropout": 0.1,
            "output_layer": {
                "input_dim": 2400,
                "num_layers": 3,
                "output_dims": [1200, 600, 5],
                "pool_sizes": 4,
                "dropout": [0.2, 0.3, 0.0]
            }
        }
        model = BiattentiveClassificationNetwork.from_params(
            vocab, params=Params(bcn_params))
    else:
        # BUG FIX: previously `model` was left unbound for an unknown
        # network name, raising NameError below instead of a clear
        # configuration error.
        raise ValueError(f"unsupported network: {args.network!r}")

    # Training setup.
    gpu_id = args.gpu_id if torch.cuda.is_available() else -1
    if gpu_id > -1:
        model.cuda(gpu_id)

    # Length-bucketing iterator, bound to the vocabulary.
    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      grad_norm=5.0,
                      cuda_device=gpu_id,
                      patience=5,
                      num_epochs=args.n_epochs)
    trainer.train()
def train_main():
    """Build an SST classifier (frozen GloVe -> BiLSTM -> linear) and run a
    short forward-pass demo that prints batch and activation shapes."""
    sst_train_reader = StanfordSentimentTreeBankDatasetReader(use_subtrees=True)
    sst_dev_reader = StanfordSentimentTreeBankDatasetReader(use_subtrees=False)
    train_dataset = sst_train_reader.read('~/nlp/dataset/sst/trees/train.txt')
    dev_dataset = sst_dev_reader.read('~/nlp/dataset/sst/trees/dev.txt')
    print(
        f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}"
    )

    # Vocabulary built from both splits.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    # Network dimensions.
    embedding_dim = 300
    hidden_dim = 128
    # GloVe loading is only supported through Embedding.from_params() in
    # this allennlp version (no constructor equivalent).
    glove_path = '~/nlp/pretrainedEmbeddings/glove/glove.840B.300d.txt'
    glove_config = Params({'pretrained_file': glove_path,
                           'embedding_dim': embedding_dim,
                           'trainable': False})
    token_embedding = Embedding.from_params(vocab=vocab, params=glove_config)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    bilstm = torch.nn.LSTM(embedding_dim,
                           hidden_dim,
                           bidirectional=True,
                           batch_first=True)
    encoder = PytorchSeq2VecWrapper(bilstm)
    model = SSTClassifier(word_embeddings,
                          encoder,
                          out_dim=vocab.get_vocab_size("labels"),
                          vocab=vocab)

    # Single GPU only (allennlp multi-GPU support is limited here).
    gpu_id = 0 if torch.cuda.is_available() else -1
    if gpu_id > -1:
        model.cuda(gpu_id)

    # Length-bucketing iterator bound to the vocabulary.
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # -------- forward demo: inspect shapes for the first five batches ------
    batch_source = iter(iterator(train_dataset, shuffle=True))
    for _ in range(5):
        batch = next(batch_source)
        print('---\nbatch ', batch.keys(), batch['tokens'].keys(),
              batch['tokens']['tokens'].shape, batch['label'].shape)
        # [batch, sentence_len, token_len]
        batch = nn_util.move_to_device(batch, gpu_id)
        tokens = batch['tokens']
        mask = get_text_field_mask(tokens)
        embedded = model.word_embeddings(tokens)
        print("embeddings: ", embedded.shape)
        encoded = model.encoder(embedded, mask)
        logits = model.linear(encoded)
        print("lstm state: ", encoded.shape, logits.shape)
        y = model(**batch)
        metric = model.get_metrics()
        print("model out: ", y, '\n', metric)
def __init__(self, vocab: Vocabulary, params: Params, regularizer: RegularizerApplicator = None): super(JointDCS, self).__init__(vocab=vocab, regularizer=regularizer) # Base text Field Embedder text_field_embedder_params = params.pop("text_field_embedder") text_field_embedder = BasicTextFieldEmbedder.from_params( vocab=vocab, params=text_field_embedder_params) self._text_field_embedder = text_field_embedder # Encoder encoder_params = params.pop("encoder") encoder = Seq2SeqEncoder.from_params(encoder_params) self._encoder = encoder self._tag_representation_dim = params.pop('tag_representation_dim') self._arc_representation_dim = params.pop('arc_representation_dim') self._dropout = params.pop('dropout') self._input_dropout = params.pop('input_dropout') ############ # DSP Stuffs ############ dsp_params = params.pop("dsp") init_params = dsp_params.pop("initializer", None) self._initializer = (InitializerApplicator.from_params(init_params) if init_params is not None else InitializerApplicator()) pos_params = dsp_params.pop("pos_tag_embedding") self._pos_tag_embedding = Embedding.from_params(vocab, pos_params) # Tagger DSP - Biaffine Tagger tagger_dsp = BiaffineParser( vocab=vocab, task_type='dsp', text_field_embedder=self._text_field_embedder, encoder=self._encoder, tag_representation_dim=self._tag_representation_dim, arc_representation_dim=self._arc_representation_dim, pos_tag_embedding=self._pos_tag_embedding, dropout=self._dropout, input_dropout=self._input_dropout, initializer=self._initializer) self._tagger_dsp = tagger_dsp # arc shared self._arc_attention = tagger_dsp.arc_attention self._head_arc_feedforward = tagger_dsp.head_arc_feedforward self._child_arc_feedforward = tagger_dsp.child_arc_feedforward ############ # SRL Stuffs ############ srl_params = params.pop("srl") # init_params = srl_params.pop("initializer", None) # self._initializer = ( # InitializerApplicator.from_params(init_params) if init_params is not None else InitializerApplicator() # ) # 
pos_params = srl_params.pop("pos_tag_embedding") # self._pos_tag_embedding = Embedding.from_params(vocab, pos_params) # Tagger: SRL - Biaffine Tagger tagger_srl = BiaffineParser( vocab=vocab, task_type='srl', text_field_embedder=self._text_field_embedder, encoder=self._encoder, tag_representation_dim=self._tag_representation_dim, arc_representation_dim=self._arc_representation_dim, pos_tag_embedding=self._pos_tag_embedding, dropout=self._dropout, input_dropout=self._input_dropout, initializer=self._initializer) tagger_srl.arc_attention = self._arc_attention tagger_srl.head_arc_feedforward = self._head_arc_feedforward tagger_srl.child_arc_feedforward = self._child_arc_feedforward self._tagger_srl = tagger_srl ############ # CSP Stuffs ############ csp_params = params.pop("csp") init_params = csp_params.pop("initializer", None) self._initializer = (InitializerApplicator.from_params(init_params) if init_params is not None else InitializerApplicator()) # pos_params = csp_params.pop("pos_tag_embedding") # self._pos_tag_embedding = Embedding.from_params(vocab, pos_params) span_params = csp_params.pop("span_extractor") self._span_extractor = SpanExtractor.from_params(span_params) feed_forward_params = csp_params.pop("feedforward") self._feed_forward = FeedForward.from_params(feed_forward_params) # Tagger: CSP - SpanConstituencyParser Tagger tagger_csp = SpanConstituencyParser( vocab=vocab, text_field_embedder=self._text_field_embedder, span_extractor=self._span_extractor, encoder=self._encoder, feedforward=self._feed_forward, pos_tag_embedding=self._pos_tag_embedding, initializer=self._initializer) self._tagger_csp = tagger_csp logger.info("Multi-Task Learning Model has been instantiated.")