def __init__(self, vocab, config):
    word2id = vocab.word2idx
    super(Model, self).__init__()
    vocab_num = len(word2id)
    self.word2id = word2id
    self.config = config
    self.char_dict = preprocess.get_char_dict('data/char_vocab.english.txt')
    self.genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
    self.device = torch.device("cuda:" + config.cuda)

    self.emb = nn.Embedding(vocab_num, 350)

    emb1 = EmbedLoader().load_with_vocab(config.glove, vocab, normalize=False)
    emb2 = EmbedLoader().load_with_vocab(config.turian, vocab, normalize=False)
    pre_emb = np.concatenate((emb1, emb2), axis=1)
    pre_emb /= (np.linalg.norm(pre_emb, axis=1, keepdims=True) + 1e-12)

    if pre_emb is not None:
        self.emb.weight = nn.Parameter(torch.from_numpy(pre_emb).float())
        for param in self.emb.parameters():
            param.requires_grad = False
    self.emb_dropout = nn.Dropout(inplace=True)

    if config.use_elmo:
        self.elmo = ElmoEmbedder(options_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json',
                                 weight_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
                                 cuda_device=int(config.cuda))
        print("elmo load over.")
        self.elmo_args = torch.randn(3, requires_grad=True).to(self.device)

    self.char_emb = nn.Embedding(len(self.char_dict), config.char_emb_size)
    self.conv1 = nn.Conv1d(config.char_emb_size, 50, 3)
    self.conv2 = nn.Conv1d(config.char_emb_size, 50, 4)
    self.conv3 = nn.Conv1d(config.char_emb_size, 50, 5)

    self.feature_emb = nn.Embedding(config.span_width, config.feature_size)
    self.feature_emb_dropout = nn.Dropout(p=0.2, inplace=True)

    self.mention_distance_emb = nn.Embedding(10, config.feature_size)
    self.distance_drop = nn.Dropout(p=0.2, inplace=True)

    self.genre_emb = nn.Embedding(7, config.feature_size)
    self.speaker_emb = nn.Embedding(2, config.feature_size)

    self.bilstm = VarLSTM(input_size=350 + 150 * config.use_CNN + config.use_elmo * 1024,
                          hidden_size=200,
                          bidirectional=True,
                          batch_first=True,
                          hidden_dropout=0.2)
    # self.bilstm = nn.LSTM(input_size=500, hidden_size=200, bidirectional=True, batch_first=True)
    self.h0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
    self.c0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
    self.bilstm_drop = nn.Dropout(p=0.2, inplace=True)

    self.atten = ffnn(input_size=400, hidden_size=config.atten_hidden_size, output_size=1)
    self.mention_score = ffnn(input_size=1320, hidden_size=config.mention_hidden_size, output_size=1)
    self.sa = ffnn(input_size=3980 + 40 * config.use_metadata, hidden_size=config.sa_hidden_size, output_size=1)
    self.mention_start_np = None
    self.mention_end_np = None
def process(self, paths, train_ds: Iterable[str] = None,
            src_vocab_op: VocabularyOption = None,
            tgt_vocab_op: VocabularyOption = None,
            src_embed_op: EmbeddingOption = None):
    input_name, target_name = 'words', 'target'
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

    info = DataBundle(datasets=self.load(paths))
    _train_ds = [info.datasets[name] for name in train_ds] if train_ds else info.datasets.values()
    src_vocab.from_dataset(*_train_ds, field_name=input_name)
    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
    tgt_vocab.index_dataset(*info.datasets.values(), field_name=target_name, new_field_name=target_name)
    info.vocabs = {input_name: src_vocab, target_name: tgt_vocab}

    if src_embed_op is not None:
        src_embed_op.vocab = src_vocab
        init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
        info.embeddings[input_name] = init_emb

    for name, dataset in info.datasets.items():
        dataset.set_input(input_name)
        dataset.set_target(target_name)
    return info
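# A minimal, self-contained sketch of the EmbedLoader call used in process() above.
# The GloVe file path is a placeholder, and the import paths assume a fastNLP 0.x layout
# where EmbedLoader lives in fastNLP.io. load_with_vocab returns a numpy matrix with one
# row per word in the vocabulary; words missing from the file are randomly initialized.
from fastNLP import Vocabulary
from fastNLP.io import EmbedLoader

vocab = Vocabulary()
vocab.add_word_lst(["the", "movie", "was", "great"])
vocab.build_vocab()
emb = EmbedLoader.load_with_vocab("glove.6B.100d.txt", vocab, normalize=False)
print(emb.shape)  # (len(vocab), 100) for a 100-dimensional GloVe file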
def word_to_id(glove_data, glove_matrix, vocab_dict_path, file_path):
    # build the vocabulary, index the reviews, and cache both as pickles on the first run
    if not os.path.exists(glove_data) or not os.path.exists(glove_matrix):
        data, feature_words, user_num, item_num = feature_word(file_path)
        vocab = Vocabulary(max_size=len(feature_words) + 1, unknown='unk', padding='PAD')
        vocab.add_word_lst(feature_words)
        vocab.build_vocab()
        matrix = EmbedLoader.load_with_vocab(vocab_dict_path, vocab)
        matrix = torch.tensor(matrix)
        for d in range(len(data)):
            review = []
            for word in data[d]['reviewText']:
                review.append(vocab.to_index(word))
            data[d]['reviewText'] = review
        with open(glove_data, 'wb') as f:
            pickle.dump(data, f)
        with open(glove_matrix, 'wb') as f:
            pickle.dump(matrix, f)

    with open(glove_data, 'rb') as f:
        glove_data = pickle.load(f)
    with open(glove_matrix, 'rb') as f:
        matrix = pickle.load(f)
    return glove_data, matrix, len(glove_data[0]['reviewText'])
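# Hypothetical call of word_to_id() above; all four paths are placeholders, not taken from
# the source. The first two are the pickle caches created on the first run and reused afterwards.
glove_data, glove_matrix, first_review_len = word_to_id(
    'cache/glove_data.pkl', 'cache/glove_matrix.pkl',
    'glove.6B.100d.txt', 'reviews.json')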
def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None,
            char_level_op=False):
    datasets = {}
    info = DataBundle()
    paths = check_dataloader_paths(paths)
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    if char_level_op:
        for dataset in datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

    datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {"words": src_vocab, "target": tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")
    return info
def load_conll_with_glove(
        data_dir,
        data_path='train.pos',
        glove_path="",
        # glove_path='/remote-home/ygxu/dataset/glove.empty.txt',
        load_glove=True,
        vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start loading dataset from {path}.")

    from dataset import MyConllLoader
    ds = MyConllLoader().load(path)
    print(ds)
    ds.rename_field('word_seq', 'sentence')
    ds.rename_field('label_seq', 'label')
    # ds = DataSet.read_pos(path, headers=('sentence', 'label'), sep='\t')
    # ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    # ds.apply(lambda x: x['sentence'].strip().split(), new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.], new_field_name='word_seq_origin_len', is_input=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000, min_freq=2, unknown='<unk>', padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
        vocab_label = Vocabulary(max_size=200, unknown=None, padding='<pad>')
        ds.apply(lambda x: [vocab_label.add(label) for label in x['label']])
        vocab_label.build_vocab()
    else:
        vocab, vocab_label = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='word_seq', is_input=True)
    ds.apply(lambda x: [vocab_label.to_index(w) for w in x['label']],
             new_field_name='truth', is_input=True, is_target=True)

    if not load_glove:
        print(f"successfully loaded dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove', vocab)
    print(f"successfully loaded dataset and embedding from {path}")
    return ds, embedding, (vocab, vocab_label)
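# Usage sketch for load_conll_with_glove() above; the directory and file names are placeholders.
# The vocabularies and GloVe matrix are built once from the training split, then reused for the
# dev split (load_glove=False) so that token indices stay aligned across splits.
train_ds, glove_emb, vocabs = load_conll_with_glove(
    'data/', data_path='train.pos', glove_path='glove.6B.300d.txt')
dev_ds = load_conll_with_glove(
    'data/', data_path='dev.pos', load_glove=False, vocabs=vocabs)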
def train(path):
    # test saving pipeline
    save_pipe(path)
    embed = EmbedLoader.fast_load_embedding(model_args['word_emb_dim'], emb_file_name, word_v)
    embed = torch.tensor(embed, dtype=torch.float32)

    # embed = EmbedLoader.fast_load_embedding(emb_dim=model_args['word_emb_dim'], emb_file=emb_file_name, vocab=word_v)
    # embed = torch.tensor(embed, dtype=torch.float32)
    # model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=True)
    model.word_embedding.padding_idx = word_v.padding_idx
    model.word_embedding.weight.data[word_v.padding_idx].fill_(0)
    model.pos_embedding.padding_idx = pos_v.padding_idx
    model.pos_embedding.weight.data[pos_v.padding_idx].fill_(0)

    class MyCallback(Callback):
        def on_step_end(self, optimizer):
            step = self.trainer.step
            # learning rate decay
            if step > 0 and step % 1000 == 0:
                for pg in optimizer.param_groups:
                    pg['lr'] *= 0.93
                print('decay lr to {}'.format([pg['lr'] for pg in optimizer.param_groups]))

            if step == 3000:
                # start training embedding
                print('start training embedding at {}'.format(step))
                model = self.trainer.model
                for m in model.modules():
                    if isinstance(m, torch.nn.Embedding):
                        m.weight.requires_grad = True

    # Trainer
    trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                      loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
                      **train_args.data,
                      optimizer=fastNLP.Adam(**optim_args.data),
                      save_path=path,
                      callbacks=[MyCallback()])

    # Start training
    try:
        trainer.train()
        print("Training finished!")
    finally:
        # save pipeline
        save_pipe(path)
        print('pipe saved')
def load_dataset_with_glove(data_dir, data_path='mr.task.train', glove_path="",
                            load_glove=True, vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start loading dataset from {path}.")

    ds = DataSet.read_csv(path, headers=('label', 'sentence'), sep='\t')
    ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    ds.apply(lambda x: x['sentence'].strip().split(), new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.], new_field_name='mask', is_input=True)
    ds.apply(lambda x: int(x['label']), new_field_name='label', is_target=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000, min_freq=2, unknown='<unk>', padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
    else:
        vocab = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='data', is_input=True)

    if not load_glove:
        print(f"successfully loaded dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove', vocab)
    print(f"successfully loaded dataset and embedding from {path}")
    return ds, embedding, vocab
def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle()
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {
        "words": src_vocab,
        "target": tgt_vocab
    }
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")
    return info
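# Usage sketch for process() above. The loader class name and the file paths are hypothetical,
# and the EmbeddingOption field name is an assumption based on the embed_filepath argument of
# EmbedLoader.load_with_vocab; check the installed fastNLP version before relying on it.
loader = MyTextClassificationLoader()
info = loader.process(
    {'train': 'train.tsv', 'dev': 'dev.tsv', 'test': 'test.tsv'},
    src_embed_opt=EmbeddingOption(embed_filepath='glove.6B.100d.txt'))
print(len(info.vocabs['words']), info.embeddings['words'].shape)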
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--method", default='cnn', help="train model and test it",
                        choices=['cnn', 'cnn_glove', 'rnn', 'rnn_maxpool', 'rnn_avgpool'])
    parser.add_argument("--dataset", default='1', help="1: small dataset; 2: big dataset",
                        choices=['1', '2'])
    args = parser.parse_args()

    # hyperparameters
    embedding_dim = 256
    batch_size = 32
    # RNN
    hidden_dim = 256
    # CNN
    kernel_sizes = (3, 4, 5)
    num_channels = (120, 160, 200)
    acti_function = 'relu'
    learning_rate = 1e-3
    train_patience = 8
    cate_num = 4
    # GloVe
    embedding_file_path = "glove.6B.100d.txt"

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    vocab = read_vocab("vocab.txt")
    print("vocabulary length:", len(vocab))
    train_data = DataSet().load("train_set")
    dev_data = DataSet().load("dev_set")
    test_data = DataSet().load("test_set")

    if args.dataset == '1':
        cate_num = 4
        num_channels = (48, 48, 48)
        embedding_dim = 128
        hidden_dim = 128
    elif args.dataset == '2':
        cate_num = 20

    if args.method == 'cnn':
        model = TextCNN(vocab_size=len(vocab), embedding_dim=embedding_dim,
                        kernel_sizes=kernel_sizes, num_channels=num_channels,
                        num_classes=cate_num, activation=acti_function)
    elif args.method == 'cnn_glove':
        glove_embedding = EmbedLoader.load_with_vocab(embedding_file_path, vocab)
        embedding_dim = glove_embedding.shape[1]
        print("GloVe embedding_dim:", embedding_dim)
        model = TextCNN_glove(vocab_size=len(vocab), embedding_dim=embedding_dim,
                              kernel_sizes=kernel_sizes, num_channels=num_channels,
                              num_classes=cate_num, activation=acti_function)
        model.embedding.load_state_dict({"weight": torch.from_numpy(glove_embedding)})
        model.constant_embedding.load_state_dict({"weight": torch.from_numpy(glove_embedding)})
        model.constant_embedding.weight.requires_grad = False
        model.embedding.weight.requires_grad = True
    elif args.method == 'rnn':
        embedding_dim = 128
        hidden_dim = 128
        model = BiRNNText(vocab_size=len(vocab), embedding_dim=embedding_dim,
                          output_dim=cate_num, hidden_dim=hidden_dim)
    elif args.method == 'rnn_maxpool':
        model = BiRNNText_pool(vocab_size=len(vocab), embedding_dim=embedding_dim,
                               output_dim=cate_num, hidden_dim=hidden_dim, pool_name="max")
    elif args.method == 'rnn_avgpool':
        model = BiRNNText_pool(vocab_size=len(vocab), embedding_dim=embedding_dim,
                               output_dim=cate_num, hidden_dim=hidden_dim, pool_name="avg")

    tester = Tester(test_data, model, metrics=AccuracyMetric())
    trainer = Trainer(
        train_data=train_data,
        model=model,
        loss=CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET),
        metrics=AccuracyMetric(),
        n_epochs=80,
        batch_size=batch_size,
        print_every=10,
        validate_every=-1,
        dev_data=dev_data,
        optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate),
        check_code_level=2,
        metric_key='acc',
        use_tqdm=True,
        callbacks=[EarlyStopCallback(train_patience)],
        device=device,
    )
    trainer.train()
    tester.test()
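# Example invocation (the script file name is an assumption):
#   python main.py --method cnn_glove --dataset 1
# With --method cnn_glove, the matrix returned by EmbedLoader.load_with_vocab initializes both
# embedding layers of TextCNN_glove; only the non-constant layer stays trainable.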
tag_v = Vocabulary(need_default=False)

train_data = loader.load(os.path.join(datadir, train_data_name))
dev_data = loader.load(os.path.join(datadir, dev_data_name))
test_data = loader.load(os.path.join(datadir, test_data_name))
train_data.update_vocab(word_seq=word_v, pos_seq=pos_v, head_labels=tag_v)
datasets = (train_data, dev_data, test_data)
save_data(processed_datadir, word_v=word_v, pos_v=pos_v, tag_v=tag_v,
          train_data=train_data, dev_data=dev_data, test_data=test_data)

embed, _ = EmbedLoader.load_embedding(
    model_args['word_emb_dim'], emb_file_name, 'glove', word_v,
    os.path.join(processed_datadir, 'word_emb.pkl'))
print(len(word_v))
print(embed.size())

# Model
model_args['word_vocab_size'] = len(word_v)
model_args['pos_vocab_size'] = len(pos_v)
model_args['num_label'] = len(tag_v)

model = BiaffineParser(**model_args.data)
model.reset_parameters()

datasets = (train_data, dev_data, test_data)
for ds in datasets:
    ds.index_field("word_seq", word_v).index_field("pos_seq", pos_v)
def process(self, paths: Union[str, Dict[str, str]],
            char_vocab_opt: VocabularyOption = None,
            char_embed_opt: EmbeddingOption = None,
            bigram_vocab_opt: VocabularyOption = None,
            bigram_embed_opt: EmbeddingOption = None,
            L: int = 4):
    """
    The expected data format is one sample per line, with words separated by spaces, e.g.::

        共同 创造 美好 的 新 世纪 —— 二○○一年 新年 贺词 ( 二○○○年 十二月 三十一日 ) ( 附 图片 1 张 )
        女士 们 , 先生 们 , 同志 们 , 朋友 们 :

    paths accepts two formats: a str or a Dict[str, str]::

        # 1. str
        # 1.1 a concrete file path
        data = SigHanLoader('bmes').process('/path/to/cws/data.txt')  # reads the content of data.txt
        # the returned bundle contains:
        #   data.vocabs['chars']: a Vocabulary object
        #   data.vocabs['target']: a Vocabulary object; may be absent depending on encoding_type
        #   data.embeddings['chars']: an Embedding object, present only if a pretrained embedding path was given
        #   data.datasets['train']: a DataSet object with the fields:
        #       raw_chars: list[str], one Chinese character per element
        #       chars: list[int], the index of each character
        #       target: list[int], depends on encoding_type
        # 1.2 a directory, which must contain a train.txt file
        data = SigHanLoader('bmes').process('path/to/cws/')  # tries to read train.txt, test.txt and dev.txt in that directory
        #   data.vocabs['chars']: a Vocabulary object
        #   data.vocabs['target']: a Vocabulary object
        #   data.embeddings['chars']: an Embedding object, present only if a pretrained embedding path was given
        #   data.datasets['train']: a DataSet object with the same fields as above
        #   data.datasets['dev']: a DataSet object if the directory contains dev.txt; same layout as data.datasets['train']

        # 2. dict, where each key is a dataset name and each value is its path; the key 'train' is required
        paths = {'train': '/path/to/train/train.txt', 'test': '/path/to/test/test.txt', 'dev': '/path/to/dev/dev.txt'}
        data = SigHanLoader(paths).process(paths)
        # same result as passing a directory, but any number of datasets may be given;
        # the keys of data.datasets match the keys passed here

    :param paths: a directory, a file path, or a dict, as described above.
    :param char_vocab_opt: options for building the chars vocabulary; defaults to min_freq=2.
    :param char_embed_opt: options for loading a pretrained chars embedding; no pretrained embedding is loaded by default.
    :param bigram_vocab_opt: options for building the bigram vocabulary; bigrams are not used by default.
        Only when this is given does each instance get a 'bigrams' field: List[int] of the same length as chars;
        the bigrams of "abcde" are "ab bc cd de e<eos>".
    :param bigram_embed_opt: options for loading pretrained bigram embeddings; only effective when bigram_vocab_opt is given.
    :param L: segment length used when target_type is 'shift_relay'.
    :return:
    """
    # check_dataloader_paths is the recommended way to validate paths
    paths = check_dataloader_paths(paths)
    datasets = {}
    data = DataBundle()
    bigram = bigram_vocab_opt is not None
    for name, path in paths.items():
        dataset = self.load(path, bigram=bigram)
        datasets[name] = dataset
    input_fields = []
    target_fields = []

    # build the char vocabulary
    char_vocab = Vocabulary(min_freq=2) if char_vocab_opt is None else Vocabulary(**char_vocab_opt)
    char_vocab.from_dataset(datasets['train'], field_name='raw_chars')
    char_vocab.index_dataset(*datasets.values(), field_name='raw_chars', new_field_name='chars')
    data.vocabs[Const.CHAR_INPUT] = char_vocab
    input_fields.extend([Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET])
    target_fields.append(Const.TARGET)

    # build the target vocabulary
    if self.target_type == 'bmes':
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.add_word_lst(['B'] * 4 + ['M'] * 3 + ['E'] * 2 + ['S'])
        target_vocab.index_dataset(*datasets.values(), field_name='target')
        data.vocabs[Const.TARGET] = target_vocab

    if char_embed_opt is not None:
        char_embed = EmbedLoader.load_with_vocab(**char_embed_opt, vocab=char_vocab)
        data.embeddings['chars'] = char_embed

    if bigram:
        bigram_vocab = Vocabulary(**bigram_vocab_opt)
        bigram_vocab.from_dataset(datasets['train'], field_name='bigrams')
        bigram_vocab.index_dataset(*datasets.values(), field_name='bigrams')
        data.vocabs['bigrams'] = bigram_vocab
        if bigram_embed_opt is not None:
            bigram_embed = EmbedLoader.load_with_vocab(**bigram_embed_opt, vocab=bigram_vocab)
            data.embeddings['bigrams'] = bigram_embed
        input_fields.append('bigrams')

    if self.target_type == 'shift_relay':
        func = partial(self._clip_target, L=L)
        for name, dataset in datasets.items():
            res = dataset.apply_field(func, field_name='target')
            relay_target = [res_i[0] for res_i in res]
            relay_mask = [res_i[1] for res_i in res]
            dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False)
            dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False)
    if self.target_type == 'shift_relay':
        input_fields.extend(['end_seg_mask'])
        target_fields.append('start_seg_mask')

    # add the datasets to the DataBundle
    for name, dataset in datasets.items():
        dataset.set_input(*input_fields)
        dataset.set_target(*target_fields)
        data.datasets[name] = dataset
    return data
def main():
    parser = argparse.ArgumentParser(description='Summarization Model')

    # Where to find data
    parser.add_argument('--data_path', type=str,
                        default='/remote-home/dqwang/Datasets/CNNDM/train.label.jsonl',
                        help='Path expression to pickle datafiles.')
    parser.add_argument('--valid_path', type=str,
                        default='/remote-home/dqwang/Datasets/CNNDM/val.label.jsonl',
                        help='Path expression to pickle valid datafiles.')
    parser.add_argument('--vocab_path', type=str,
                        default='/remote-home/dqwang/Datasets/CNNDM/vocab',
                        help='Path expression to text vocabulary file.')

    # Important settings
    parser.add_argument('--mode', choices=['train', 'test'], default='train',
                        help='must be one of train/test')
    parser.add_argument('--embedding', type=str, default='glove',
                        choices=['word2vec', 'glove', 'elmo', 'bert'],
                        help='must be one of word2vec/glove/elmo/bert')
    parser.add_argument('--sentence_encoder', type=str, default='transformer',
                        choices=['bilstm', 'deeplstm', 'transformer'],
                        help='must be one of LSTM/Transformer')
    parser.add_argument('--sentence_decoder', type=str, default='SeqLab',
                        choices=['PN', 'SeqLab'],
                        help='must be one of PN/SeqLab')
    parser.add_argument('--restore_model', type=str, default='None',
                        help='Restore model for further training. [bestmodel/bestFmodel/earlystop/None]')

    # Where to save output
    parser.add_argument('--save_root', type=str, default='save/',
                        help='Root directory for all model.')
    parser.add_argument('--log_root', type=str, default='log/',
                        help='Root directory for all logging.')

    # Hyperparameters
    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: -1]')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='use cuda')
    parser.add_argument('--vocab_size', type=int, default=100000,
                        help='Size of vocabulary. These will be read from the vocabulary file in order. '
                             'If the vocabulary file contains fewer words than this number, or if this number '
                             'is set to 0, will take all words in the vocabulary file.')
    parser.add_argument('--n_epochs', type=int, default=20,
                        help='Number of epochs [default: 20]')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Mini batch size [default: 128]')
    parser.add_argument('--word_embedding', action='store_true', default=True,
                        help='whether to use Word embedding')
    parser.add_argument('--embedding_path', type=str,
                        default='/remote-home/dqwang/Glove/glove.42B.300d.txt',
                        help='Path expression to external word embedding.')
    parser.add_argument('--word_emb_dim', type=int, default=300,
                        help='Word embedding size [default: 200]')
    parser.add_argument('--embed_train', action='store_true', default=False,
                        help='whether to train Word embedding [default: False]')
    parser.add_argument('--min_kernel_size', type=int, default=1,
                        help='kernel min length for CNN [default: 1]')
    parser.add_argument('--max_kernel_size', type=int, default=7,
                        help='kernel max length for CNN [default: 7]')
    parser.add_argument('--output_channel', type=int, default=50,
                        help='output channel: repeated times for one kernel')
    parser.add_argument('--use_orthnormal_init', action='store_true', default=True,
                        help='use orthnormal init for lstm [default: true]')
    parser.add_argument('--sent_max_len', type=int, default=100,
                        help='max length of sentences (max source text sentence tokens)')
    parser.add_argument('--doc_max_timesteps', type=int, default=50,
                        help='max length of documents (max timesteps of documents)')
    parser.add_argument('--save_label', action='store_true', default=False,
                        help='require multihead attention')

    # Training
    parser.add_argument('--lr', type=float, default=0.0001, help='learning rate')
    parser.add_argument('--lr_descent', action='store_true', default=False,
                        help='learning rate descent')
    parser.add_argument('--grad_clip', action='store_true', default=False,
                        help='for gradient clipping')
    parser.add_argument('--max_grad_norm', type=float, default=10,
                        help='for gradient clipping max gradient normalization')

    # test
    parser.add_argument('-m', type=int, default=3, help='decode summary length')
    parser.add_argument('--test_model', type=str, default='evalbestmodel',
                        help='choose different model to test [evalbestmodel/evalbestFmodel/trainbestmodel/trainbestFmodel/earlystop]')
    parser.add_argument('--use_pyrouge', action='store_true', default=False,
                        help='use_pyrouge')

    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    torch.set_printoptions(threshold=50000)

    # File paths
    DATA_FILE = args.data_path
    VALID_FILE = args.valid_path
    VOCAL_FILE = args.vocab_path
    LOG_PATH = args.log_root

    # train_log setting
    if not os.path.exists(LOG_PATH):
        if args.mode == "train":
            os.makedirs(LOG_PATH)
        else:
            raise Exception(
                "[Error] Logdir %s doesn't exist. Run in train mode to create it." % (LOG_PATH))
    nowTime = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    log_path = os.path.join(LOG_PATH, args.mode + "_" + nowTime)
    # logger = _init_logger(path=log_path)
    # file_handler = logging.FileHandler(log_path)
    # file_handler.setFormatter(formatter)
    # logger.addHandler(file_handler)

    logger.info("Pytorch %s", torch.__version__)

    # dataset
    hps = args
    dbPipe = ExtCNNDMPipe(vocab_size=hps.vocab_size,
                          vocab_path=VOCAL_FILE,
                          sent_max_len=hps.sent_max_len,
                          doc_max_timesteps=hps.doc_max_timesteps)
    if hps.mode == 'test':
        hps.recurrent_dropout_prob = 0.0
        hps.atten_dropout_prob = 0.0
        hps.ffn_dropout_prob = 0.0
        logger.info(hps)
        paths = {"test": DATA_FILE}
        db = dbPipe.process_from_file(paths)
    else:
        paths = {"train": DATA_FILE, "valid": VALID_FILE}
        db = dbPipe.process_from_file(paths)

    # embedding
    if args.embedding == "glove":
        vocab = db.get_vocab("vocab")
        embed = torch.nn.Embedding(len(vocab), hps.word_emb_dim)
        if hps.word_embedding:
            embed_loader = EmbedLoader()
            pretrained_weight = embed_loader.load_with_vocab(hps.embedding_path, vocab)  # unfound with random init
            embed.weight.data.copy_(torch.from_numpy(pretrained_weight))
            embed.weight.requires_grad = hps.embed_train
    else:
        logger.error("[ERROR] embedding To Be Continued!")
        sys.exit(1)

    # model
    if args.sentence_encoder == "transformer" and args.sentence_decoder == "SeqLab":
        model_param = json.load(open("config/transformer.config", "rb"))
        hps.__dict__.update(model_param)
        model = TransformerModel(hps, embed)
    elif args.sentence_encoder == "deeplstm" and args.sentence_decoder == "SeqLab":
        model_param = json.load(open("config/deeplstm.config", "rb"))
        hps.__dict__.update(model_param)
        model = SummarizationModel(hps, embed)
    else:
        logger.error("[ERROR] Model To Be Continued!")
        sys.exit(1)

    if hps.cuda:
        model = model.cuda()
        logger.info("[INFO] Use cuda")

    logger.info(hps)

    if hps.mode == 'train':
        db.get_dataset("valid").set_target("text", "summary")
        setup_training(model, db.get_dataset("train"), db.get_dataset("valid"), hps)
    elif hps.mode == 'test':
        logger.info("[INFO] Decoding...")
        db.get_dataset("test").set_target("text", "summary")
        run_test(model, db.get_dataset("test"), hps, limited=hps.limited)
    else:
        logger.error("The 'mode' flag must be one of train/eval/test")
        raise ValueError("The 'mode' flag must be one of train/eval/test")
# In[5]:

# 1. get dataset
dataset = load_data('data/train.tsv', 1)
train_dataset, val_dataset = dataset.split(0.1)
test_dataset = load_data('data/test.tsv', 0)
print("train_dataset size: ", train_dataset.get_length())
print("val_dataset size: ", val_dataset.get_length())
print("test_dataset size: ", test_dataset.get_length())

# In[6]:

# 2. get vocabulary
if use_pretrain:
    loader = EmbedLoader()
    pre_embed, vocab = loader.load_without_vocab(embed_path, normalize=False)
    embedding_size = pre_embed.shape[1]
else:
    vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
print("vocabulary size: ", len(vocab))

# In[7]:

# 3. word to index
vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
vocab.index_dataset(val_dataset, field_name='words', new_field_name='words')
vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

# ### 3. Build CNN model
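# Sketch (not part of the original notebook): one way to turn the pre-trained matrix returned
# by load_without_vocab into a torch embedding layer for the model built below; assumes
# use_pretrain is True so that pre_embed exists.
import torch
import torch.nn as nn

# from_pretrained copies the (vocab_size, embedding_size) matrix; freeze=False keeps it trainable
embed_layer = nn.Embedding.from_pretrained(torch.from_numpy(pre_embed).float(), freeze=False)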
def test_case(self):
    vocab = Vocabulary()
    vocab.update(["the", "in", "I", "to", "of", "hahaha"])
    embedding = EmbedLoader().fast_load_embedding(
        50, "test/data_for_tests/glove.6B.50d_test.txt", vocab)
    self.assertEqual(tuple(embedding.shape), (len(vocab), 50))