def build_model(options):
    """Load an image-captioning model and wrap it in a beam-search decoder.

    Reads ``model_path``, ``tokenizer_path``, ``obj``, ``beam_width``,
    ``max_len_a``, ``max_len_b``, ``len_penalty_ratio`` and ``fp16`` from
    *options*.

    Returns:
        A ``(generator, text_processor)`` pair: the ``BeamDecoder`` (possibly
        passed through ``amp.initialize`` and/or ``DataParallelModel``) and
        the loaded model's ``text_processor`` attribute.
    """
    captioner = Seq2Seq.load(
        ImageCaptioning,
        options.model_path,
        tok_dir=options.tokenizer_path,
        use_obj=options.obj,
    )
    # Run on CUDA whenever it is available, otherwise stay on the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    captioner = captioner.to(torch.device(device_name))

    decoding_args = {
        "beam_width": options.beam_width,
        "max_len_a": options.max_len_a,
        "max_len_b": options.max_len_b,
        "len_penalty_ratio": options.len_penalty_ratio,
    }
    decoder = BeamDecoder(captioner, **decoding_args)

    if options.fp16:
        decoder = amp.initialize(decoder, opt_level="O2")
    # Only wrap for data parallelism when more than one GPU is visible.
    if torch.cuda.device_count() > 1:
        decoder = DataParallelModel(decoder)

    return decoder, captioner.text_processor
[text_processor.lang_id(sentences[sid].strip().split(" ")[0])]) yield sid, source_tokenized, torch.LongTensor( tids), candidates, src_lang, torch.LongTensor(target_langs) if __name__ == "__main__": parser = get_option_parser() (options, args) = parser.parse_args() print("Loading text processor...") text_processor = TextProcessor(options.tokenizer_path) num_processors = max(torch.cuda.device_count(), 1) print("Loading model...") model = Seq2Seq.load(Seq2Seq, options.model, tok_dir=options.tokenizer_path) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) num_gpu = torch.cuda.device_count() assert num_gpu <= 1 if options.fp16: model = amp.initialize(model, opt_level="O2") max_capacity = options.total_capacity * 1000000 with torch.no_grad(), open(options.output, "w") as writer: print("Loading data...") with open(options.sens, "rb") as fp, open(options.data, "rb") as fp2: sentences = marshal.load(fp) src2dst_dict = marshal.load(fp2)
def train(options):
    """Train a ``Caption2Image`` model using a pretrained image captioner.

    Everything is configured through attributes of *options*: paths
    (``dict_path``, ``model_path``, ``tokenizer_path``, ``pretrained_path``,
    ``train_path``, ``dev_path``), model sizes (``encoder_layer``,
    ``embed_dim``, ``intermediate_layer_dim``), optimisation settings
    (``learning_rate``, ``warmup``, ``clip``, ``mask_prob``, ``fp16``,
    ``mm_mode``), decoding settings (``beam_width``, ``max_len_a``,
    ``max_len_b``, ``len_penalty_ratio``) and the step budget ``step``.

    The function returns nothing; checkpoints are written by the trainer
    under ``options.model_path``.
    """
    lex_dict = None
    if options.dict_path is not None:
        lex_dict = get_lex_dict(options.dict_path)

    # exist_ok=True removes the check-then-create race of the previous
    # ``if not os.path.exists(...)`` guard when several jobs start together.
    os.makedirs(options.model_path, exist_ok=True)

    text_processor = TextProcessor(options.tokenizer_path)
    # The rest of the pipeline is written against a padding id of 0.
    assert text_processor.pad_token_id() == 0

    image_captioner = Seq2Seq.load(ImageCaptioning,
                                   options.pretrained_path,
                                   tok_dir=options.tokenizer_path)
    txt2ImageModel = Caption2Image(
        text_processor=text_processor,
        enc_layer=options.encoder_layer,
        embed_dim=options.embed_dim,
        intermediate_dim=options.intermediate_layer_dim)
    print("Model initialization done!")

    # We assume that the collator function returns a list with the size of
    # the number of GPUs; num_batches falls back to 1 when no GPU is visible.
    collator = dataset.ImageTextCollator()
    num_batches = max(1, torch.cuda.device_count())

    # NOTE: ``warump_steps`` is the keyword spelling used at every call site
    # of build_optimizer in view; it is kept as-is.
    optimizer = build_optimizer(txt2ImageModel,
                                options.learning_rate,
                                warump_steps=options.warmup)
    trainer = Caption2ImageTrainer(
        model=txt2ImageModel,
        caption_model=image_captioner,
        mask_prob=options.mask_prob,
        optimizer=optimizer,
        clip=options.clip,
        beam_width=options.beam_width,
        max_len_a=options.max_len_a,
        max_len_b=options.max_len_b,
        len_penalty_ratio=options.len_penalty_ratio,
        fp16=options.fp16,
        mm_mode=options.mm_mode)

    pin_memory = torch.cuda.is_available()
    img_train_loader = ImageMTTrainer.get_img_loader(
        collator,
        dataset.ImageCaptionDatasetwNegSamples,
        options.train_path,
        txt2ImageModel,
        num_batches,
        options,
        pin_memory,
        lex_dict=lex_dict)
    # The dev loader is built unshuffled and with denom=2.
    img_dev_loader = ImageMTTrainer.get_img_loader(
        collator,
        dataset.ImageCaptionDatasetwNegSamples,
        options.dev_path,
        txt2ImageModel,
        num_batches,
        options,
        pin_memory,
        lex_dict=lex_dict,
        shuffle=False,
        denom=2)

    # train_epoch() returns the updated global step; keep running epochs
    # until the step budget is exhausted.
    step, train_epoch = 0, 1
    while options.step > 0 and step < options.step:
        print("train epoch", train_epoch)
        step = trainer.train_epoch(img_data_iter=img_train_loader,
                                   img_dev_data_iter=img_dev_loader,
                                   max_step=options.step,
                                   lex_dict=lex_dict,
                                   saving_path=options.model_path,
                                   step=step)
        train_epoch += 1
import random
from seq2seq import Seq2Seq, END, pad_arrays
from mcts import Node, mcts
from preprocess import tokenize

# Module-level model shared by the helpers below.
model = Seq2Seq.load('model')

# Load base compounds
# One entry per line of the file, stripped of surrounding whitespace.
starting_mols = set()
with open('data/base_compounds.smi', 'r') as f:
    for smi in f:
        starting_mols.add(smi.strip())
print('Base compounds:', len(starting_mols))


def to_doc(mol):
    """Encode *mol* as a list of vocabulary ids.

    The string is tokenized with ``tokenize``; the result is the id of the
    ``'<S>'`` token, followed by the id of each token, followed by ``END``.
    Raises ``KeyError`` if a token is missing from ``model.vocab2id``.
    """
    toks = tokenize(mol)
    return [model.vocab2id['<S>']] + [model.vocab2id[tok] for tok in toks] + [END]


def process_seq(seq):
    """Decode a sequence of ids to a string and split it on ``'>'``.

    NOTE(review): only the beginning of this function is visible in the
    reviewed snippet; what it returns could not be confirmed.
    """
    # Convert ids to tokens and drop START/END tokens
    # (ids 0 and 1 are the ones filtered out here).
    smis = ''.join([model.id2vocab[id] for id in seq if id not in [0, 1]])
    parts = smis.split('>')
    if len(parts) > 1:
        # There shouldn't be more than two parts
        reactants, reagents = parts[0], parts[1]
    else:
        # NOTE(review): `reagents` is not assigned on this path within the
        # visible code; confirm later code does not read it unconditionally.
        reactants = parts[0]
def train(options):
    """Train, then optionally fine-tune, an ``ImageMassSeq2Seq`` model.

    All configuration is read from attributes of *options* (paths, model
    sizes, optimisation and decoding settings, ``local_rank``, the step
    budgets ``step`` and ``finetune_step``, ...). The function runs in two
    phases:

    1. a training phase that calls ``trainer.train_epoch`` until ``step``
       reaches ``options.step``;
    2. a fine-tuning phase (``fine_tune=True``) that continues up to
       ``options.finetune_step + options.step``.

    Nothing is returned; checkpoints are written by the trainer under
    ``options.model_path``.
    """
    lex_dict = None
    if options.dict_path is not None:
        lex_dict = get_lex_dict(options.dict_path)

    # Only rank <= 0 creates the output directory. exist_ok=True removes the
    # check-then-create race of the previous os.path.exists() guard.
    if options.local_rank <= 0:
        os.makedirs(options.model_path, exist_ok=True)

    text_processor = TextProcessor(options.tokenizer_path)
    # The rest of the pipeline is written against a padding id of 0.
    assert text_processor.pad_token_id() == 0
    # With a non-negative local_rank each process is sized for one device.
    num_processors = max(torch.cuda.device_count(), 1) if options.local_rank < 0 else 1

    if options.pretrained_path is not None:
        mt_model = Seq2Seq.load(ImageMassSeq2Seq,
                                options.pretrained_path,
                                tok_dir=options.tokenizer_path)
    else:
        mt_model = ImageMassSeq2Seq(
            use_proposals=lex_dict is not None,
            tie_embed=options.tie_embed,
            text_processor=text_processor,
            resnet_depth=options.resnet_depth,
            lang_dec=options.lang_decoder,
            enc_layer=options.encoder_layer,
            dec_layer=options.decoder_layer,
            embed_dim=options.embed_dim,
            intermediate_dim=options.intermediate_layer_dim)

    # NOTE(review): the nesting of this block relative to the `else` above
    # was reconstructed; confirm against the original layout.
    # NOTE(review): options.lm_path is only tested, never read -- the LM is
    # freshly constructed rather than loaded from that path. Confirm intent.
    if options.lm_path is not None:
        lm = LM(text_processor=text_processor,
                enc_layer=options.encoder_layer,
                embed_dim=options.embed_dim,
                intermediate_dim=options.intermediate_layer_dim)
        mt_model.init_from_lm(lm)

    print("Model initialization done!")

    # We assume that the collator function returns a list with the size of
    # the number of GPUs; num_batches falls back to 1 when no GPU is visible.
    collator = dataset.ImageTextCollator()
    num_batches = max(1, torch.cuda.device_count())

    if options.continue_train:
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # resume from optimizer checkpoints you produced yourself.
        with open(os.path.join(options.pretrained_path, "optim"), "rb") as fp:
            optimizer = pickle.load(fp)
    else:
        optimizer = build_optimizer(mt_model,
                                    options.learning_rate,
                                    warump_steps=options.warmup)

    trainer = ImageMTTrainer(model=mt_model,
                             mask_prob=options.mask_prob,
                             optimizer=optimizer,
                             clip=options.clip,
                             beam_width=options.beam_width,
                             max_len_a=options.max_len_a,
                             max_len_b=options.max_len_b,
                             len_penalty_ratio=options.len_penalty_ratio,
                             fp16=options.fp16,
                             mm_mode=options.mm_mode,
                             rank=options.local_rank)

    pin_memory = torch.cuda.is_available()
    img_train_loader = ImageMTTrainer.get_img_loader(
        collator,
        dataset.ImageCaptionDataset,
        options.train_path,
        mt_model,
        num_batches,
        options,
        pin_memory,
        lex_dict=lex_dict)

    mass_train_data, mass_train_loader, finetune_loader, mt_dev_loader = None, None, None, None
    if options.mass_train_path is not None:
        mass_train_paths = options.mass_train_path.strip().split(",")
        if options.step > 0:
            mass_train_data, mass_train_loader = ImageMTTrainer.get_mass_loader(
                mass_train_paths,
                mt_model,
                num_processors,
                options,
                pin_memory,
                keep_examples=options.finetune_step > 0,
                lex_dict=lex_dict)

        # Kept inside the mass_train_path branch because it needs
        # mass_train_paths. The second returned value is not used here.
        if options.finetune_step > 0:
            finetune_loader, _ = ImageMTTrainer.get_mass_finetune_data(
                mass_train_data,
                mass_train_paths,
                mt_model,
                num_processors,
                options,
                pin_memory,
                lex_dict=lex_dict)

    mt_train_loader = None
    if options.mt_train_path is not None:
        mt_train_loader = ImageMTTrainer.get_mt_train_data(
            mt_model, num_processors, options, pin_memory, lex_dict=lex_dict)

    # mt_dev_loader was already initialised to None above.
    if options.mt_dev_path is not None:
        mt_dev_loader = ImageMTTrainer.get_mt_dev_data(mt_model,
                                                       options,
                                                       pin_memory,
                                                       text_processor,
                                                       trainer,
                                                       lex_dict=lex_dict)

    # Phase 1: regular training until the step budget is used up.
    step, train_epoch = 0, 1
    while options.step > 0 and step < options.step:
        print("train epoch", train_epoch)
        step = trainer.train_epoch(img_data_iter=img_train_loader,
                                   mass_data_iter=mass_train_loader,
                                   mt_train_iter=mt_train_loader,
                                   max_step=options.step,
                                   lex_dict=lex_dict,
                                   mt_dev_iter=mt_dev_loader,
                                   saving_path=options.model_path,
                                   step=step,
                                   save_opt=options.save_opt,
                                   accum=options.accum)
        train_epoch += 1

    finetune_epoch = 0
    # Resetting the optimizer for the purpose of finetuning.
    trainer.optimizer.reset()

    lang_directions = ImageMTTrainer.get_lang_dirs(options.bt_langs, text_processor)
    print(options.local_rank, "lang dirs", lang_directions)

    print(options.local_rank, "Reloading image train data with new batch size...")
    if options.finetune_step > 0 and img_train_loader is not None:
        # Rebuild the image loader with denom=2 for the fine-tuning phase.
        img_train_loader = ImageMTTrainer.get_img_loader(
            collator,
            dataset.ImageCaptionDataset,
            options.train_path,
            mt_model,
            num_batches,
            options,
            pin_memory,
            denom=2,
            lex_dict=lex_dict)
    if options.ignore_mt_mass:
        mt_train_loader = None
    print(options.local_rank, "Reloading image train data with new batch size done!")

    # Phase 2: fine-tuning.
    # NOTE(review): this loop uses `<=` whereas the training loop uses `<`;
    # confirm the extra iteration at the boundary is intended.
    while options.finetune_step > 0 and step <= options.finetune_step + options.step:
        print(options.local_rank, "finetune epoch", finetune_epoch)
        step = trainer.train_epoch(img_data_iter=img_train_loader,
                                   mass_data_iter=finetune_loader,
                                   mt_train_iter=mt_train_loader,
                                   max_step=options.finetune_step + options.step,
                                   mt_dev_iter=mt_dev_loader,
                                   saving_path=options.model_path,
                                   step=step,
                                   fine_tune=True,
                                   lang_directions=lang_directions,
                                   lex_dict=lex_dict,
                                   save_opt=options.save_opt,
                                   accum=options.accum,
                                   beam_width=options.bt_beam_width)
        finetune_epoch += 1
def train(options):
    """Train a ``SenSim`` model with ``SenSimTrainer``.

    Configuration is read from attributes of *options*: ``model_path``,
    ``tokenizer_path``, ``pretrained_path``, ``src_neg``, ``dst_neg``,
    ``mt_dev_path``, model sizes, optimisation settings, batch limits
    (``total_capacity``, ``batch``, ``max_seq_len``) and the step budget
    ``step``.

    Nothing is returned; checkpoints are written by the trainer under
    ``options.model_path``.
    """
    # exist_ok=True removes the check-then-create race of the previous
    # ``if not os.path.exists(...)`` guard.
    os.makedirs(options.model_path, exist_ok=True)

    text_processor = TextProcessor(options.tokenizer_path)
    # The rest of the pipeline is written against a padding id of 0.
    assert text_processor.pad_token_id() == 0
    num_processors = max(torch.cuda.device_count(), 1)

    mt_model = SenSim(text_processor=text_processor,
                      enc_layer=options.encoder_layer,
                      embed_dim=options.embed_dim,
                      intermediate_dim=options.intermediate_layer_dim)
    if options.pretrained_path is not None:
        # Initialise from a previously trained Seq2Seq checkpoint.
        pret = Seq2Seq.load(Seq2Seq,
                            options.pretrained_path,
                            tok_dir=options.tokenizer_path)
        mt_model.init_from_lm(pret)

    print("Model initialization done!")

    # NOTE: ``warump_steps`` is the keyword spelling used at every call site
    # of build_optimizer in view; it is kept as-is.
    optimizer = build_optimizer(mt_model,
                                options.learning_rate,
                                warump_steps=options.warmup)
    trainer = SenSimTrainer(model=mt_model,
                            mask_prob=options.mask_prob,
                            optimizer=optimizer,
                            clip=options.clip,
                            fp16=options.fp16)

    pin_memory = torch.cuda.is_available()
    mt_train_loader = SenSimTrainer.get_mt_train_data(
        mt_model, num_processors, options, pin_memory)

    def _neg_dataset(pickle_dir):
        # Both datasets share every setting except the pickle directory;
        # capacity and batch limits are 5x the regular per-processor values.
        return dataset.MassDataset(
            batch_pickle_dir=pickle_dir,
            max_batch_capacity=num_processors * options.total_capacity * 5,
            max_batch=num_processors * options.batch * 5,
            pad_idx=mt_model.text_processor.pad_token_id(),
            keep_pad_idx=False,
            max_seq_len=options.max_seq_len,
            keep_examples=False)

    src_neg_data = _neg_dataset(options.src_neg)
    dst_neg_data = _neg_dataset(options.dst_neg)

    # batch_size=1 at the DataLoader level; the datasets are configured with
    # their own batch limits above.
    src_neg_loader = data_utils.DataLoader(src_neg_data,
                                           batch_size=1,
                                           shuffle=True,
                                           pin_memory=pin_memory)
    dst_neg_loader = data_utils.DataLoader(dst_neg_data,
                                           batch_size=1,
                                           shuffle=True,
                                           pin_memory=pin_memory)

    mt_dev_loader = None
    if options.mt_dev_path is not None:
        mt_dev_loader = SenSimTrainer.get_mt_dev_data(
            mt_model,
            options,
            pin_memory,
            text_processor,
            trainer,
        )

    step, train_epoch = 0, 1
    # Large initial value for the trainer's best-loss tracker.
    trainer.best_loss = 1000000
    # train_epoch() returns the updated global step.
    while options.step > 0 and step < options.step:
        print("train epoch", train_epoch)
        step = trainer.train_epoch(mt_train_iter=mt_train_loader,
                                   max_step=options.step,
                                   mt_dev_iter=mt_dev_loader,
                                   saving_path=options.model_path,
                                   step=step,
                                   src_neg_iter=src_neg_loader,
                                   dst_neg_iter=dst_neg_loader)
        train_epoch += 1
def train(options):
    """Train an ``ImageCaptioning`` model with ``ImageCaptionTrainer``.

    Configuration is read from attributes of *options* (paths, model sizes,
    optimisation and decoding settings, ``local_rank``, ``accum``,
    ``mtl_weight`` and the step budget ``step``). Before training, the
    reference captions of the dev set are decoded to text and stored on
    ``trainer.caption_reference``.

    Nothing is returned; checkpoints are written by the trainer under
    ``options.model_path``.
    """
    lex_dict = None
    if options.dict_path is not None:
        lex_dict = get_lex_dict(options.dict_path)

    # exist_ok=True removes the check-then-create race of the previous
    # ``if not os.path.exists(...)`` guard.
    os.makedirs(options.model_path, exist_ok=True)

    text_processor = TextProcessor(options.tokenizer_path)
    # The rest of the pipeline is written against a padding id of 0.
    assert text_processor.pad_token_id() == 0

    if options.pretrained_path is not None:
        caption_model = Seq2Seq.load(ImageCaptioning,
                                     options.pretrained_path,
                                     tok_dir=options.tokenizer_path)
    else:
        caption_model = ImageCaptioning(
            use_proposals=lex_dict is not None,
            tie_embed=options.tie_embed,
            text_processor=text_processor,
            resnet_depth=options.resnet_depth,
            lang_dec=options.lang_decoder,
            enc_layer=options.encoder_layer,
            dec_layer=options.decoder_layer,
            embed_dim=options.embed_dim,
            intermediate_dim=options.intermediate_layer_dim,
            use_obj=not options.no_obj)

    # NOTE(review): the nesting of this block relative to the `else` above
    # was reconstructed; confirm against the original layout.
    if options.lm_path is not None:
        # In our case, this is an MT model.
        mt_pret_model = Seq2Seq.load(ImageMassSeq2Seq,
                                     options.lm_path,
                                     tok_dir=options.tokenizer_path)
        # The layer counts must match before the modules are swapped in.
        assert len(caption_model.encoder.encoder.layer) == len(
            mt_pret_model.encoder.encoder.layer)
        assert len(caption_model.decoder.decoder.layer) == len(
            mt_pret_model.decoder.decoder.layer)
        caption_model.encoder = mt_pret_model.encoder
        caption_model.decoder = mt_pret_model.decoder
        caption_model.output_layer = mt_pret_model.output_layer

    print("Model initialization done!")

    # We assume that the collator function returns a list with the size of
    # the number of GPUs; num_batches falls back to 1 when no GPU is visible.
    collator = dataset.ImageTextCollator()
    num_batches = max(1, torch.cuda.device_count())

    if options.continue_train:
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # resume from optimizer checkpoints you produced yourself.
        with open(os.path.join(options.pretrained_path, "optim"), "rb") as fp:
            optimizer = pickle.load(fp)
    else:
        optimizer = build_optimizer(caption_model,
                                    options.learning_rate,
                                    warump_steps=options.warmup)

    trainer = ImageCaptionTrainer(
        model=caption_model,
        mask_prob=options.mask_prob,
        optimizer=optimizer,
        clip=options.clip,
        beam_width=options.beam_width,
        max_len_a=options.max_len_a,
        max_len_b=options.max_len_b,
        len_penalty_ratio=options.len_penalty_ratio,
        fp16=options.fp16,
        mm_mode=options.mm_mode)

    pin_memory = torch.cuda.is_available()
    # Shuffling is only requested when local_rank is negative.
    img_train_loader = ImageMTTrainer.get_img_loader(
        collator,
        dataset.ImageCaptionDataset,
        options.train_path,
        caption_model,
        num_batches,
        options,
        pin_memory,
        lex_dict=lex_dict,
        shuffle=(options.local_rank < 0))

    num_processors = max(torch.cuda.device_count(), 1) if options.local_rank < 0 else 1
    mt_train_loader = None
    if options.mt_train_path is not None:
        mt_train_loader = ImageMTTrainer.get_mt_train_data(
            caption_model, num_processors, options, pin_memory, lex_dict=lex_dict)

    img_dev_loader = ImageMTTrainer.get_img_loader(
        collator,
        dataset.ImageCaptionTestDataset,
        options.dev_path,
        caption_model,
        num_batches,
        options,
        pin_memory,
        lex_dict=lex_dict,
        shuffle=False,
        denom=2)

    trainer.caption_reference = None
    if img_dev_loader is not None:
        trainer.caption_reference = defaultdict(list)
        # Unwrap a parallel wrapper (anything exposing `.module`) if present.
        generator = (trainer.generator.module
                     if hasattr(trainer.generator, "module")
                     else trainer.generator)
        # Loop-invariant lookup hoisted out of the nested loops below.
        decode = generator.seq2seq_model.text_processor.tokenizer.decode
        for data in img_dev_loader:
            for batch in data:
                for b in batch:
                    captions = b["captions"]
                    # `image_id` replaces a loop variable that shadowed the
                    # builtin `id`.
                    for image_id in captions:
                        for caption in captions[image_id]:
                            refs = get_outputs_until_eos(
                                text_processor.sep_token_id(),
                                caption,
                                remove_first_token=True)
                            decoded_refs = [decode(r.numpy()) for r in refs]
                            trainer.caption_reference[image_id] += decoded_refs
        print("Number of dev image/captions", len(trainer.caption_reference))

    mt_dev_loader = None
    if options.mt_dev_path is not None:
        mt_dev_loader = ImageMTTrainer.get_mt_dev_data(caption_model,
                                                       options,
                                                       pin_memory,
                                                       text_processor,
                                                       trainer,
                                                       lex_dict=lex_dict)
        print("Number of dev sentences", len(trainer.reference))

    # train_epoch() returns the updated global step.
    step, train_epoch = 0, 1
    while options.step > 0 and step < options.step:
        print("train epoch", train_epoch)
        step = trainer.train_epoch(img_data_iter=img_train_loader,
                                   img_dev_data_iter=img_dev_loader,
                                   max_step=options.step,
                                   lex_dict=lex_dict,
                                   mt_train_iter=mt_train_loader,
                                   saving_path=options.model_path,
                                   step=step,
                                   accum=options.accum,
                                   mt_dev_iter=mt_dev_loader,
                                   mtl_weight=options.mtl_weight)
        train_epoch += 1