def load_tokenizer_model(ckpt):
    """Load the NGRAMTokenizer and Consonant model from a training checkpoint."""
    state = torch.load(ckpt, map_location=torch.device('cpu'))
    tokenizer = NGRAMTokenizer(state['ngram'])
    config = AlbertConfig(**state['config_dict'])
    model = Consonant(config)
    model.load_state_dict(state['model_state_dict'])
    # Checkpoints are named like 'ckpt-0078000.bin'; recover the global step.
    step = int(ckpt.split('-')[-1].split('.')[0])
    return tokenizer, model, state['ngram'], step
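# Usage sketch for load_tokenizer_model (the checkpoint path follows the repo's
# 'ckpt-<step>.bin' naming and is illustrative; point it at a real checkpoint):
if __name__ == '__main__':
    tokenizer, model, ngram, step = load_tokenizer_model('../ckpt-0078000.bin')
    model.eval()  # inference mode; disables dropout
    print(f'Loaded {ngram}-gram model at step {step}')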
def write_examples(job_id, args, corpus_lines, phase):
    """A single process creating and writing out pre-processed examples."""
    # Each job receives its own slice of the corpus (N split_data files per job).

    def log(*args):
        msg = " ".join(map(str, args))
        print("Job {}:".format(job_id), msg)

    tokenizer = NGRAMTokenizer(args.ngram)
    example_writer = ExampleWriter(job_id,
                                   f"{args.output_dir}/{phase}",
                                   args.max_char_length,
                                   num_jobs=args.num_processes,
                                   tokenizer=tokenizer,
                                   blanks_separate_docs=False)
    log("Creating example writer")
    log("Writing wiki examples")
    example_writer.write_examples(input_corpus=corpus_lines)
    example_writer.finish()
    log("Done!")
def make_parser():
    parser = argparse.ArgumentParser()

    # model architecture configuration
    parser.add_argument('--max_position_embeddings', default=100, type=int)
    parser.add_argument('--embedding_size', default=128, type=int)
    parser.add_argument('--hidden_size', default=512, type=int)
    parser.add_argument('--intermediate_size', default=2048, type=int)
    parser.add_argument('--num_attention_heads', default=8, type=int)
    parser.add_argument('--num_hidden_layers', default=12, type=int)
    parser.add_argument('--num_hidden_groups', default=1, type=int)
    parser.add_argument('--ngram', default=3, type=int)
    parser.add_argument('--output_vocab_size', default=589, type=int)
    parser.add_argument('--type_vocab_size', default=1, type=int)
    parser.add_argument('--classifier_dropout_prob', default=0.1, type=float)

    # train/validation configuration
    parser.add_argument('--train_batch_size', default=780, type=int)
    parser.add_argument('--learning_rate', default=3e-4, type=float)
    parser.add_argument('--adam_epsilon', default=1e-6, type=float)
    parser.add_argument('--warmup_steps', default=10000, type=int)
    parser.add_argument('--weight_decay', default=0.01, type=float)
    parser.add_argument('--max_grad_norm', default=1.0, type=float)
    parser.add_argument('--max_steps', default=1000000, type=int)
    parser.add_argument('--save_checkpoint_steps', default=3000, type=int)
    parser.add_argument('--validation_step', default=0, type=int)
    parser.add_argument('--save_log_steps', default=100, type=int)
    parser.add_argument('--grad_accum_steps', type=int, default=1)

    # experiment configuration
    parser.add_argument('--exp_name', default='baseline_b780_n1', type=str)
    parser.add_argument('--output_dir', default='output', type=str)
    parser.add_argument('--gpus', default='0', type=str)
    parser.add_argument('--n_gpu', default=1, type=int)
    parser.add_argument('--num_workers', default=4, type=int)
    parser.add_argument('--seed', default=42, type=int, help='random seed for initialization')
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_eval', action='store_true')
    # type=bool would treat any non-empty string, even "False", as True
    parser.add_argument('--benchmark', action='store_true')

    args = parser.parse_args()
    # Head vocabulary size per n-gram order:
    # quad-gram: 456979 / tri-gram: 17579 / bi-gram: 679 / uni-gram: 29
    args.vocab_size = len(NGRAMTokenizer(args.ngram).head2id)
    args.pretrain_dataset_dir = f'/home/ubuntu/consonant_transformer/dataset/processed/comments_{args.ngram}_{args.max_position_embeddings}'
    return args
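# Sanity-check sketch for the head-vocabulary sizes quoted in the comment above
# (uni-gram 29 ... quad-gram 456979); building the quad-gram tokenizer may be slow:
if __name__ == '__main__':
    for n in range(1, 5):
        print(f'{n}-gram head vocab size:', len(NGRAMTokenizer(n).head2id))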
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--data-dir", default="/home/jovyan/dingbro/consonant_transformer/dataset", help="Location of data (vocab file, corpus, etc).") parser.add_argument("--input-file", default="raw_ratings.txt", type=str, help="Location of data (vocab file, corpus, etc).") parser.add_argument("--output-dir-prefix", default="ratings", type=str, help="Location of data (vocab file, corpus, etc).") parser.add_argument("--ngram", default=3, type=int, help="Number of n-gram for consonant tuples") parser.add_argument("--max-char-length", default=100, type=int, help="Number of tokens per example.") parser.add_argument("--num-processes", default=4, type=int, help="Parallelize across multiple processes.") parser.add_argument("--seed", default=777, type=int, help="Initial Seed") # parser.add_argument("--do-lower-case", dest='do_lower_case', # action='store_true', help="Lower case input text.") # parser.add_argument("--no-lower-case", dest='do_lower_case', # action='store_false', help="Don't lower case input text.") parser.set_defaults(do_lower_case=True) args = parser.parse_args() args.input_file = os.path.join(args.data_dir, 'raw', args.input_file) args.output_name = f"{args.output_dir_prefix}_{args.ngram}_{args.max_char_length}" args.output_dir = os.path.join(args.data_dir, 'processed', args.output_name) print('input', args.input_file) print('output', args.output_name) print("output dir", args.output_dir) if not os.path.isdir(args.output_dir): rmkdir(args.output_dir) rmkdir(args.output_dir + '/train') rmkdir(args.output_dir + '/val') # Read dataset and shuffle set_seed(args) with open(args.input_file, 'r') as f: lines = f.readlines() random.shuffle(lines) # Split dataset into train/val train_lines = lines[:int(len(lines) * 0.8)] val_lines = lines[int(len(lines) * 0.8):] print("Train set: ", len(train_lines), "Val set: ", len(val_lines)) tokenizer = NGRAMTokenizer(3) example_writer = ExampleWriter(0, args.output_dir + '/train', args.max_char_length, num_jobs=1, tokenizer=tokenizer, blanks_separate_docs=False) example_writer.write_examples(input_corpus=train_lines) example_writer.finish() example_writer = ExampleWriter(0, args.output_dir + '/val', args.max_char_length, num_jobs=1, tokenizer=tokenizer, blanks_separate_docs=False) example_writer.write_examples(input_corpus=val_lines) example_writer.finish()
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--data-dir", default="/home/ubuntu/consonant_transformer/dataset", help="Location of data (vocab file, corpus, etc).") parser.add_argument("--input-file", default="news_comments_0.txt", type=str, help="Location of data (vocab file, corpus, etc).") parser.add_argument("--output-dir-prefix", default="comments", type=str, help="Location of data (vocab file, corpus, etc).") parser.add_argument("--ngram", default=1, type=int, help="Number of n-gram for consonant tuples") parser.add_argument("--train-ratio", default=0.9, type=float, help="train-val ratio") parser.add_argument("--max-char-length", default=100, type=int, help="Number of tokens per example.") parser.add_argument("--num-processes", default=14, type=int, help="Parallelize across multiple processes.") parser.add_argument("--seed", default=777, type=int, help="Initial Seed") # parser.add_argument("--do-lower-case", dest='do_lower_case', # action='store_true', help="Lower case input text.") # parser.add_argument("--no-lower-case", dest='do_lower_case', # action='store_false', help="Don't lower case input text.") parser.set_defaults(do_lower_case=True) args = parser.parse_args() args.input_file = os.path.join(args.data_dir, 'raw', args.input_file) args.output_name = f"{args.output_dir_prefix}_{args.ngram}_{args.max_char_length}" args.output_dir = os.path.join(args.data_dir, 'processed', args.output_name) print('input', args.input_file) print('output', args.output_name) print("output dir", args.output_dir) if not os.path.isdir(args.output_dir): rmkdir(args.output_dir) rmkdir(args.output_dir + '/train') rmkdir(args.output_dir+'/val') # Read dataset and shuffle print("Starting reading file") set_seed(args) lines = [] for input_file in ['news_comments_0.txt', 'news_comments_1.txt', 'news_comments_2.txt', 'news_comments_3.txt']: fname = os.path.join(args.data_dir, 'raw', input_file) with open(fname, 'r') as f: lines += f.readlines() random.shuffle(lines) print(f"!! 
Read {len(lines)} lines !!") # Split dataset into train/val train_lines = lines[:int(len(lines) * args.train_ratio)] val_lines = lines[int(len(lines) * args.train_ratio):] print("Ngram: ", args.ngram) print("Max char lenght: ", args.max_char_length) print("Train set: ", len(train_lines), "Val set: ", len(val_lines)) tokenizer = NGRAMTokenizer(args.ngram) print("Head2id size: ", len(tokenizer.head2id)) print("Midtail2id size: ", len(tokenizer.midtail2id)) # example_writer = ExampleWriter(0, args.output_dir+'/train', args.max_char_length, num_jobs=1, tokenizer=tokenizer, blanks_separate_docs=False) # example_writer.write_examples(input_corpus=train_lines) # example_writer.finish() # example_writer = ExampleWriter(0, args.output_dir+'/val', args.max_char_length, num_jobs=1, tokenizer=tokenizer, blanks_separate_docs=False) # example_writer.write_examples(input_corpus=val_lines) # example_writer.finish() phase = 'train' if args.num_processes == 1: write_examples(0, args, train_lines, phase) else: split_lines = list(chunks(train_lines, args.num_processes)) jobs = [] for i in range(args.num_processes): job = multiprocessing.Process(target=write_examples, args=(i, args, split_lines[i], phase)) jobs.append(job) job.start() for job in jobs: job.join() phase = 'val' if args.num_processes == 1: write_examples(0, args, val_lines, phase) else: split_lines = list(chunks(val_lines, args.num_processes)) jobs = [] for i in range(args.num_processes): job = multiprocessing.Process(target=write_examples, args=(i, args, split_lines[i], phase)) jobs.append(job) job.start() for job in jobs: job.join()
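# `chunks` above is imported from elsewhere in the repo. A minimal sketch of the
# assumed behavior (split a list into n roughly equal contiguous slices, one per
# worker process); the repo's own implementation may differ:
def chunks(lst, n):
    """Yield n contiguous, roughly equal-sized slices of lst."""
    k, m = divmod(len(lst), n)
    for i in range(n):
        yield lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]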
import torch

from transformers import AlbertConfig

from consonant.model.modeling import Consonant
from consonant.model.tokenization import NGRAMTokenizer

if __name__ == '__main__':
    ckpt = '../ckpt-0078000.bin'
    # Trace on CPU; for GPU, use e.g. torch.device("cuda:0") when
    # torch.cuda.is_available().
    device = torch.device("cpu")
    state = torch.load(ckpt, map_location=device)
    print(state['ngram'])

    config = AlbertConfig(**state['config_dict'])
    # Zero out dropout so the traced graph is deterministic.
    config.attention_probs_dropout_prob = 0.0
    config.hidden_dropout_prob = 0.0
    print(config)

    model = Consonant(config)
    model.load_state_dict(state['model_state_dict'])
    model.eval()  # ensure tracing records inference-mode behavior

    tokenizer = NGRAMTokenizer(1)
    inputs = tokenizer.encode("sample text", max_char_length=100,
                              return_attention_mask=True)
    input_ids = torch.tensor([inputs["head_ids"]], dtype=torch.long)

    # Trace with example (input_ids, attention_mask) inputs; input_ids stands
    # in for the mask here.
    traced_model = torch.jit.trace(model, [input_ids, input_ids])
    torch.jit.save(traced_model, "traced_model.pt")
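# A minimal follow-up sketch: load the traced artifact back and run a forward
# pass (reuses torch and NGRAMTokenizer imported above; the decode step mirrors
# the serving handler's inference path below):
if __name__ == '__main__':
    loaded = torch.jit.load("traced_model.pt")
    tokenizer = NGRAMTokenizer(1)
    inputs = tokenizer.encode("sample text", max_char_length=100,
                              return_attention_mask=True)
    input_ids = torch.tensor([inputs["head_ids"]], dtype=torch.long)
    with torch.no_grad():
        output = loaded(input_ids, input_ids)
    predict_label = output[0].argmax(dim=2)
    print(tokenizer.decode_sent(input_ids[0].numpy(), predict_label[0].numpy()))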
class TransformersSeqClassifierHandler(BaseHandler, ABC):
    """
    Transformers handler class for sequence, token classification and
    question answering.
    """

    def __init__(self):
        super(TransformersSeqClassifierHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        torch.set_num_threads(1)
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")

        serialized_file = self.manifest['model']['serializedFile']
        model_pt_path = os.path.join(model_dir, serialized_file)
        setup_config_path = os.path.join(model_dir, "setup_config.json")

        if os.path.isfile(setup_config_path):
            with open(setup_config_path) as setup_config_file:
                self.setup_config = json.load(setup_config_file)
        else:
            logger.warning('Missing the setup_config.json file.')

        # Load the model and tokenizer from checkpoint and config files based
        # on the user's choice of save mode; further setup config can be added.
        # Serving runs on CPU; for GPU use e.g.
        # torch.device("cuda:" + str(properties.get("gpu_id"))) when
        # torch.cuda.is_available().
        self.device = torch.device("cpu")

        if self.setup_config["save_mode"] == "torchscript":
            self.model = torch.jit.load(model_pt_path)
            self.tokenizer = NGRAMTokenizer(self.setup_config["ngram"])
        elif self.setup_config["save_mode"] == "pretrained":
            state = torch.load(model_pt_path, map_location=self.device)
            config = AlbertConfig(**state['config_dict'])
            self.model = Consonant(config)
            self.model.load_state_dict(state['model_state_dict'])
            self.tokenizer = NGRAMTokenizer(state["ngram"])
        else:
            logger.warning('Missing the checkpoint or state_dict.')

        self.model.to(self.device)
        self.model.eval()

        logger.debug('Transformer model from path %s loaded successfully', model_pt_path)
        self.initialized = True

    def preprocess(self, data):
        """
        Basic text preprocessing, based on the user's choice of application mode.
        """
        logger.info(str(data))
        text = data[0].get("data")
        if text is None:
            text = data[0].get("body")  # raw request body, e.g. a posted text file

        if isinstance(text, dict):
            logger.info("Received dict input")
            input_text = text['text']
        else:
            input_text = text.decode('utf-8')

        max_length = int(self.setup_config["max_length"])
        logger.info("Received text: '%s'", input_text)
        # input_text = "안녕하세요? 반갑습니다. 오늘 날씨가 정말 끝내줘요. 너 너무 사랑스러워요"

        inputs = self.tokenizer.encode(input_text,
                                       max_char_length=max_length,
                                       return_attention_mask=True)
        return inputs

    def inference(self, inputs):
        """
        Predict the class (or classes) of the received text using the
        serialized transformers checkpoint.
        """
        input_ids = torch.tensor([inputs["head_ids"]],
                                 dtype=torch.long).to(self.device)
        attention_masks = torch.tensor([inputs["attention_masks"]],
                                       dtype=torch.bool).to(self.device)

        with torch.no_grad():
            output = self.model(input_ids, attention_masks)

        # argmax over the class dimension of the (batch, seq_len, num_classes) logits
        predict_label = output[0].argmax(dim=2)
        predict_string = self.tokenizer.decode_sent(
            input_ids[0].detach().cpu().numpy(),
            predict_label[0].detach().cpu().numpy())
        logger.info("Model predicted: '%s'", predict_string)
        return [{'predict': predict_string}]

    def postprocess(self, inference_output):
        # TODO: Add any needed post-processing of the model predictions here
        return inference_output
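# A minimal client sketch for the handler above, assuming the model archive is
# served with TorchServe under a hypothetical model name "consonant" on the
# default inference port; the JSON body takes the dict branch in preprocess():
import requests

response = requests.post('http://localhost:8080/predictions/consonant',
                         json={'text': '안녕하세요? 반갑습니다.'})
print(response.json())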