def load_my_dataset(filepath, datafolderpath, labelfilepath, test_ratio=0.9):
    """Load the listed images, label them by parent folder and split them.

    Args:
        filepath: Text file with one image path per line; each path is
            joined onto ``datafolderpath``. Blank lines are ignored.
        datafolderpath: Folder the listed paths are relative to.
        labelfilepath: Path handed to ``load_label`` to obtain the label list.
        test_ratio: Fraction of the shuffled samples that goes into the
            FIRST returned dataset (the training set). Despite its name it
            is the training share; the remainder becomes the test set.

    Returns:
        A ``(train, test)`` pair of ``TupleDataset`` objects, each pairing
        the loaded images with the index of their label in the label list.

    Raises:
        Whatever ``labellist.index`` raises when an image's parent folder
        name is not a known label (``ValueError`` if the label list is a
        plain list -- TODO confirm against ``load_label``).
    """
    labellist = load_label(labelfilepath)
    with open(filepath, "r") as readfile:
        # A blank line would otherwise resolve to the data folder itself
        # and be handed to load_data as if it were an image.
        filepathlist = [
            os.path.join(datafolderpath, line.strip())
            for line in readfile
            if line.strip()
        ]

    datasets = []
    for imagefilepath in filepathlist:
        print(imagefilepath)
        img = load_data(imagefilepath)
        # The label is the name of the folder that directly contains the image.
        label = imagefilepath.split(os.path.sep)[-2]
        datasets.append([img, labellist.index(label)])

    random.shuffle(datasets)
    split_at = int(len(datasets) * test_ratio)
    traindatasets = datasets[:split_at]
    testdatasets = datasets[split_at:]

    train_imgs = [sample[0] for sample in traindatasets]
    train_labels = [sample[1] for sample in traindatasets]
    test_imgs = [sample[0] for sample in testdatasets]
    test_labels = [sample[1] for sample in testdatasets]
    return chainer.datasets.tuple_dataset.TupleDataset(train_imgs, train_labels), \
        chainer.datasets.tuple_dataset.TupleDataset(test_imgs, test_labels)
def data_load(self, label):
    """Load the train and test images and encoded labels for ``label``.

    Data is read from ``<opt.dataroot>/<opt.cap_scheme>/<label>``: images
    from ``train/<i>.png`` and ``test/<i>.png``, label text from
    ``<label>_train.txt`` and ``<label>_test.txt``.

    Side effects:
        Sets ``self.num_train_samples`` (capped at ``opt.train_size``) and
        ``self.num_test_sample`` (capped at 2000).

    Args:
        label: Name of the data sub-directory; also the prefix of the two
            label files.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` where each ``x`` is a uint8
        array of shape ``(n, opt.loadHeight, opt.loadWidth, 1)`` and each
        ``y`` is a uint8 array of shape ``(n, opt.cap_len * opt.char_set_len)``.
        A row of ``y`` whose label could not be encoded is all zeros.
    """
    data_path = os.path.join(self.opt.dataroot, self.opt.cap_scheme, label)  # ex. ../data/min24/synthetic
    # number of train data in directory
    self.num_train_samples = min(
        self.opt.train_size,
        len(os.listdir(os.path.join(data_path, 'train'))))
    # number of test data in directory
    self.num_test_sample = min(
        2000,
        len(os.listdir(os.path.join(data_path, 'test'))))

    def load_split(split, count):
        # Load `count` images and encoded labels of one split ('train'/'test').
        images = np.empty(
            (count, self.opt.loadHeight, self.opt.loadWidth, 1),
            dtype='uint8')
        targets = np.empty(
            (count, self.opt.cap_len * self.opt.char_set_len),
            dtype='uint8')
        texts = util.load_label(
            os.path.join(data_path, label + '_' + split + '.txt'))
        for i in range(count):
            img_name = os.path.join(data_path, split, str(i) + '.png')
            images[i, :, :, :] = util.load_image(img_name)
            try:
                targets[i, :] = self.text2vec(texts[i])
            except Exception:
                # Best effort: report the offending index and keep going,
                # but do not leave uninitialised memory in the target row.
                print(i)
                targets[i, :] = 0
        return images, targets

    # load training set
    x_train, y_train = load_split('train', self.num_train_samples)
    # load test set
    x_test, y_test = load_split('test', self.num_test_sample)
    return (x_train, y_train), (x_test, y_test)
def main():
    """Restore a trained classifier, predict one image and print the result."""
    (hidden1_units, hidden2_units, num_outputs,
     model_file, label_file, image_file) = importingargs()

    # Rebuild the network with the same layer sizes, then load its weights.
    network = LinearNet(hidden1_units, hidden2_units, num_outputs)
    model = L.Classifier(network)
    chainer.serializers.load_npz(model_file, model)

    image = load_data(image_file)
    label_names = load_label(label_file)

    index, prob, label = predict(model, image, label_names)
    summary = "index: %d, prob: %f, label: %s" % (index, prob, label)
    print(summary)
def train_classifier():
    """Fit a random forest on vectorized text, save it and print test accuracy.

    The train/test split comes from ``load_label(split=True)``. The already
    fitted vectorizer is loaded from ``data_path / "tfidf_vectorizer.pkl"``
    and the trained classifier is written to ``model_path / "classifier.pkl"``.

    Returns:
        None. The accuracy on the test split is printed to stdout.
    """
    X_train, y_train, X_test, y_test = load_label(split=True)
    # vectorizer dumped on cleaning.py
    vectorizer = joblib.load(data_path / "tfidf_vectorizer.pkl")

    classifier = RandomForestClassifier(max_depth=10, random_state=42)
    classifier.fit(vectorizer.transform(X_train), y_train)
    joblib.dump(classifier, model_path / "classifier.pkl")

    # Predict the whole test split in one call instead of one call per
    # sample, and pass (y_true, y_pred) in the order accuracy_score documents.
    predictions = classifier.predict(vectorizer.transform(X_test))
    print(accuracy_score(y_test, predictions))
def load_model(self, checkpoint):
    """Build the model selected by ``self.config`` and load ``checkpoint``.

    The labels returned by ``load_label(opt.label_path)`` are stored in
    ``config['labels']`` and ``self.labels``. ``config['emb_class']`` picks
    between the GloVe models and the transformer-based models, and
    ``config['enc_class']`` picks the encoder within that family.

    Args:
        checkpoint: State dict passed to ``model.load_state_dict``.

    Returns:
        The model with the checkpoint loaded, moved to ``opt.device``.

    Raises:
        ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is not
            one of the supported GloVe encoders.
    """
    config = self.config
    opt = config['opt']
    labels = load_label(opt.label_path)
    label_size = len(labels)
    config['labels'] = labels
    self.labels = labels

    if config['emb_class'] == 'glove':
        enc_class = config['enc_class']
        if enc_class == 'gnb':
            model = TextGloveGNB(config, opt.embedding_path, label_size)
        elif enc_class == 'cnn':
            model = TextGloveCNN(config, opt.embedding_path, label_size,
                                 emb_non_trainable=True)
        elif enc_class == 'densenet-cnn':
            model = TextGloveDensenetCNN(config, opt.embedding_path, label_size,
                                         emb_non_trainable=True)
        elif enc_class == 'densenet-dsa':
            model = TextGloveDensenetDSA(config, opt.embedding_path, label_size,
                                         emb_non_trainable=True)
        else:
            # Previously this fell through with `model` unbound and failed
            # later with an UnboundLocalError.
            raise ValueError(
                "unsupported enc_class for glove: {!r}".format(enc_class))
    else:
        from transformers import AutoTokenizer, AutoConfig, AutoModel
        bert_config = AutoConfig.from_pretrained(opt.bert_output_dir)
        bert_tokenizer = AutoTokenizer.from_pretrained(opt.bert_output_dir)
        # Only the architecture is built here; the weights come from the
        # checkpoint loaded below.
        bert_model = AutoModel.from_config(bert_config)
        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer,
                           label_size)

    model.load_state_dict(checkpoint)
    model = model.to(opt.device)
    logger.info("[Model loaded]")
    return model
def load_model(config, checkpoint):
    """Build the model selected by ``config`` and load ``checkpoint`` into it.

    The labels returned by ``load_label(opt.label_path)`` are stored in
    ``config['labels']``. When ``opt.enable_qat`` or ``opt.enable_qat_fx`` is
    set, the model is prepared for quantization-aware training and converted
    to its quantized form BEFORE the checkpoint is loaded.

    Args:
        config: Dict holding at least ``'opt'``, ``'emb_class'`` and
            ``'enc_class'``; ``'labels'`` is written by this function.
        checkpoint: State dict passed to ``model.load_state_dict``.

    Returns:
        The model with the checkpoint loaded, moved to ``opt.device``.

    Raises:
        ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is not
            one of the supported GloVe encoders.
        AssertionError: If ``opt.enable_qat`` is set and ``opt.device`` is
            not ``'cpu'``.
    """
    opt = config['opt']
    labels = load_label(opt.label_path)
    label_size = len(labels)
    config['labels'] = labels

    if config['emb_class'] == 'glove':
        enc_class = config['enc_class']
        if enc_class == 'gnb':
            model = TextGloveGNB(config, opt.embedding_path, label_size)
        elif enc_class == 'cnn':
            model = TextGloveCNN(config, opt.embedding_path, label_size,
                                 emb_non_trainable=True)
        elif enc_class == 'densenet-cnn':
            model = TextGloveDensenetCNN(config, opt.embedding_path, label_size,
                                         emb_non_trainable=True)
        elif enc_class == 'densenet-dsa':
            model = TextGloveDensenetDSA(config, opt.embedding_path, label_size,
                                         emb_non_trainable=True)
        else:
            # Previously this fell through with `model` unbound and failed
            # later with an UnboundLocalError.
            raise ValueError(
                "unsupported enc_class for glove: {!r}".format(enc_class))
    else:
        from transformers import AutoTokenizer, AutoConfig, AutoModel
        bert_config = AutoConfig.from_pretrained(opt.bert_output_dir)
        bert_tokenizer = AutoTokenizer.from_pretrained(opt.bert_output_dir)
        # Only the architecture is built here; the weights come from the
        # checkpoint loaded below.
        bert_model = AutoModel.from_config(bert_config)
        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer,
                           label_size)

    if opt.enable_qat:
        assert opt.device == 'cpu'
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # NOTE: module fusion (torch.quantization.fuse_modules) is not
        # applied here.
        model = torch.quantization.prepare_qat(model)
        model.eval()
        model.to('cpu')
        logger.info("[Convert to quantized model with device=cpu]")
        model = torch.quantization.convert(model)

    if opt.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)
        logger.info("[Convert to quantized model]")
        model = quantize_fx.convert_fx(model)

    model.load_state_dict(checkpoint)
    model = model.to(opt.device)
    # Debug aid: iterate model.named_parameters() here to inspect each
    # parameter's data, device and requires_grad.
    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[Model loaded]")
    return model
parser.add_argument("--label-num", "-l", type=int, default=5) parser.add_argument("--modelpath", "-mf", help="model path") parser.add_argument("--labelfilepath", "-lf", help="labelfile") parser.add_argument("--imagefilepath", "-if", help="imagefilepath you want to predict") args = parser.parse_args() return args.first_hidden_layer_units,\ args.second_hidden_layer_units, args.label_num,\ args.modelpath, args.labelfilepath, args.imagefilepath if __name__ == "__main__": n_units_h1, n_units_h2, n_out,\ modelpath, labelfilepath, imagefilepath = importingargs() model = L.Classifier(LinearNet(n_units_h1, n_units_h2, n_out)) chainer.serializers.load_npz(modelpath, model) labellist = load_label(labelfilepath) cascade_path = 'haarcascade_frontalface_alt.xml' face_cascade = cv2.CascadeClassifier(cascade_path) cap = cv2.VideoCapture(0) frame_count = 0 face_count = 0 while True: frame_count += 1 face_count = 0 # 内蔵カメラから読み込んだキャプチャデータを取得 ret, frame = cap.read()
def load_model(config, checkpoint):
    """Build the model selected by ``config`` and restore ``checkpoint``.

    The labels returned by ``load_label(args.label_path)`` are stored in
    ``config['labels']``. For non-GloVe embeddings the tokenizer, config and
    backbone are created according to ``config['emb_class']`` (KoBART, GPT,
    T5, or the generic Auto* classes). When ``args.enable_qat`` or
    ``args.enable_qat_fx`` is set, the model is prepared and converted to its
    quantized form BEFORE the checkpoint is restored.

    Args:
        config: Dict holding at least ``'args'``, ``'emb_class'`` and
            ``'enc_class'`` (and ``'use_kobart'`` when ``emb_class`` is
            ``'bart'``). ``'labels'`` is written by this function, and
            ``'quantizer'`` too when ``args.enable_diffq`` is set.
        checkpoint: State passed to ``model.load_state_dict``, or to
            ``DiffQuantizer.restore_quantized_state`` when
            ``args.enable_diffq`` is set.

    Returns:
        The restored model, moved to ``args.device``.

    Raises:
        ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is not
            one of the supported GloVe encoders.
        AssertionError: If ``args.enable_qat`` is set and ``args.device`` is
            not ``'cpu'``.
    """
    args = config['args']
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels

    if config['emb_class'] == 'glove':
        enc_class = config['enc_class']
        if enc_class == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        elif enc_class == 'cnn':
            model = TextGloveCNN(config, args.embedding_path, label_size,
                                 emb_non_trainable=True)
        elif enc_class == 'densenet-cnn':
            model = TextGloveDensenetCNN(config, args.embedding_path, label_size,
                                         emb_non_trainable=True)
        elif enc_class == 'densenet-dsa':
            model = TextGloveDensenetDSA(config, args.embedding_path, label_size,
                                         emb_non_trainable=True)
        else:
            # Previously this fell through with `model` unbound and failed
            # later with an UnboundLocalError.
            raise ValueError(
                "unsupported enc_class for glove: {!r}".format(enc_class))
    else:
        if config['emb_class'] == 'bart' and config['use_kobart']:
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
            bert_config = bert_model.config
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(args.bert_output_dir)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = AutoModel.from_pretrained(args.bert_output_dir)
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(args.bert_output_dir)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = T5EncoderModel(bert_config)
        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(args.bert_output_dir)
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = AutoModel.from_config(bert_config)

        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        if config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer,
                           label_size)

    if args.enable_qat:
        assert args.device == 'cpu'
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # NOTE: module fusion (torch.quantization.fuse_modules) is not
        # applied here.
        model = torch.quantization.prepare_qat(model)
        model.eval()
        model.to('cpu')
        logger.info("[Convert to quantized model with device=cpu]")
        model = torch.quantization.convert(model)

    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)
        logger.info("[Convert to quantized model]")
        model = quantize_fx.convert_fx(model)

    if args.enable_diffq:
        # The quantizer is kept in config so callers can reach it later.
        quantizer = DiffQuantizer(model)
        config['quantizer'] = quantizer
        quantizer.restore_quantized_state(checkpoint)
    else:
        model.load_state_dict(checkpoint)

    model = model.to(args.device)
    # Debug aid: iterate model.named_parameters() here to inspect each
    # parameter's data, device and requires_grad.
    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[Model loaded]")
    return model
def prepare_model(config, bert_model_name_or_path=None):
    """Build the model selected by ``config`` ready for training.

    The labels returned by ``load_label(args.label_path)`` are stored in
    ``config['labels']``. For non-GloVe embeddings a pretrained tokenizer and
    backbone are loaded according to ``config['emb_class']`` (KoBART, GPT,
    T5, or the generic Auto* classes) and passed through
    ``reduce_bert_model`` before the classifier is constructed.

    Args:
        config: Dict holding at least ``'args'``, ``'emb_class'`` and
            ``'enc_class'`` (and ``'use_kobart'`` when ``emb_class`` is
            ``'bart'``); ``'labels'`` is written by this function.
        bert_model_name_or_path: Optional override for
            ``args.bert_model_name_or_path``; used when truthy.

    Returns:
        The constructed model. A checkpoint is loaded first when
        ``args.restore_path`` is set, and the model is prepared for
        quantization-aware training when ``args.enable_qat`` or
        ``args.enable_qat_fx`` is set.

    Raises:
        ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is not
            one of the supported GloVe encoders.
    """
    args = config['args']
    emb_non_trainable = not args.embedding_trainable
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels

    # prepare model
    if config['emb_class'] == 'glove':
        enc_class = config['enc_class']
        if enc_class == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        elif enc_class == 'cnn':
            model = TextGloveCNN(config, args.embedding_path, label_size,
                                 emb_non_trainable=emb_non_trainable)
        elif enc_class == 'densenet-cnn':
            model = TextGloveDensenetCNN(config, args.embedding_path, label_size,
                                         emb_non_trainable=emb_non_trainable)
        elif enc_class == 'densenet-dsa':
            model = TextGloveDensenetDSA(config, args.embedding_path, label_size,
                                         emb_non_trainable=emb_non_trainable)
        else:
            # Previously this fell through with `model` unbound and failed
            # later with an UnboundLocalError.
            raise ValueError(
                "unsupported enc_class for glove: {!r}".format(enc_class))
    else:
        model_name_or_path = args.bert_model_name_or_path
        if bert_model_name_or_path:
            model_name_or_path = bert_model_name_or_path

        if config['emb_class'] == 'bart' and config['use_kobart']:
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            bert_model = AutoModel.from_pretrained(
                model_name_or_path,
                from_tf=bool(".ckpt" in model_name_or_path))
            # 3 new tokens added
            bert_model.resize_token_embeddings(len(bert_tokenizer))
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = T5EncoderModel.from_pretrained(
                model_name_or_path,
                from_tf=bool(".ckpt" in model_name_or_path))
        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_model = AutoModel.from_pretrained(
                model_name_or_path,
                from_tf=bool(".ckpt" in model_name_or_path))

        bert_config = bert_model.config
        # bert model reduction
        reduce_bert_model(config, bert_model, bert_config)

        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        if config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer,
                           label_size,
                           feature_based=args.bert_use_feature_based,
                           finetune_last=args.bert_use_finetune_last)

    if args.restore_path:
        checkpoint = load_checkpoint(args.restore_path)
        model.load_state_dict(checkpoint)

    if args.enable_qat:
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # NOTE: module fusion (torch.quantization.fuse_modules) is not
        # applied here.
        model = torch.quantization.prepare_qat(model)

    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        model.train()
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)

    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[model prepared]")
    return model
def prepare_model(config, bert_model_name_or_path=None):
    """Build the model selected by ``config`` ready for training.

    The labels returned by ``load_label(opt.label_path)`` are stored in
    ``config['labels']``. For non-GloVe embeddings a pretrained tokenizer and
    backbone are loaded with the Auto* classes and passed through
    ``reduce_bert_model`` before the classifier is constructed.

    Args:
        config: Dict holding at least ``'opt'``, ``'emb_class'`` and
            ``'enc_class'``; ``'labels'`` is written by this function.
        bert_model_name_or_path: Optional override for
            ``opt.bert_model_name_or_path``; used when truthy.

    Returns:
        The constructed model after ``model.to(opt.device)``. A checkpoint is
        loaded first when ``opt.restore_path`` is set, and the model is
        prepared for quantization-aware training when ``opt.enable_qat`` or
        ``opt.enable_qat_fx`` is set.

    Raises:
        ValueError: If ``emb_class`` is ``'glove'`` and ``enc_class`` is not
            one of the supported GloVe encoders.
    """
    opt = config['opt']
    emb_non_trainable = not opt.embedding_trainable
    labels = load_label(opt.label_path)
    label_size = len(labels)
    config['labels'] = labels

    # prepare model
    if config['emb_class'] == 'glove':
        enc_class = config['enc_class']
        if enc_class == 'gnb':
            model = TextGloveGNB(config, opt.embedding_path, label_size)
        elif enc_class == 'cnn':
            model = TextGloveCNN(config, opt.embedding_path, label_size,
                                 emb_non_trainable=emb_non_trainable)
        elif enc_class == 'densenet-cnn':
            model = TextGloveDensenetCNN(config, opt.embedding_path, label_size,
                                         emb_non_trainable=emb_non_trainable)
        elif enc_class == 'densenet-dsa':
            model = TextGloveDensenetDSA(config, opt.embedding_path, label_size,
                                         emb_non_trainable=emb_non_trainable)
        else:
            # Previously this fell through with `model` unbound and failed
            # later with an UnboundLocalError.
            raise ValueError(
                "unsupported enc_class for glove: {!r}".format(enc_class))
    else:
        model_name_or_path = opt.bert_model_name_or_path
        if bert_model_name_or_path:
            model_name_or_path = bert_model_name_or_path

        from transformers import AutoTokenizer, AutoConfig, AutoModel
        bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        bert_model = AutoModel.from_pretrained(
            model_name_or_path,
            from_tf=bool(".ckpt" in model_name_or_path))
        bert_config = bert_model.config
        # bert model reduction
        reduce_bert_model(config, bert_model, bert_config)

        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer,
                           label_size,
                           feature_based=opt.bert_use_feature_based)

    if opt.restore_path:
        checkpoint = load_checkpoint(opt.restore_path, device=opt.device)
        model.load_state_dict(checkpoint)

    if opt.enable_qat:
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # NOTE: module fusion (torch.quantization.fuse_modules) is not
        # applied here.
        model = torch.quantization.prepare_qat(model)

    if opt.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        model.train()
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)

    # NOTE(review): placed at function level so the model is moved to the
    # device whether or not quantization is enabled -- confirm against the
    # original indentation.
    model.to(opt.device)
    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[model prepared]")
    return model