def train_mentioned_model(train_data, train_segs, validate_data, validate_segs,
                          vectorizer, train_model):
    model_name = train_model[0]
    start = train_model[1]
    end = train_model[2]
    logger.info("start training %s mentioned model", model_name)
    train_data_size = config.train_data_size
    sum_label_val = (end - start + 1) * 2
    column_list = range(start, end + 1)
    ori_labels = train_data.iloc[0:train_data_size, column_list]
    # Convert labels: the floor division yields 1 only when every label in
    # the column group equals -2 (the "mentioned" flag for this item);
    # any other combination yields 0.
    train_label = ori_labels.T.sum().abs() // sum_label_val
    logger.debug("begin to train data")
    cw = "balanced"
    mentioned_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
    mentioned_clf.fit(train_segs, train_label)

    logger.debug("begin to validate %s mentioned model", model_name)
    # Build the validation labels the same way.
    ori_labels = validate_data.iloc[0:, column_list]
    validate_labels = ori_labels.T.sum().abs() // sum_label_val
    y_pre = mentioned_clf.predict(validate_segs)
    report(validate_labels, y_pre)
    score = f1_score(validate_labels, y_pre, average="macro")
    logger.info("validate done! %s mentioned model score: %s", model_name,
                str(score))

    # Persist the classifier only if it clears the quality bar.
    if score > 0.8:
        logger.info("save %s mentioned model", model_name)
        model_save_path = config.model_save_path
        if not os.path.exists(model_save_path):
            os.makedirs(model_save_path)
        joblib.dump(mentioned_clf,
                    model_save_path + model_name + "_mentioned.pkl",
                    compress=3)
    return mentioned_clf
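# --- A minimal usage sketch for train_mentioned_model (not from the source).
# It assumes the helpers and config attributes used in the neighboring
# snippets (load_data_from_csv, seg_words, vec_name); config.train_data_path
# and the ("location", 2, 4) column span are illustrative placeholders.
train_df = load_data_from_csv(config.train_data_path)
validate_df = load_data_from_csv(config.validate_data_path)
train_segs = seg_words(train_df.content.iloc[0:config.train_data_size])
validate_segs = seg_words(validate_df.content)
vectorizer = joblib.load(config.model_save_path + vec_name)

# train_model is a (model_name, first_label_column, last_label_column) tuple.
clf = train_mentioned_model(train_df, train_segs, validate_df, validate_segs,
                            vectorizer, ("location", 2, 4))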
def train_specific_model(train_data):
    columns = train_data.columns.values.tolist()
    logger.debug("begin to seg train content")
    content_segments = seg_words(
        train_data.content.iloc[0:config.train_data_size])
    logger.debug("seg train content done")
    vectorizer = joblib.load(config.model_save_path + vec_name)
    logger.debug("load vectorizer")
    validate_data_df = load_data_from_csv(config.validate_data_path)
    validate_segs = seg_words(validate_data_df.content)
    logger.debug("seg validate content")
    scores = dict()
    for model_name in columns[:-1]:
        logger.info("begin to train %s model", model_name)
        # Grid of candidate class-weight dicts over the four label values,
        # presumably searched inside TextClassifier.
        cw = [{
            -2: a,
            -1: b,
            0: w,
            1: x
        } for a in range(1, 3) for b in range(5, 8) for w in range(8, 12)
          for x in range(5, 8)]
        # cw = {0: 7, 1: 6, -1: 6, -2: 1}
        positive_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
        y_label = train_data[model_name].iloc[0:config.train_data_size]
        positive_clf.fit(content_segments, y_label)
        y_pre = positive_clf.predict(validate_segs)
        y_true = validate_data_df[model_name].iloc[0:]
        report(y_true, y_pre)
        score = f1_score(y_true, y_pre, average="macro")
        logger.info("score for model %s is %s", model_name, str(score))
        scores[model_name] = score
        joblib.dump(positive_clf,
                    config.model_save_path + model_name + ".pkl",
                    compress=True)
    score = np.mean(list(scores.values()))
    logger.info("mean f1_score: %s", score)
def build_model(vocab_size, embedding_dims, conv_channels,
                embeddings=None) -> Module:
    model = TextClassifier(vocab_size, embedding_dims, conv_channels, 2,
                           embeddings=embeddings)
    return model
def build_model(configs, embeddings=None) -> Module:
    _vocab_size = configs['vocab_size']
    _embedding_dims = configs['embedding_size']
    _conv_block_in_channels = configs['conv_block_in_channels']
    _conv_block_out_channels = configs['conv_block_out_channels']
    _linear_in_channels = configs['linear_in_channels']
    _linear_out_channels = configs['linear_out_channels']
    _k_max_pool = configs['k_max_pool']
    _num_classes = configs['num_classes']
    model = TextClassifier(_vocab_size, _embedding_dims,
                           _conv_block_in_channels, _conv_block_out_channels,
                           _linear_in_channels, _linear_out_channels,
                           _k_max_pool, _num_classes,
                           embeddings=embeddings)
    return model
def build_model(configs, embeddings=None) -> Module:
    _vocab_size = configs['vocab_size']
    _embedding_dims = configs['embedding_size']
    _conv_block_in_channels = configs['conv_block_in_channels']
    _conv_block_out_channels = configs['conv_block_out_channels']
    _conv_block_kernel_size = configs['conv_block_kernel_size']
    _bi_rnn_in_channels = configs['bi_rnn_in_channels']
    _bi_rnn_out_channels = configs['bi_rnn_out_channels']
    _num_classes = configs['num_classes']
    model = TextClassifier(_vocab_size, _embedding_dims,
                           _conv_block_in_channels, _conv_block_out_channels,
                           _conv_block_kernel_size,
                           _bi_rnn_in_channels, _bi_rnn_out_channels,
                           _num_classes,
                           embeddings=embeddings)
    return model
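# --- A hedged sketch of driving the config-based build_model variant above
# (not from the source); every size in the dict is an illustrative
# placeholder, not a value taken from the project.
configs = {
    'vocab_size': 50000,
    'embedding_size': 300,
    'conv_block_in_channels': 300,
    'conv_block_out_channels': 128,
    'conv_block_kernel_size': 3,
    'bi_rnn_in_channels': 128,
    'bi_rnn_out_channels': 64,
    'num_classes': 4,
}
model = build_model(configs)  # or build_model(configs, embeddings=pretrained)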
import config
import numpy as np
from sklearn import metrics
from model import TextClassifier

# Load the preprocessed data.
Train_seq_set = np.load(config.train_sequence_path)
Train_tags_set = np.load(config.train_tags_path)
Train_label_set = np.load(config.train_label_path)
Validation_seq = np.load(config.validation_sequence_path)
Validation_tags_set = np.load(config.validation_tags_path)
Validation_label = np.load(config.validation_label_path)

# Concatenate token sequences with their tag features along the feature axis.
train_input = np.concatenate((Train_seq_set, Train_tags_set), axis=1)
validation_input = np.concatenate((Validation_seq, Validation_tags_set), axis=1)

clf = TextClassifier()
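# --- The snippet stops right after constructing the classifier; a hedged
# continuation, assuming this TextClassifier exposes a scikit-learn-style
# fit/predict interface (the method names are assumptions, not from the source).
clf.fit(train_input, Train_label_set)
predictions = clf.predict(validation_input)
print(metrics.f1_score(Validation_label, predictions, average="macro"))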
# Call head reconstructed from context: the arguments match torchtext's
# BucketIterator.splits, and train_iterator/valid_iterator are used below.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size=args['batch_size'],
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)

# print preprocessed text
# print(vars(training_data.examples[0]))

wandb_logger = WandbLogger(name=args['wandb_run_name'],
                           project=args['wandb_project_name'])
wandb_logger.log_hyperparams(args)

model = TextClassifier(args,
                       TEXT=TEXT,
                       LABEL=LABEL,
                       train_iterator=train_iterator,
                       valid_iterator=valid_iterator,
                       wandb_logger=wandb_logger)
# wandb.watch(model)

trainer = pl.Trainer(gpus=int(args['gpus']),
                     progress_bar_refresh_rate=args['progress_bar_refresh_rate'],
                     max_epochs=args['epochs'],
                     logger=[wandb_logger],
                     early_stop_callback=True)
trainer.fit(model)

ckpt_base_path = os.path.dirname(args['model_ckpt_path'])
os.makedirs(ckpt_base_path, exist_ok=True)
trainer.save_checkpoint(args['model_ckpt_path'])
from Preprocessing import Preprocessor
from model import TextClassifier
import config
import numpy as np
from sklearn import metrics

# Load the preprocessed data.
Train_seq_set = np.load(config.train_sequence_path)
Train_label_set = np.load(config.train_label_path)
Validation_seq = np.load(config.validation_sequence_path)
Validation_label = np.load(config.validation_label_path)

p = Preprocessor()

# One LSTM classifier per group of related labels.
model1 = TextClassifier(len(config.class_group[0]), 'lstm')
model2 = TextClassifier(len(config.class_group[1]), 'lstm')
model3 = TextClassifier(len(config.class_group[2]), 'lstm')
model4 = TextClassifier(len(config.class_group[3]), 'lstm')
model5 = TextClassifier(len(config.class_group[4]), 'lstm')
model6 = TextClassifier(len(config.class_group[5]), 'lstm')

F1 = []
for epoch in range(30):
    # Train the first model, reshuffling its training data every epoch.
    train_seq1, train_label1 = p.shuffle(Train_seq_set[0], Train_label_set[0])
    model1.train(train_seq1,
                 [train_label1[j] for j in range(len(config.class_group[0]))],
                 Validation_seq,
                 [Validation_label[config.class_group[0][j]]
                  for j in range(len(config.class_group[0]))])
content_train = train_data_df.iloc[:, 1]
content_train = seg_words(content_train)
columns = train_data_df.columns.values.tolist()

# Fit a shared TF-IDF vectorizer on the segmented training content.
vectorizer_tfidf = TfidfVectorizer(analyzer='word',
                                   ngram_range=(1, 5),
                                   min_df=5,
                                   norm='l2')
vectorizer_tfidf.fit(content_train)

# Model train: one classifier per label column.
classifier_dict = dict()
for column in columns[2:]:
    label_train = train_data_df[column]
    text_classifier = TextClassifier(vectorizer=vectorizer_tfidf)
    text_classifier.fit(content_train, label_train)
    classifier_dict[column] = text_classifier

# Validate each model on the held-out data.
content_validate = validate_data_df.iloc[:, 1]
content_validate = seg_words(content_validate)
f1_score_dict = dict()
for column in columns[2:]:
    label_validate = validate_data_df[column]
    text_classifier = classifier_dict[column]
    f1_score = text_classifier.get_f1_score(content_validate, label_validate)
    f1_score_dict[column] = f1_score
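# --- A hedged follow-up (not from the source) that aggregates the per-column
# scores into a single mean, mirroring the macro-averaged reporting in the
# earlier snippets.
import numpy as np

for column, score in f1_score_dict.items():
    print("f1 score for %s: %s" % (column, score))
print("mean f1 score: %s" % np.mean(list(f1_score_dict.values())))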
def parse_args(args=None):
    parser = LightningArgumentParser()
    parser.add_argument('--datamodule', type=str)
    temp_args, extras = parser.parse_known_args(args)
    dm_cls = datamodule_map.get(temp_args.datamodule, None)
    if dm_cls is None:
        raise RuntimeError(
            f'given datamodule: "{temp_args.datamodule}" does not exist')
    parser.add_datamodule_args(dm_cls)
    parser.add_model_args(TextClassifier)
    parser.add_trainer_args()
    return parser.parse_lit_args(extras), dm_cls


if __name__ == '__main__':
    args, dm_cls = parse_args()
    pl.seed_everything(args.datamodule.seed)

    dm = dm_cls.from_argparse_args(args.datamodule)
    dm.setup('fit')

    model = TextClassifier(dm.model_name_or_path, dm.label2id,
                           **vars(args.model))
    model.tokenizer = dm.tokenizer
    # Total optimizer steps = batches per epoch (spread across GPUs) divided
    # by gradient accumulation, times the number of epochs.
    model.total_steps = (
        (len(dm.ds['train']) //
         (args.datamodule.batch_size * max(1, (args.trainer.gpus or 0)))) //
        args.trainer.accumulate_grad_batches * float(args.trainer.max_epochs))

    trainer = pl.Trainer.from_argparse_args(args.trainer)
    trainer.fit(model, dm)
    trainer.test(datamodule=dm)
    model.save_pretrained("outputs")
torch.manual_seed(seed)
random.seed(seed)

corpus = Corpus(trainFile, devFile, minFreq)

print('Vocabulary size: ' + str(corpus.voc.size()))
print('# of classes: ' + str(corpus.classVoc.size()))
print()
print('# of training samples: ' + str(len(corpus.trainData)))
print('# of dev samples: ' + str(len(corpus.devData)))

classifier = TextClassifier(corpus.voc.size(), embedDim, hiddenDim,
                            corpus.classVoc.size(), biDirectional,
                            repType='Sen', actType='Tanh')

if useGpu:
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        classifier.cuda()
        print('**** Running with GPU ****\n')
    else:
        useGpu = False
        print('**** Warning: GPU is not available ****\n')

# reduction='mean' replaces the deprecated size_average=True argument.
criterionClassifier = nn.CrossEntropyLoss(reduction='mean')