Example 1
def train_mentioned_model(train_data, train_segs, validate_data, validate_segs,
                          vectorizer, train_model):
    model_name = train_model[0]
    start = train_model[1]
    end = train_model[2]
    logger.info("start train %s mentioned", model_name)
    train_data_size = config.train_data_size
    sum_label_val = (end - start + 1) * 2
    column_list = range(start, end + 1)
    ori_labels = train_data.iloc[0:train_data_size, column_list]
    # Convert labels: if all three labels equal -2, the item is mentioned,
    # so convert it to 1; otherwise convert it to 0.
    train_label = ori_labels.T.sum().abs() // sum_label_val
    logger.debug("begin to train data")
    cw = "balanced"
    mentioned_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
    mentioned_clf.fit(train_segs, train_label)
    logger.debug("begin to validate %s mentioned model", model_name)
    # prepare the validation labels the same way
    ori_labels = validate_data.iloc[0:, column_list]
    validate_labels = ori_labels.T.sum().abs() // sum_label_val
    y_pre = mentioned_clf.predict(validate_segs)
    report(validate_labels, y_pre)
    score = f1_score(validate_labels, y_pre, average="macro")
    logger.info("validate done! %s mentioned model score:%s", model_name,
                str(score))

    if score > 0.8:
        logger.info("save %s mentioned model", model_name)
        model_save_path = config.model_save_path
        if not os.path.exists(model_save_path):
            os.makedirs(model_save_path)

        joblib.dump(mentioned_clf,
                    model_save_path + model_name + "_mentioned.pkl",
                    compress=3)
    return mentioned_clf
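A hypothetical call site for the function above. The tuple layout (model name, start column, end column) follows how train_model is unpacked in the function body; the concrete name and indices here are placeholders, not values from the original project.

# hypothetical invocation; the data frames, segments and vectorizer are
# assumed to be prepared by the surrounding pipeline
mentioned_clf = train_mentioned_model(train_data, train_segs,
                                      validate_data, validate_segs,
                                      vectorizer,
                                      ("location", 2, 4))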
Example 2
def train_specific_model(train_data):
    columns = train_data.columns.values.tolist()
    logger.debug("begin to seg train content")
    content_segments = seg_words(
        train_data.content.iloc[0:config.train_data_size])
    logger.debug("seg train content done")
    vectorizer = joblib.load(config.model_save_path + vec_name)
    logger.debug("load vectorizer")
    validate_data_df = load_data_from_csv(config.validate_data_path)
    validate_segs = seg_words(validate_data_df.content)
    logger.debug("seg validate content")
    scores = dict()
    for model_name in columns[:-1]:
        logger.info("begin to train %s model", model_name)
        cw = [{
            -2: a,
            -1: b,
            0: w,
            1: x
        } for a in range(1, 3) for b in range(5, 8) for w in range(8, 12)
              for x in range(5, 8)]
        # cw = {0: 7, 1: 6, -1: 6, -2: 1}
        positive_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
        y_label = train_data[model_name].iloc[0:config.train_data_size]
        positive_clf.fit(content_segments, y_label)

        y_pre = positive_clf.predict(validate_segs)
        y_true = validate_data_df[model_name].iloc[0:]
        report(y_true, y_pre)
        score = f1_score(y_true, y_pre, average="macro")
        logger.info("score for model:%s is %s ", model_name, str(score))
        scores[model_name] = score
        joblib.dump(positive_clf,
                    config.model_save_path + model_name + ".pkl",
                    compress=True)
    score = np.mean(list(scores.values()))
    logger.info("f1_scores: %s" % score)
Example 3
def build_model(vocab_size,
                embedding_dims,
                conv_channels,
                embeddings=None) -> Module:
    _vocab_size = vocab_size
    _embedding_dims = embedding_dims
    _conv_channels = conv_channels

    model = TextClassifier(vocab_size,
                           embedding_dims,
                           conv_channels,
                           2,
                           embeddings=embeddings)

    return model
Example 4
def build_model(configs, embeddings=None) -> Module:
    _vocab_size = configs['vocab_size']
    _embedding_dims = configs['embedding_size']
    _conv_block_in_channels = configs['conv_block_in_channels']
    _conv_block_out_channels = configs['conv_block_out_channels']
    _linear_in_channels = configs['linear_in_channels']
    _linear_out_channels = configs['linear_out_channels']
    _k_max_pool = configs['k_max_pool']
    _num_classes = configs['num_classes']
    _embeddings = embeddings

    model = TextClassifier(_vocab_size,
                           _embedding_dims,
                           _conv_block_in_channels,
                           _conv_block_out_channels,
                           _linear_in_channels,
                           _linear_out_channels,
                           _k_max_pool,
                           _num_classes,
                           embeddings=_embeddings)

    return model
Example 5
def build_model(configs, embeddings=None) -> Module:
    _vocab_size = configs['vocab_size']
    _embedding_dims = configs['embedding_size']
    _conv_block_in_channels = configs['conv_block_in_channels']
    _conv_block_out_channels = configs['conv_block_out_channels']
    _bi_rnn_in_channels = configs['bi_rnn_in_channels']
    _bi_rnn_out_channels = configs['bi_rnn_out_channels']
    _num_classes = configs['num_classes']
    _conv_block_kernel_size = configs['conv_block_kernel_size']
    _embeddings = embeddings

    model = TextClassifier(_vocab_size,
                           _embedding_dims,
                           _conv_block_in_channels,
                           _conv_block_out_channels,
                           _conv_block_kernel_size,
                           _bi_rnn_in_channels,
                           _bi_rnn_out_channels,
                           _num_classes,
                           embeddings=_embeddings)

    return model
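A hedged usage sketch for the configs-driven variant directly above: the dictionary keys mirror the ones read in the function body, but every numeric value is a placeholder rather than a setting from the original project.

configs = {
    'vocab_size': 30000,
    'embedding_size': 128,
    'conv_block_in_channels': 128,
    'conv_block_out_channels': 64,
    'conv_block_kernel_size': 3,
    'bi_rnn_in_channels': 64,
    'bi_rnn_out_channels': 64,
    'num_classes': 2,
}
model = build_model(configs)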
Example 6
import config
import numpy as np
from sklearn import metrics

# Load the preprocessed data
Train_seq_set = np.load(config.train_sequence_path)
Train_tags_set = np.load(config.train_tags_path)
Train_label_set = np.load(config.train_label_path)
Validation_seq = np.load(config.validation_sequence_path)
Validation_tags_set = np.load(config.validation_tags_path)
Validation_label = np.load(config.validation_label_path)

train_input = np.concatenate((Train_seq_set, Train_tags_set), axis=1)
validation_input = np.concatenate((Validation_seq, Validation_tags_set), axis=1)

from model import TextClassifier
clf = TextClassifier()
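The example ends right after constructing the classifier. A minimal sketch of how it might continue, assuming TextClassifier exposes a scikit-learn-style fit/predict interface (the interface itself is not shown in the snippet):

# assumption: fit/predict follow the scikit-learn convention
clf.fit(train_input, Train_label_set)
predictions = clf.predict(validation_input)
print(metrics.f1_score(Validation_label, predictions, average='macro'))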
Example 7
    # NOTE: the snippet starts mid-call; this opening line is a reconstruction,
    # assuming torchtext's BucketIterator.splits given the arguments below.
    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=args['batch_size'],
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        device=device)

    # print preprocessed text
    # print(vars(training_data.examples[0]))

    wandb_logger = WandbLogger(name=args['wandb_run_name'],
                               project=args['wandb_project_name'])
    wandb_logger.log_hyperparams(args)

    model = TextClassifier(args,
                           TEXT=TEXT,
                           LABEL=LABEL,
                           train_iterator=train_iterator,
                           valid_iterator=valid_iterator,
                           wandb_logger=wandb_logger)
    # wandb.watch(model)

    trainer = pl.Trainer(gpus=int(args['gpus']),
                         progress_bar_refresh_rate=args['progress_bar_refresh_rate'],
                         max_epochs=args['epochs'],
                         logger=[wandb_logger],
                         early_stop_callback=True)
    trainer.fit(model)

    ckpt_base_path = os.path.dirname(args['model_ckpt_path'])
    os.makedirs(ckpt_base_path, exist_ok=True)

    trainer.save_checkpoint(args['model_ckpt_path'])
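A hypothetical follow-up that reloads the saved checkpoint for inference using LightningModule's standard load_from_checkpoint; if the constructor arguments were not stored as hyperparameters, they would need to be passed again as keyword arguments.

# assumption: the init arguments are recoverable from the checkpoint
restored = TextClassifier.load_from_checkpoint(args['model_ckpt_path'])
restored.eval()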
Example 8
from Preprocessing import Preprocessor
from model import TextClassifier
import config
import numpy as np
from sklearn import metrics

# Load the preprocessed data
Train_seq_set = np.load(config.train_sequence_path)
Train_label_set = np.load(config.train_label_path)
Validation_seq = np.load(config.validation_sequence_path)
Validation_label = np.load(config.validation_label_path)

p = Preprocessor()

model1 = TextClassifier(len(config.class_group[0]), 'lstm')
model2 = TextClassifier(len(config.class_group[1]), 'lstm')
model3 = TextClassifier(len(config.class_group[2]), 'lstm')
model4 = TextClassifier(len(config.class_group[3]), 'lstm')
model5 = TextClassifier(len(config.class_group[4]), 'lstm')
model6 = TextClassifier(len(config.class_group[5]), 'lstm')

F1 = []
for i in range(30):
    # Train the models
    train_seq1, train_label1 = p.shuffle(Train_seq_set[0], Train_label_set[0])
    model1.train(train_seq1,
                 [train_label1[j] for j in range(len(config.class_group[0]))],
                 Validation_seq, [
                     Validation_label[config.class_group[0][j]]
                     for j in range(len(config.class_group[0]))
                 ])
Example 9
content_train = train_data_df.iloc[:, 1]
content_train = seg_words(content_train)

columns = train_data_df.columns.values.tolist()

vectorizer_tfidf = TfidfVectorizer(analyzer='word',
                                   ngram_range=(1, 5),
                                   min_df=5,
                                   norm='l2')
vectorizer_tfidf.fit(content_train)

# train one classifier per label column
classifier_dict = dict()
for column in columns[2:]:
    label_train = train_data_df[column]
    text_classifier = TextClassifier(vectorizer=vectorizer_tfidf)
    text_classifier.fit(content_train, label_train)
    classifier_dict[column] = text_classifier

# validate each classifier
content_validate = validate_data_df.iloc[:, 1]

content_validate = seg_words(content_validate)

f1_score_dict = dict()
for column in columns[2:]:
    label_validate = validate_data_df[column]
    text_classifier = classifier_dict[column]
    f1_score = text_classifier.get_f1_score(content_validate, label_validate)
    f1_score_dict[column] = f1_score
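A small follow-up sketch that aggregates the per-column scores into a single macro figure, mirroring the np.mean aggregation used in Example 2; the numpy import is an addition here.

import numpy as np

mean_f1 = np.mean(list(f1_score_dict.values()))
print('mean f1_score: %s' % mean_f1)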
Example 10
def parse_args(args=None):
    parser = LightningArgumentParser()
    parser.add_argument('--datamodule', type=str)
    temp_args, extras = parser.parse_known_args(args)
    dm_cls = datamodule_map.get(temp_args.datamodule, None)
    if dm_cls is None:
        raise RuntimeError(
            f'given datamodule: "{temp_args.datamodule}" does not exist')
    parser.add_datamodule_args(dm_cls)
    parser.add_model_args(TextClassifier)
    parser.add_trainer_args()
    return parser.parse_lit_args(extras), dm_cls


if __name__ == '__main__':
    args, dm_cls = parse_args()
    pl.seed_everything(args.datamodule.seed)
    dm = dm_cls.from_argparse_args(args.datamodule)
    dm.setup('fit')
    model = TextClassifier(dm.model_name_or_path, dm.label2id,
                           **vars(args.model))
    model.tokenizer = dm.tokenizer
    model.total_steps = (
        (len(dm.ds['train']) //
         (args.datamodule.batch_size * max(1, (args.trainer.gpus or 0)))) //
        args.trainer.accumulate_grad_batches * float(args.trainer.max_epochs))
    trainer = pl.Trainer.from_argparse_args(args.trainer)
    trainer.fit(model, dm)
    trainer.test(datamodule=dm)
    model.save_pretrained("outputs")
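A hypothetical programmatic invocation of the entry point above; the datamodule key is a placeholder, since the keys registered in datamodule_map are not shown in the snippet.

# assumption: 'imdb' is a key registered in datamodule_map
args, dm_cls = parse_args(['--datamodule', 'imdb'])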
Example 11
torch.manual_seed(seed)
random.seed(seed)

corpus = Corpus(trainFile, devFile, minFreq)

print('Vocabulary size: ' + str(corpus.voc.size()))
print('# of classes:    ' + str(corpus.classVoc.size()))
print()
print('# of training samples: ' + str(len(corpus.trainData)))
print('# of dev samples:      ' + str(len(corpus.devData)))

classifier = TextClassifier(corpus.voc.size(),
                            embedDim,
                            hiddenDim,
                            corpus.classVoc.size(),
                            biDirectional,
                            repType='Sen',
                            actType='Tanh')

if useGpu:
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        classifier.cuda()
        print('**** Running with GPU ****\n')
    else:
        useGpu = False
        print('**** Warning: GPU is not available ****\n')

# size_average is deprecated in modern PyTorch; reduction='mean' is equivalent
criterionClassifier = nn.CrossEntropyLoss(reduction='mean')
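A minimal sketch of a single training step wired from the pieces above, assuming the classifier maps a batch of token-id tensors to class logits; the batch iterable and learning rate are assumptions, not part of the original example.

import torch.optim as optim

optimizer = optim.SGD(classifier.parameters(), lr=0.1)
for batchInput, batchTarget in batches:  # hypothetical iterable of tensors
    optimizer.zero_grad()
    loss = criterionClassifier(classifier(batchInput), batchTarget)
    loss.backward()
    optimizer.step()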