#!/usr/bin/env python
# coding:utf-8
import pandas as pd
from tqdm import tqdm
from fasttext import train_supervised
import fasttext
import os

from __init__ import *
from src.utils import config
from src.utils.config import root_path
from src.utils.tools import create_logger, clean_symbols, query_cut, rm_stop_word

logger = create_logger(root_path + '/logs/Fasttext.log')
tqdm.pandas()


class Fasttext(object):
    """
    Train a text classification model with fastText.
    """
    def __init__(self,
                 train_raw_path=config.train_path,
                 test_raw_path=config.test_path,
                 valid_raw_path=config.valid_path,
                 model_train_file=root_path + '/data/fast_train.txt',
                 model_test_file=root_path + '/data/fast_test.txt',
                 model_valid_file=root_path + '/data/fast_valid.txt',
                 model_path=None):
        """
        Initialize parameters.
        :param train_raw_path: path to the raw training file
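# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original file): fastText's supervised
# mode reads one sample per line in the form "__label__<tag> tok1 tok2 ...",
# which is what model_train_file / model_test_file above are expected to hold.
# The hyperparameter values below are illustrative assumptions, not the repo's.
# ---------------------------------------------------------------------------
import fasttext

model = fasttext.train_supervised(
    input='data/fast_train.txt',  # e.g. "__label__fiction token1 token2 ..."
    lr=0.5,                       # learning rate
    epoch=25,                     # number of passes over the data
    wordNgrams=2,                 # include bigram features
    minCount=2)                   # drop tokens seen fewer than 2 times
n, precision, recall = model.test('data/fast_test.txt')  # size, P@1, R@1
model.save_model('data/fasttext.bin')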
                                               predict_all,
                                               target_names=config.label_list,
                                               digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)


if __name__ == '__main__':
    # model_name = args.model
    # x = import_module('models.' + model_name)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make results reproducible across runs
    logger = create_logger(config.root_path + '/logs/train.log')
    logger.info('Building tokenizer')
    print('config.bert_path is ', config.bert_path)
    tokenizer = BertTokenizer.from_pretrained(config.bert_path)
    logger.info('Loading dataset')
    # dataset definitions
    train_dataset = BertDataset(config.train_path,
                                tokenizer=tokenizer,
                                word=args.word)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  collate_fn=collate_fn,
                                  shuffle=True)
    dev_dataset = BertDataset(config.valid_path,
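# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original file): a plausible shape of the
# evaluate() whose tail appears above -- accumulate loss and predictions over
# data_iter, then report accuracy plus sklearn's classification report and
# confusion matrix. The loop body is an assumption; only the tail is the repo's.
# ---------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn.functional as F
from sklearn import metrics
from src.utils import config  # assumed: the same config module used above


def evaluate(model, data_iter, test=False):
    model.eval()
    loss_total = 0.0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for inputs, labels in data_iter:
            outputs = model(inputs)
            loss_total += F.cross_entropy(outputs, labels).item()
            labels_all = np.append(labels_all, labels.cpu().numpy())
            predict_all = np.append(
                predict_all, torch.argmax(outputs, dim=1).cpu().numpy())
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(
            labels_all, predict_all,
            target_names=config.label_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)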
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from transformers import BertModel, BertTokenizer

from __init__ import *
from src.data.mlData import MLData
from src.utils import config
from src.utils.config import root_path
from src.utils.tools import (Grid_Train_model, bayes_parameter_opt_lgb,
                             query_cut, create_logger, formate_data, get_score)
from src.utils.feature import (get_embedding_feature, get_img_embedding,
                               get_lda_features, get_pretrain_embedding,
                               get_autoencoder_feature, get_basic_feature)

logger = create_logger(config.log_dir + 'model.log')


class Models(object):
    def __init__(self, model_path=None, feature_engineer=False, train_mode=True):
        '''
        @description: initialize class, e.g. model
        @param {type}:
            feature_engineer: whether to use feature engineering; if `False`, compare common ML models
            res_model: res network model
            resnext_model: resnext network model
            wide_model: wide res network model
            bert: bert model
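# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original file): what "compare common ML
# models" can look like with the classifiers imported above -- fit each one
# on the same split and report accuracy. The data here is synthetic; in the
# repo the features come from MLData and the feature helpers.
# ---------------------------------------------------------------------------
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=20, n_informative=10,
                           n_classes=3, random_state=11)
X = np.abs(X)  # MultinomialNB requires non-negative features
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

for name, clf in [('MultinomialNB', MultinomialNB()),
                  ('SVC', SVC(kernel='linear')),
                  ('DecisionTree', DecisionTreeClassifier(random_state=11))]:
    clf.fit(X_train, y_train)
    print(name, accuracy_score(y_test, clf.predict(X_test)))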
@Description: train embedding & tfidf & autoencoder
@FilePath: /bookClassification/src/word2vec/embedding.py
'''
import pandas as pd
from gensim import models
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from gensim.models import LdaMulticore
from gensim.models.ldamodel import LdaModel
import gensim

from __init__ import *
from src.utils.config import root_path
from src.utils.tools import create_logger, query_cut
from src.word2vec.autoencoder import AutoEncoder

logger = create_logger(root_path + '/logs/embedding.log')


class SingletonMetaclass(type):
    '''
    @description: singleton metaclass
    '''
    def __init__(self, *args, **kwargs):
        self.__instance = None
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        # create the instance on the first call, then always return the cached one
        if self.__instance is None:
            self.__instance = super(SingletonMetaclass,
                                    self).__call__(*args, **kwargs)
        return self.__instance
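# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original file): what SingletonMetaclass
# guarantees -- repeated instantiation returns the one cached object, so the
# expensive embedding/tfidf/autoencoder models are loaded at most once per
# process. `ExpensiveResource` is a hypothetical stand-in class.
# ---------------------------------------------------------------------------
class ExpensiveResource(metaclass=SingletonMetaclass):
    def __init__(self):
        print('loading models ...')  # executes only on the very first call


first = ExpensiveResource()   # prints "loading models ..."
second = ExpensiveResource()  # cached instance; __init__ does not run again
assert first is second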
@Date: 2020-04-08 17:21:28
@LastEditTime: 2020-07-17 16:43:02
@LastEditors: xiaoyao jiang
@Description: Process data then get feature
@FilePath: /bookClassification/src/data/mlData.py
'''
import numpy as np
import pandas as pd
import json
import os

from __init__ import *
from src.utils import config
from src.utils.tools import create_logger, wam, query_cut
from src.word2vec.embedding import Embedding

logger = create_logger(config.log_dir + 'data.log')


class MLData(object):
    def __init__(self, debug_mode=False, train_mode=True):
        '''
        @description: initialize the ML dataset class
        @param {type}
            debug_mode: in debug mode, only process 10000 rows
            em: new embedding class
        @return: None
        '''
        # load the embedding; if not training, the data is not processed
        self.debug_mode = debug_mode
        self.em = Embedding()
        self.em.load()
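# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original file): what a word-average model
# ("wam", imported above) typically computes -- a sentence embedding built by
# averaging the word2vec vectors of the in-vocabulary tokens. The signature
# below is an assumption based on the import, not the repo's definition.
# ---------------------------------------------------------------------------
import numpy as np


def wam(sentence, w2v_model, aggregate=True):
    """Average the word vectors of the tokens present in the vocabulary."""
    vectors = [w2v_model.wv[tok] for tok in sentence.split()
               if tok in w2v_model.wv]
    if not vectors:  # no known token: fall back to a zero vector
        return np.zeros(w2v_model.vector_size)
    return np.mean(vectors, axis=0) if aggregate else np.array(vectors)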
import json
import pandas as pd
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from sklearn.ensemble import RandomForestClassifier
import joblib
from transformers import BertModel, BertTokenizer

from __init__ import *
from src.utils.tools import create_logger, bayes_parameter_opt_lgb
from src.data.mlData import MLData
from src.utils import config
from src.utils.feature import get_embedding_feature, get_autoencoder_feature, get_score

logger = create_logger(config.log_path + 'model.log')


class Model(object):
    def __init__(self, model_path=None, feature_engineer=False, train_mode=True):
        # self.res_model = torchvision.models.resnet152(pretrained=True)
        # self.res_model = self.res_model.to(config.device)
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        if train_mode:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            num_class=33,
                                            seed=11)
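# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original file): how the imblearn and
# LightGBM imports above fit together -- oversample minority classes with
# SMOTE, then fit a multiclass LGBMClassifier (the constructor above passes
# num_class=33 for the repo's 33 labels). The data here is synthetic.
# ---------------------------------------------------------------------------
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=2000, n_features=50, n_informative=20,
                           n_classes=5, weights=[0.5, 0.2, 0.15, 0.1, 0.05],
                           random_state=11)
X_res, y_res = SMOTE(random_state=11).fit_resample(X, y)  # balance the classes
clf = lgb.LGBMClassifier(objective='multiclass', random_state=11)
clf.fit(X_res, y_res)
print(clf.predict(X[:3]))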
# @author: hongyue.pei
# @file: fasttext.py
# @time: 2020/9/23 3:09 PM
# @desc:
import pandas as pd
from tqdm import tqdm
import fasttext
import jieba

from __init__ import *
from src.utils import config
from src.utils.tools import create_logger

logger = create_logger(config.root_path + '/logs/Fasttext.log')


class Fasttext(object):
    def __init__(self,
                 train_raw_path=config.root_path + '/data/train.csv',
                 test_raw_path=config.root_path + '/data/test.csv',
                 model_train_file=config.root_path + '/data/fast_train.csv',
                 model_test_file=config.root_path + '/data/fast_test.csv',
                 model_path=None):
        stopWords = open(config.root_path + '/data/stopWords_cn.txt').readlines()
        jieba.load_userdict(config.root_path + '/data/ai100_words.txt')
        if model_path is None:
            self.train_raw_data = pd.read_csv(train_raw_path, ',',
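# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original file): the preprocessing this class
# sets up -- jieba segmentation with a user dictionary, stop-word removal,
# then one "__label__<tag> tok1 tok2 ..." line per sample for fastText.
# The helper and the inline stop-word set are illustrative assumptions; in
# the repo the stop words come from stopWords_cn.txt.
# ---------------------------------------------------------------------------
import jieba


def to_fasttext_line(text, label, stop_words):
    tokens = [tok for tok in jieba.lcut(text)
              if tok.strip() and tok not in stop_words]
    return '__label__{} {}'.format(label, ' '.join(tokens))


stop_words = {'是', '的', '一部'}
print(to_fasttext_line('三国演义是一部历史小说', 'fiction', stop_words))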
    precision, recall, F1 = calculate_f1(result)
    print("-" * 20 + "intent" + "-" * 20)
    print("\t Precision: %.2f" % (100 * precision))
    print("\t Recall: %.2f" % (100 * recall))
    print("\t F1: %.2f" % (100 * F1))
    return F1


if __name__ == '__main__':
    debug = False
    config = model.Config()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make results reproducible across runs
    logger = create_logger('../logs/train.log')
    logger.info('Building tokenizer')
    print('config.bert_path is ', config.bert_path)
    tokenizer = BertTokenizer.from_pretrained(config.bert_path)
    logger.info('Loading dataset')
    train_dataset = BertDataset(config.train_path,
                                config.label_path,
                                tokenizer=tokenizer,
                                debug=debug,
                                need_label_weight=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  collate_fn=collate_fn,
                                  shuffle=True)
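# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original file): a calculate_f1 consistent
# with the call above. It assumes `result` is an iterable of
# (gold_labels, pred_labels) set pairs and computes micro-averaged
# precision / recall / F1; the repo's own definition may differ.
# ---------------------------------------------------------------------------
def calculate_f1(result):
    n_correct = n_pred = n_gold = 0
    for gold, pred in result:
        n_correct += len(gold & pred)  # labels both predicted and correct
        n_pred += len(pred)
        n_gold += len(gold)
    precision = n_correct / n_pred if n_pred else 0.0
    recall = n_correct / n_gold if n_gold else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1


result = [({'weather'}, {'weather'}), ({'music'}, {'alarm'})]
print(calculate_f1(result))  # (0.5, 0.5, 0.5)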
# coding:utf-8
import pandas as pd
from src.utils.tools import create_logger, clean_symbols, query_cut, rm_stop_word
from src.utils import config
from tqdm import tqdm
import gensim
from gensim import models
from src.utils.tools import timethis

logger = create_logger(config.root_path + '/logs/embedding.log')
tqdm.pandas()


class SingletonMetaclass(type):
    '''
    Singleton pattern: cache and reuse a single instance of the class.
    '''
    def __init__(self, *args, **kwargs):
        self.__instance = None
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        if self.__instance is None:
            self.__instance = super(SingletonMetaclass,
                                    self).__call__(*args, **kwargs)
            return self.__instance
        else:
            return self.__instance


class Embedding(metaclass=SingletonMetaclass):
    def __init__(self):
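# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original file): why tqdm.pandas() is called
# above -- it patches pandas with progress_apply, so per-row cleaning and
# segmentation pipelines show a progress bar. Column names are illustrative.
# ---------------------------------------------------------------------------
import pandas as pd
from tqdm import tqdm

tqdm.pandas()
df = pd.DataFrame({'text': ['三国演义 是 一部 历史 小说'] * 1000})
# progress_apply behaves like apply but renders a tqdm progress bar
df['tokens'] = df['text'].progress_apply(lambda t: t.split())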