import time
import torch
import unittest
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# NOTE(review): `os`, `sys`, `logging` and `np` are used below but their
# imports are not visible in this chunk — presumably imported earlier in
# the file; confirm against the full source.
_cur_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append("%s/../../" % _cur_dir)

from text_utils.utils.data_io import get_attr_values
from text_utils.utils.label_encoder import LabelEncoder
from text_utils.utils.logger import init_log
from text_utils.models.torch.base_model import ClassificationModel
from text_utils.models.torch.nets.bert import BertForClassification
from text_utils.tokenizers.bert_tokenizer import BertTokenizer

init_log(stream_level=logging.INFO)


class ClassificationDataset(Dataset):
    """Dataset wrapper that defines how examples are fetched for this
    particular classification data set.
    """
    def __init__(self, data_list):
        # __init__ conventionally loads all of the data up front.
        super(ClassificationDataset, self).__init__()
        # Store the data as a numpy array rather than a Python list so
        # DataLoader worker processes do not trigger copy-on-access
        # memory growth; see the related issue:
        # https://github.com/pytorch/pytorch/issues/13246
        self.data_list = np.array(data_list)

    def __getitem__(self, index):
        # Fetch a single example.
Date: 2020/06/04 15:59:43
"""
# NOTE(review): the line above is the tail of a module docstring whose
# opening triple-quote is above this chunk.
import logging
import json
import os
import re
import sys

_cur_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append("%s/../../" % _cur_dir)

from text_utils.model.lr_model_multilabel_impl import BaseLRModel
from text_utils.preprocess import ProcessFilePath
from text_utils.feature.feature_generator import FeatureGenerator
from text_utils.utils.logger import init_log

init_log("./log/lr_model_multilabel.log")

# NOTE(review): imported after init_log so logging is configured before
# the config module runs — confirm this ordering is intentional.
import lr_multiple_config as config


class LRModelMultipleDemo(BaseLRModel):
    """Base class for the LR multi-label classification model."""
    def __init__(self, mid_data_dir, model_dir, output_dir):
        """Set up intermediate-data paths and the feature generator.

        Args:
            mid_data_dir: directory for intermediate processing files.
            model_dir: directory holding model artifacts (generator.pkl).
            output_dir: directory for model outputs.
        """
        super(LRModelMultipleDemo, self).__init__(model_dir, output_dir)
        self.mid_data_paths = ProcessFilePath(output_dir=mid_data_dir)
        self.generator_path = os.path.join(model_dir, "generator.pkl")
        self.feature_generator = FeatureGenerator(
import unittest

from ernie.modeling_ernie import ErnieModel

# NOTE(review): `os` and `sys` are used below but not imported in this
# chunk — presumably imported earlier in the file; confirm against the
# full source.
_cur_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append("%s/../" % _cur_dir)

from text_utils.tokenizers.ernie_tokenizer import ErnieTokenizer
from text_utils.tokenizers.lr_tokenizer import LRTokenizer
from text_utils.utils.data_io import get_attr_values, gen_batch_data
from text_utils.utils.data_io import write_to_file
from text_utils.models.dygraph.train_infer_utils import batch_infer
from text_utils.utils.logger import init_log
from text_utils.models.machine_learning.cluster import mini_batch_kmeans, data_cluster
from text_utils.utils.vectorizer import init_vectorizer

init_log()


class TestCluster(unittest.TestCase):
    # Unit tests for the clustering utilities.
    @classmethod
    def setUpClass(cls):
        # One-time setup: create the output directory and load a small
        # sample of the news classification data set.
        test_root = "./"
        # NOTE(review): "test_clutser" looks like a typo for "test_cluster",
        # but the path is runtime behavior — left unchanged here.
        TestCluster.test_output_dir = os.path.join(test_root, "output/test_clutser/")
        if not os.path.isdir(TestCluster.test_output_dir):
            os.mkdir(TestCluster.test_output_dir)
        test_data_dir = os.path.join(test_root, "dataset/classification_data/toutiao_news")
        example_num = 5
        # Load the data.
# NOTE(review): `os`, `sys`, `torch`, `logging`, `np` and `Dataset` are
# used below but not imported in this chunk — presumably imported earlier
# in the file; confirm against the full source.
_cur_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append("%s/../../" % _cur_dir)

from process_data import process_origin_poetry
from text_utils.models.torch.base_model import BertSeq2seqModel, model_distributed
from text_utils.models.torch.nets.bert import BertForSeq2seq
from text_utils.tokenizers.bert_tokenizer import BertTokenizer
from text_utils.utils.data_io import get_data
from text_utils.utils.logger import init_log

# Distributed setup: only rank 0 logs at INFO so multi-GPU runs do not
# emit duplicate log lines.
torch.distributed.init_process_group(backend="nccl")
LOCAL_RANK = torch.distributed.get_rank()
logging_level = logging.INFO if LOCAL_RANK == 0 else logging.WARNING
init_log(stream_level=logging_level)


class PoetDataset(Dataset):
    """Dataset wrapper that defines how examples are fetched for this
    particular poetry data set.
    """
    def __init__(self, data_dir, tokenizer):
        # __init__ conventionally loads all of the data up front.
        super(PoetDataset, self).__init__()
        self.tokenizer = tokenizer
        # Store the data as a numpy array rather than a Python list so
        # DataLoader worker processes do not trigger copy-on-access
        # memory growth; see the related issue:
        # https://github.com/pytorch/pytorch/issues/13246
        self.poet_info_list = np.array(self.gen_dataset(data_dir))

    def gen_dataset(self, data_dir):