def __load_config(self):
    # check the path
    if not self.file_path or not path.exists(self.file_path) or path.getsize(self.file_path) == 0:
        raise ConfigError('Config file %s does not exist or is empty' % self.file_path)
    # load the config file
    self.__config_json_obj = read_json(self.file_path)
    # check the config object
    if not self.__config_json_obj:
        raise ConfigError('Could not read config file %s' % self.file_path)
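# __load_config above leans on a read_json helper from utils.file_utils that
# is not shown in this fragment. A minimal sketch of such a helper, assuming
# it returns the parsed object on success and None on failure (which is what
# the falsy check above expects):
import json

def read_json(json_path):
    try:
        with open(json_path, 'r') as json_file:
            return json.load(json_file)  # parsed dict/list on success
    except (IOError, ValueError):
        return None  # caller treats None as "could not read"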
from models.sentiments.dataset.data_handler import load_datasets, load_test_datasets
from models.sentiments.models.model_train_test import start_epochs, load_model
import os
import pandas as pd
import numpy as np
from utils.file_utils import write_json_dict, read_json

root_dir = "/home/charan/DATA/311_Data/Problem/"
final_data = os.path.join(root_dir, "311_VIZ_DESCRIPTION_PARENT.csv")
write_json = os.path.join(root_dir, "class.json")
load_model_path = ""
label_dict = read_json(write_json)


def setup_data(input_data):
    input_data['label'] = input_data.PARENT_CATEGORY.apply(lambda x: label_dict[x])
    input_data.rename(columns={"CASE ID": "u_id", "Description": "desc"}, inplace=True)
    return input_data


def train_classification():
    classification_df = pd.read_csv(final_data)
    classification_df = setup_data(classification_df)
    number_of_classes = max(list(classification_df['label'].unique())) + 1
    model_directory = os.path.join(root_dir, "classify_dict")
    metrics_json = os.path.join(root_dir, "accuracy_metrics.json")
    training_loader, testing_loader = load_datasets(classification_df, train_size=0.8,
                                                    number_of_classes=number_of_classes)
    unique_ids, val_targets, val_outputs = start_epochs(training_loader, testing_loader,
                                                        metrics_json, model_directory,
                                                        epochs=20,
                                                        number_of_classes=number_of_classes)
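# setup_data() above assumes class.json maps each PARENT_CATEGORY string to a
# zero-based integer label. A small illustration with invented category names
# (the real mapping is built by a class_json() helper elsewhere in the project):
example_label_dict = {"Streets": 0, "Water": 1}
example_df = pd.DataFrame({"CASE ID": [101, 102],
                           "PARENT_CATEGORY": ["Water", "Streets"],
                           "Description": ["leak on main st", "pothole"]})
example_df['label'] = example_df.PARENT_CATEGORY.apply(lambda x: example_label_dict[x])
# labels become [1, 0], so number_of_classes = max(label) + 1 = 2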
from models.sentiments.dataset.data_handler import load_datasets, load_test_datasets
from models.sentiments.models.model_train_test import start_epochs, load_model
import os
import pandas as pd
import numpy as np
from utils.file_utils import read_json

root_dir = "/home/charan/DATA/311_Data/multi-level-classification"
final_data = os.path.join(root_dir, "balanced_multi-level.csv")
updated_data = os.path.join(root_dir, "balanced_multi-level_update.csv")
cat_json = os.path.join(root_dir, "category_class.json")
type_json = os.path.join(root_dir, "type_class.json")
cat_json = read_json(cat_json)
type_json = read_json(type_json)
load_model_path = ""


def setup_data(input_df):
    input_df["label"] = input_df["TYPE"].apply(lambda x: type_json[x])
    input_df["u_id"] = input_df.index
    input_df.rename(columns={"Description": "desc"}, inplace=True)
    input_df.to_csv(updated_data, index=False)
    return input_df


def train_classification():
    classification_df = pd.read_csv(final_data)
    classification_df = setup_data(classification_df)
    number_of_classes = len(list(classification_df['label'].unique()))
    model_directory = os.path.join(root_dir, "classify_state_dict")
def run(mtd="fold_split"):
    def _eval(data):
        model.eval()  # eval mode: disables BatchNormalization and Dropout
        # data = dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in dataset_processer.data_iter(
                    data, config['test_batch_size'], shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())
            score, dev_f1 = scores.get_score(y_true, y_pred)
            return score, dev_f1

    if mtd == "fold_split":
        demo_preprocess.split_dataset(raw_path, train_path, dev_path, test_path)
    elif mtd == "process_data":
        demo_preprocess.process_data(config, train_path, dev_path)
    elif mtd == "train":
        Train_data = file_utils.read_json(config["train_set"])
        Dev_data = file_utils.read_json(config["dev_set"])
        # convert to a format the model can consume
        train_data = dataset_processer.get_examples(Train_data, label_encoder)
        dev_data = dataset_processer.get_examples(Dev_data, label_encoder)
        del Train_data, Dev_data

        # number of batches per epoch
        batch_num = int(np.ceil(len(train_data) / float(config["train_batch_size"])))
        print("batch_num:{}".format(batch_num))

        # model = BertSoftmaxModel(cfg.bert_path, label_encoder)  # model is expected to exist at module scope
        optimizer = Optimizer(model.all_parameters, steps=batch_num * config["epochs"])

        # loss
        # criterion = nn.CrossEntropyLoss()
        criterion = loss_factory.focal_loss()

        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 10  # stop early when dev metrics have not improved for this many epochs

        # train
        print("start train")
        for epoch in range(cfg.RESUME_EPOCH + 1, config["epochs"] + 1):
            optimizer.zero_grad()
            model.train()  # train mode: enables BatchNormalization and Dropout
            overall_losses = 0
            losses = 0
            # batch_idx = 1
            y_pred = []
            y_true = []
            step = 0
            for batch_data in dataset_processer.data_iter(
                    train_data, config["train_batch_size"], shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                print(batch_outputs.shape)  # debug: inspect logits shape
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value

                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                # gradient clipping
                # nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=config["clip"])
                for cur_optim, scheduler in zip(optimizer.optims, optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1
                # print(step, time.time())

            overall_losses /= batch_num
            overall_losses = scores.reformat(overall_losses, 4)
            score, train_f1 = scores.get_score(y_true, y_pred)
            print("epoch:{}, train_score:{}, train_f1:{}, overall_loss:{}".format(
                epoch, score, train_f1, overall_losses))

            # if set(y_true) == set(y_pred):
            #     print("report")
            #     report = classification_report(y_true, y_pred, digits=4, target_names=label_encoder.target_names)
            #     # logging.info('\n' + report)
            #     print(report)

            # eval
            _, dev_f1 = _eval(data=dev_data)
            if best_dev_f1 < dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model, epoch, save_folder=os.path.join(cfg.proj_path, "data/bert_nn"))
                print("save_path:{}".format(save_path))
                # torch.save(model.state_dict(), save_model)
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:  # patience exhausted: stop training
                    break
            print("early_stop:{}, score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}".format(
                early_stop, dev_f1, score, best_train_f1, best_dev_f1))
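# The training branch above swaps nn.CrossEntropyLoss for
# loss_factory.focal_loss(), whose implementation is not shown. A minimal
# sketch of a multi-class focal loss factory, assuming the conventional
# FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t) formulation (the alpha and
# gamma defaults here are illustrative, not the project's actual values):
import torch
import torch.nn.functional as F

def focal_loss(gamma=2.0, alpha=1.0):
    def _loss(logits, targets):
        ce = F.cross_entropy(logits, targets, reduction='none')  # per-sample -log(p_t)
        p_t = torch.exp(-ce)                                     # recover p_t
        return (alpha * (1.0 - p_t) ** gamma * ce).mean()        # down-weight easy examples
    return _loss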
import os
import pandas as pd
from utils.file_utils import write_json_dict, read_json
import numpy as np

root_dir = "/home/charan/DATA/311_Data/"
dept_path = os.path.join(root_dir, "Department/department.csv")
prob_path = os.path.join(root_dir, "Problem/category.csv")
df_path = os.path.join(root_dir, "311_VIZ_DESCRIPTION.csv")
dept_json = os.path.join(root_dir, 'Department/parent_map.json')
prob_json = os.path.join(root_dir, 'Problem/parent_map.json')
dept_class_json = os.path.join(root_dir, 'Department/class.json')
prob_class_json = os.path.join(root_dir, 'Problem/class.json')
dept_parent = read_json(dept_json)
prob_parent = read_json(prob_json)
parent_df_path = os.path.join(root_dir, "311_VIZ_DESCRIPTION_PARENT.csv")
dept_class_dict = read_json(dept_class_json)
prob_class_dict = read_json(prob_class_json)
class_df_path = os.path.join(root_dir, "311_VIZ_DESCRIPTION_PARENT_CLASS.csv")


def class_json(input, output):
    class_dict = {}
    counter = 0
    for key in list(input.keys()):
        class_dict[str(counter)] = key
        class_dict[key] = counter
        counter += 1
    write_json_dict(class_dict, output)
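# A quick illustration of the bidirectional dict class_json() writes (input
# keys invented for the example):
example = {}
for i, key in enumerate(["Streets", "Water"]):
    example[str(i)] = key
    example[key] = i
assert example == {"0": "Streets", "Streets": 0, "1": "Water", "Water": 1}
# Both "index string" -> name and name -> integer index live in one mapping,
# which lets other scripts recover labels in either direction.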
# root_dir = "/home/charan/DATA/311_Data/multi-level-classification"
# final_data = os.path.join(root_dir, "balanced_multi-level.csv")
# cat_json = os.path.join(root_dir, "category_class.json")
# type_json = os.path.join(root_dir, "type_class.json")
# load_model_path = ""
# label_cat = read_json(cat_json)
# label_type = read_json(type_json)

root_dir = "/home/charan/DATA/Data/DB_Pedia/archive/multi_level_classification"
# root_dir = "/home/charan/DATA/311_Data/multi-level-feature-extracted"
final_data = os.path.join(root_dir, "DBP_wiki_data_scaled_updated.csv")
final_data_updated = os.path.join(root_dir, "DBP_wiki_data_scaled_updated.csv")
l1_json = os.path.join(root_dir, "l1.json")
l2_json = os.path.join(root_dir, "l2.json")
load_model_path = "/home/charan/DATA/Data/DB_Pedia/archive/multi_level_classification/classify_dict_18.pt"
l1_json = read_json(l1_json)
l2_json = read_json(l2_json)


def get_classes(input_dict):
    # count the consecutive "0", "1", ... keys to recover the class count
    counter = 0
    while str(counter) in input_dict:
        counter += 1
    return counter


def setup_data(input_data):
    input_data['label1'] = input_data.PARENT_CATEGORY.apply(lambda x: l1_json[x])
    input_data['label2'] = input_data.TYPE.apply(lambda x: l2_json[x])
    input_data['u_id'] = input_data.index
def test_read_json(self):
    json_obj = file_utils.read_json('tests/sdk/test-data/test_json')
    self.assertIsNotNone(json_obj)
    self.assertIsNotNone(json_obj.get('requests'))
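# A hypothetical shape for the tests/sdk/test-data/test_json fixture that
# would satisfy both assertions (the real fixture is not shown):
# {
#     "requests": [{"method": "GET", "url": "/status"}]
# }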
import logging
from logging.handlers import WatchedFileHandler
import os

from utils.file_utils import read_json

config_path = os.path.join(os.getcwd(), 'configuration.json')
config_data = read_json(config_path)


class AppLogger:
    __instance = None

    @staticmethod
    def log_setup():
        log_handler = WatchedFileHandler(config_data["log_path"])
        formatter = logging.Formatter('%(asctime)s [%(process)d]: %(message)s',
                                      '%b %d %H:%M:%S')
        log_handler.setFormatter(formatter)
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger = logging.getLogger()
        logger.addHandler(console_handler)
        logger.addHandler(log_handler)
        logger.setLevel(logging.DEBUG)
        return logger

    @staticmethod
    def getInstance():
        """ Static access method. """
        if AppLogger.__instance is None:
            logger = AppLogger.log_setup()
            AppLogger.__instance = logger  # cache the configured logger
        return AppLogger.__instance
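# configuration.json is assumed to provide at least the log file location,
# e.g. (path invented): {"log_path": "/var/log/app/app.log"}. Typical use of
# the singleton:
logger = AppLogger.getInstance()
logger.debug("application started")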
import pandas as pd
import os
import json
from utils.file_utils import read_json

root_dir = "/home/charan/Documents/workspaces/python_workspaces/Data/BDA_Project"
mapping_dict = os.path.join(root_dir, "Sentiment_Financial_Data/label_mapping_dict.json")
news_data_path = os.path.join(root_dir, "news_data/news_with_summary.csv")
news_classify_path = os.path.join(root_dir, "news_data/news_classification.csv")
classification_processed = os.path.join(root_dir, "news_data/processed_sentiments.csv")
merged_final = os.path.join(root_dir, "news_data/merged_final_news.csv")
mapping_dict = read_json(mapping_dict)


def final_review_companies():
    with open('Review.json', 'r') as f:  # the with-block closes the file on exit
        companies_dict = json.load(f)
    return companies_dict


def extract_news_company_data():
    companies = final_review_companies()
    news_data = load_dataframe()
    list_companies = list(set(companies.keys()))
    df_dict = {}
    for each_company in list_companies:
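# load_dataframe() above is not defined in this fragment. A minimal sketch,
# assuming it simply reads the summarized news CSV declared at the top:
def load_dataframe():
    return pd.read_csv(news_data_path)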