def __load_config(self):
    # check the path
    if not self.file_path or not path.exists(
            self.file_path) or path.getsize(self.file_path) == 0:
        raise ConfigError('Config file %s does not exist or is empty' %
                          self.file_path)
    # load the config file
    self.__config_json_obj = read_json(self.file_path)
    # check the config object
    if not self.__config_json_obj:
        raise ConfigError('Could not read config file %s' % self.file_path)
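Every example on this page imports a read_json helper from a project utilities module (utils.file_utils or file_utils) whose implementation is not shown. A minimal sketch, assuming it is a thin wrapper around json.load that returns None when the file cannot be parsed (consistent with the falsy check above), might look like:

import json


def read_json(file_path):
    """Parse a JSON file and return its contents, or None if it cannot be read.

    Hypothetical sketch; the real utils.file_utils.read_json is not shown here.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        return None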
Example No. 2
from models.sentiments.dataset.data_handler import load_datasets, load_test_datasets
from models.sentiments.models.model_train_test import start_epochs, load_model
import os
import pandas as pd
import numpy as np

from utils.file_utils import write_json_dict, read_json

root_dir = "/home/charan/DATA/311_Data/Problem/"
final_data = os.path.join(root_dir, "311_VIZ_DESCRIPTION_PARENT.csv")
class_json_path = os.path.join(root_dir, "class.json")
load_model_path = ""
label_dict = read_json(class_json_path)


def setup_data(input_data):
    input_data['label'] = input_data.PARENT_CATEGORY.apply(lambda x: label_dict[x])
    input_data.rename(columns={"CASE ID": "u_id", "Description": "desc"}, inplace=True)
    return input_data


def train_classification():
    classification_df = pd.read_csv(final_data)
    classification_df = setup_data(classification_df)
    number_of_classes = max(list(classification_df['label'].unique()))+1
    model_directory = os.path.join(root_dir, "classify_dict")
    metrics_json = os.path.join(root_dir, "accuracy_metrics.json")
    training_loader, testing_loader = load_datasets(classification_df, train_size=0.8,
                                                    number_of_classes=number_of_classes)
    unique_ids, val_targets, val_outputs = start_epochs(training_loader, testing_loader, metrics_json, model_directory,
                                                        epochs=20, number_of_classes=number_of_classes)
Example No. 3
from models.sentiments.dataset.data_handler import load_datasets, load_test_datasets
from models.sentiments.models.model_train_test import start_epochs, load_model
import os
import pandas as pd
import numpy as np

from utils.file_utils import read_json

root_dir = "/home/charan/DATA/311_Data/multi-level-classification"
final_data = os.path.join(root_dir, "balanced_multi-level.csv")
updated_data = os.path.join(root_dir, "balanced_multi-level_update.csv")
cat_json = os.path.join(root_dir, "category_class.json")
type_json = os.path.join(root_dir, "type_class.json")
cat_json = read_json(cat_json)
type_json = read_json(type_json)
load_model_path = ""


def setup_data(input_df):
    input_df["label"] = input_df["TYPE"].apply(lambda x: type_json[x])
    input_df["u_id"] = input_df.index
    input_df.rename(columns={"Description": "desc"}, inplace=True)
    input_df.to_csv(updated_data, index=False)
    return input_df


def train_classification():
    classification_df = pd.read_csv(final_data)
    classification_df = setup_data(classification_df)
    number_of_classes = len(list(classification_df['label'].unique()))
    model_directory = os.path.join(root_dir, "classify_state_dict")
Example No. 4
def run(mtd="fold_split"):
    def _eval(data):
        model.eval()  # eval mode: disables BatchNormalization and Dropout
        # data = dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in dataset_processer.data_iter(
                    data, config['test_batch_size'], shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(
                    batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(
                    torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

            score, dev_f1 = scores.get_score(y_true, y_pred)
        return score, dev_f1

    if mtd == "fold_split":
        demo_preprocess.split_dataset(raw_path, train_path, dev_path,
                                      test_path)
    elif mtd == "process_data":
        demo_preprocess.process_data(config, train_path, dev_path)
    elif mtd == "train":
        Train_data = file_utils.read_json(config["train_set"])
        Dev_data = file_utils.read_json(config["dev_set"])
        # convert the raw records into the format the model expects
        train_data = dataset_processer.get_examples(Train_data, label_encoder)
        dev_data = dataset_processer.get_examples(Dev_data, label_encoder)
        del Train_data, Dev_data
        # number of batches per epoch
        batch_num = int(
            np.ceil(len(train_data) / float(config["train_batch_size"])))
        print("batch_num:{}".format(batch_num))
        # model = BertSoftmaxModel(cfg.bert_path, label_encoder)
        optimizer = Optimizer(model.all_parameters,
                              steps=batch_num * config["epochs"])  # optimizer

        # loss
        # criterion = nn.CrossEntropyLoss()  # obj
        criterion = loss_factory.focal_loss()
        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 10  # stop early when the dev metric has not improved for this many epochs
        # train
        print("start train")
        for epoch in range(cfg.RESUME_EPOCH + 1, config["epochs"] + 1):
            optimizer.zero_grad()
            model.train()  # train mode: enables BatchNormalization and Dropout
            overall_losses = 0
            losses = 0
            # batch_idx = 1
            y_pred = []
            y_true = []
            step = 0
            for batch_data in dataset_processer.data_iter(
                    train_data, config["train_batch_size"], shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(
                    batch_data)
                batch_outputs = model(batch_inputs)
                print(batch_outputs.shape)  # debug: per-batch output shape
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value

                y_pred.extend(
                    torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                # nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=config["clip"])  # gradient clipping
                for cur_optim, scheduler in zip(optimizer.optims,
                                                optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1
                # print(step, time.time())
            overall_losses /= batch_num
            overall_losses = scores.reformat(overall_losses, 4)
            score, train_f1 = scores.get_score(y_true, y_pred)
            print("epoch:{},train_score:{}, train_f1:{}, overall_loss:{} ".
                  format(epoch, train_f1, score, overall_losses))
            # if set(y_true) == set(y_pred):
            #     print("report")
            #     report = classification_report(y_true, y_pred, digits=4, target_names=label_encoder.target_names)
            #     # logging.info('\n' + report)
            #     print(report)

            # eval
            _, dev_f1 = _eval(data=dev_data)

            if best_dev_f1 < dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model,
                    epoch,
                    save_folder=os.path.join(cfg.proj_path, "data/bert_nn"))
                print("save_path:{}".format(save_path))
                # torch.save(model.state_dict(), save_model)
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:  # reached the early-stop limit; stop training
                    break
            print(
                "early_stop:{}, score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}"
                .format(early_stop, score, dev_f1, best_train_f1, best_dev_f1))
Example No. 5
import os
import pandas as pd

from utils.file_utils import write_json_dict, read_json
import numpy as np

root_dir = "/home/charan/DATA/311_Data/"
dept_path = os.path.join(root_dir, "Department/department.csv")
prob_path = os.path.join(root_dir, "Problem/category.csv")
df_path = os.path.join(root_dir, "311_VIZ_DESCRIPTION.csv")
dept_json = os.path.join(root_dir, 'Department/parent_map.json')
prob_json = os.path.join(root_dir, 'Problem/parent_map.json')
dept_class_json = os.path.join(root_dir, 'Department/class.json')
prob_class_json = os.path.join(root_dir, 'Problem/class.json')
dept_parent = read_json(dept_json)
prob_parent = read_json(prob_json)
parent_df_path = os.path.join(root_dir, "311_VIZ_DESCRIPTION_PARENT.csv")
dept_class_dict = read_json(dept_class_json)
prob_class_dict = read_json(prob_class_json)
class_df_path = os.path.join(root_dir, "311_VIZ_DESCRIPTION_PARENT_CLASS.csv")


def class_json(input_dict, output):
    class_dict = {}
    counter = 0
    for key in list(input_dict.keys()):
        class_dict[str(counter)] = key
        class_dict[key] = counter
        counter += 1
    write_json_dict(class_dict, output)
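class_json builds a single dict that maps in both directions, index string to category name and category name back to its index, and then persists it via write_json_dict (presumably a json.dump wrapper). A toy illustration with made-up categories mirrors what it produces:

# Hypothetical parent_map.json-style input keyed by category name.
toy_parent_map = {"Streets": ["Pothole", "Sidewalk"], "Water": ["Leak"]}

class_dict = {}
for counter, key in enumerate(toy_parent_map):
    class_dict[str(counter)] = key   # "0" -> "Streets"
    class_dict[key] = counter        # "Streets" -> 0

print(class_dict)
# {'0': 'Streets', 'Streets': 0, '1': 'Water', 'Water': 1}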
Example No. 6
# root_dir = "/home/charan/DATA/311_Data/multi-level-classification"
# final_data = os.path.join(root_dir, "balanced_multi-level.csv")
# cat_json = os.path.join(root_dir, "category_class.json")
# type_json = os.path.join(root_dir, "type_class.json")
# load_model_path = ""
# label_cat = read_json(cat_json)
# label_type = read_json(type_json)

root_dir = "/home/charan/DATA/Data/DB_Pedia/archive/multi_level_classification"
# root_dir = "/home/charan/DATA/311_Data/multi-level-feature-extracted"
final_data = os.path.join(root_dir, "DBP_wiki_data_scaled_updated.csv")
final_data_updated = os.path.join(root_dir, "DBP_wiki_data_scaled_updated.csv")
l1_json = os.path.join(root_dir, "l1.json")
l2_json = os.path.join(root_dir, "l2.json")
load_model_path = "/home/charan/DATA/Data/DB_Pedia/archive/multi_level_classification/classify_dict_18.pt"
l1_json = read_json(l1_json)
l2_json = read_json(l2_json)


def get_classes(input_dict):
    counter = 0
    while str(counter) in input_dict:
        counter += 1
    return counter
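Because the class JSON files store both directions (index string to name and name to index), get_classes recovers the number of classes by counting how many consecutive numeric-string keys exist. A toy check against the get_classes defined above, using made-up labels:

# Hypothetical l1.json-style dict with two classes.
toy_l1 = {"0": "Agent", "Agent": 0, "1": "Place", "Place": 1}
print(get_classes(toy_l1))  # 2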


def setup_data(input_data):
    input_data['label1'] = input_data.PARENT_CATEGORY.apply(
        lambda x: l1_json[x])
    input_data['label2'] = input_data.TYPE.apply(lambda x: l2_json[x])
    input_data['u_id'] = input_data.index
Example No. 7
def test_read_json(self):
    json_obj = file_utils.read_json('tests/sdk/test-data/test_json')
    self.assertIsNotNone(json_obj)
    self.assertIsNotNone(json_obj.get('requests'))
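The assertions only require that tests/sdk/test-data/test_json parses to an object with a top-level 'requests' key; the real fixture is not shown. A minimal file that would satisfy the test (hypothetical content) could be generated with:

import json
import os

os.makedirs('tests/sdk/test-data', exist_ok=True)
with open('tests/sdk/test-data/test_json', 'w') as f:
    json.dump({"requests": []}, f)  # any JSON value other than null under "requests" satisfies the test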
Example No. 8
import logging
from logging.handlers import WatchedFileHandler
import os

from utils.file_utils import read_json

config_path = os.path.join(os.getcwd(), 'configuration.json')
config_data = read_json(config_path)


class AppLogger:
    __instance = None

    @staticmethod
    def log_setup():
        log_handler = WatchedFileHandler(config_data["log_path"])
        formatter = logging.Formatter('%(asctime)s [%(process)d]: %(message)s', '%b %d %H:%M:%S')
        log_handler.setFormatter(formatter)
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger = logging.getLogger()
        logger.addHandler(console_handler)
        logger.addHandler(log_handler)
        logger.setLevel(logging.DEBUG)
        return logger

    @staticmethod
    def getInstance():
        """ Static access method. """
        if AppLogger.__instance is None:
            logger = AppLogger.log_setup()
            AppLogger.__instance = logger  # cache the configured logger as the singleton
        return AppLogger.__instance
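Typical usage, assuming getInstance() caches and returns the configured logger as sketched above:

logger = AppLogger.getInstance()
logger.info("service started")  # emitted to both the console and the file named by log_path in configuration.json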
Example No. 9
import json
import pandas as pd
import os

from utils.file_utils import read_json

root_dir = "/home/charan/Documents/workspaces/python_workspaces/Data/BDA_Project"
mapping_dict = os.path.join(
    root_dir, "Sentiment_Financial_Data/label_mapping_dict.json")
news_data_path = os.path.join(root_dir, "news_data/news_with_summary.csv")
news_classify_path = os.path.join(root_dir,
                                  "news_data/news_classification.csv")
classification_processed = os.path.join(root_dir,
                                        "news_data/processed_sentiments.csv")
merged_final = os.path.join(root_dir, "news_data/merged_final_news.csv")

mapping_dict = read_json(mapping_dict)


def final_review_companies():
    with open('Review.json', 'r') as f:
        companies_dict = json.load(f)
    return companies_dict


def extract_news_company_data():
    companies = final_review_companies()
    news_data = load_dataframe()
    list_companies = list(set(companies.keys()))
    df_dict = {}
    for each_company in list_companies: