class DecisionTree:
    """Fit a decision-tree classifier and report how it performs on test data."""

    def __init__(self):
        name = self.__class__.__name__
        self.log = LoggerUtil(name).get()
        self.config = ConfigUtil.get_config_instance()
        self.helper = Helper()
        self.metrics = Metrics()

    @staticmethod
    def train_model(x_train, y_train):
        """Return a new ``DecisionTreeClassifier`` fitted on the training data."""
        classifier = DecisionTreeClassifier()
        classifier.fit(x_train, y_train)
        return classifier

    @staticmethod
    def test_model(model, x_test):
        """Return whatever ``model.predict`` yields for ``x_test``."""
        predictions = model.predict(x_test)
        return predictions

    def main(self, x_train, x_test, y_train, y_test):
        """Train on the training split, score on the test split.

        The confusion matrix is handed to the helper for plotting/saving
        under the configured ``image_path``.

        :return: dict with the fitted ``"model"`` and a ``"metrics"`` dict
                 holding ``accuracy``, ``classification_report`` and
                 ``confusion_matrix``.
        """
        name = self.__class__.__name__
        image_path = self.config["image_path"]
        model = self.train_model(x_train, y_train)

        self.log.info("{} Model performance on test data".format(name))
        y_pred = self.test_model(model, x_test)
        accuracy, report, matrix = self.metrics.metrics(y_true=y_test,
                                                        y_predicted=y_pred)
        self.helper.plot_save_cnf_matrix(matrix,
                                         flag="test",
                                         model_name=name,
                                         image_path=image_path)

        scores = {
            "accuracy": accuracy,
            "classification_report": report,
            "confusion_matrix": matrix
        }
        return {"model": model, "metrics": scores}
# Example #2
0
class UpdateProfile:
    """Store a user's profile entry in the users-history Mongo collection."""

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.db_utils = DBUtils()

    def get_client(self):
        """Build a Mongo client from the ``mongo`` section of the config.

        :return: whatever ``DBUtils.get_client`` returns for the configured
                 address, port and authentication settings.
        """
        address = self.config['mongo']['address']
        port = self.config['mongo']['port']
        auth_db = self.config['mongo']['auth_db']
        is_auth_enabled = self.config['mongo']['is_auth_enabled']
        username = self.config['mongo']['username']
        password = self.config['mongo']['password']

        client = self.db_utils.get_client(address=address,
                                          port=port,
                                          username=username,
                                          password=password,
                                          auth_db=auth_db,
                                          is_auth_enabled=is_auth_enabled)
        return client

    @staticmethod
    def _build_document(source, destination, time, date, num_seats_req,
                        phone_num, email, preferences):
        """Return the profile entry as a plain dict (one Mongo document)."""
        return {
            'source': source,
            'destination': destination,
            'time': time,
            'date': date,
            'num_seats_req': num_seats_req,
            'phone_num': phone_num,
            'email': email,
            'preferences': preferences,
        }

    @staticmethod
    def insert_query(source, destination, time, date, num_seats_req, phone_num,
                     email, preferences):
        """Return the profile entry serialised as a JSON string.

        Kept for callers that rely on the string form; ``update`` itself
        inserts the dict because a Mongo collection stores documents
        (mappings), not JSON text.
        """
        document = UpdateProfile._build_document(source=source,
                                                 destination=destination,
                                                 time=time,
                                                 date=date,
                                                 num_seats_req=num_seats_req,
                                                 phone_num=phone_num,
                                                 email=email,
                                                 preferences=preferences)
        return json.dumps(document)

    def update(self, source, destination, time, date, num_seats_req, phone_num,
               email, preferences):
        """Insert one profile entry into the users-history collection.

        Database and collection names are read from the ``mongo`` config
        section. A failing insert is logged and not re-raised.

        :return: None
        """
        client = self.get_client()

        users_database_name = self.config['mongo']['users_database']
        users_hist_collection_name = self.config['mongo'][
            'users_hist_collection_name']
        database = client[users_database_name]
        users_hist_collection = database[users_hist_collection_name]

        # Insert the dict itself: the previous code passed the JSON string
        # produced by insert_query, which is not a valid document.
        document = self._build_document(source=source,
                                        destination=destination,
                                        time=time,
                                        date=date,
                                        num_seats_req=num_seats_req,
                                        phone_num=phone_num,
                                        email=email,
                                        preferences=preferences)
        try:
            users_hist_collection.insert(document)
            self.log.info(
                "Updated profile for user with email : {}".format(email))
        except Exception as e:
            self.log.error("Error : {}".format(e))
# Example #3
0
class Metrics:
    """Compute and log classification scores for a set of predictions."""

    def __init__(self):
        self.config = ConfigUtil.get_config_instance()
        self.log = LoggerUtil(type(self).__name__).get()

    def metrics(self, y_true, y_predicted):
        """Score ``y_predicted`` against ``y_true``.

        Logs the accuracy and the classification report.

        :return: tuple ``(accuracy, classification_report, confusion_matrix)``
        """
        report = classification_report(y_true=y_true, y_pred=y_predicted)
        accuracy = accuracy_score(y_true=y_true, y_pred=y_predicted)

        log_lines = (
            ("Accuracy Score : {}", accuracy),
            ("Classification Report: \n{}", report),
        )
        for template, value in log_lines:
            self.log.info(template.format(value))

        matrix = confusion_matrix(y_true=y_true, y_pred=y_predicted)
        return accuracy, report, matrix
# Example #4
0
class UserRegistration:
    """Register a new user by inserting a document into the users collection."""

    def __init__(self):
        logger_name = self.__class__.__name__
        self.log = LoggerUtil(logger_name).get()
        self.config = ConfigUtil.get_config_instance()
        self.db_utils = DBUtils()

    def get_client(self):
        """Return a Mongo client built from the ``mongo`` config section."""
        mongo_cfg = self.config['mongo']
        setting_names = ('address', 'port', 'auth_db', 'is_auth_enabled',
                         'username', 'password')
        settings = {key: mongo_cfg[key] for key in setting_names}
        return self.db_utils.get_client(**settings)

    @staticmethod
    def insert_query(name, email, contact, password):
        """Return the user document; ``auth`` is always set to ``True``."""
        return {
            'name': name,
            'email': email,
            'contact': contact,
            'password': password,
            'auth': True,
        }

    def add_user(self, **kwargs):
        """Insert a user built from ``name``, ``email``, ``contact``, ``password``.

        All four keyword arguments are required (a missing one raises
        ``KeyError``). A failing insert is logged and not re-raised.
        """
        name, email, contact, password = [
            kwargs[field]
            for field in ('name', 'email', 'contact', 'password')
        ]

        client = self.get_client()

        mongo_cfg = self.config['mongo']
        database = client[mongo_cfg['users_database']]
        collection = database[mongo_cfg['add_users_collection']]

        document = self.insert_query(name, email, contact, password)
        try:
            collection.insert(document)
        except Exception as err:
            self.log.error("Error : {}".format(err))
        else:
            self.log.info("Added user with username : {}".format(email))
class TestModel:
    """Run the models on one typed sentence or on the pre-tagged data set."""

    # Used only when the configuration has no "image_path" entry; this is
    # the location that used to be hard-coded in main().
    _FALLBACK_IMAGE_PATH = (
        "/home/satwik/Documents/Hiring/huddl_assignment/Images/")

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.model = LinguisticModel()
        self.read_data = ReadData()
        self.metrics = Metrics()
        self.helper = Helper()
        self.logic = Logic()

    def check_if_trained(self):
        """Load the saved MLP model and vectorizer from ``models_path``.

        Nothing is actually checked here: any error raised by
        ``joblib.load`` simply propagates.

        :return: tuple ``(vectorizer, model)``
        """
        models_path = self.config["models_path"]
        model = joblib.load(models_path + "/" + "mlp.mdl")
        vectorizer = joblib.load(models_path + "/" + "vectorizer.mdl")
        return vectorizer, model

    def _image_path(self):
        """Return the configured image directory, or the legacy fallback."""
        try:
            return self.config["image_path"]
        except KeyError:
            return self._FALLBACK_IMAGE_PATH

    def _predict_sentence(self):
        """Read one sentence from stdin and log both models' predictions."""
        nlp = load("en_core_web_sm")
        vectorizer, model = self.check_if_trained()
        self.log.info("Please enter the sentence")
        sentence = input()
        tokens = self.read_data.transform_sentence(sentence)
        # The vectorizer is used elsewhere on documents that are token
        # lists, so the sentence must be passed as ONE document. Passing
        # the bare token list would treat every token as its own document.
        features = vectorizer.transform([tokens])
        predictions = model.predict(features)
        ling_pred = self.logic.apply_rules(text_tokens=tokens, nlp=nlp)
        ml_pred = any(predictions)
        self.log.info("Given sentence : {}".format(sentence))
        self.log.info(
            "Prediction of Linguistic Model : {}".format(ling_pred))
        self.log.info("Prediction of ML Model : {}".format(ml_pred))
        self.log.info("Final Prediction : {}".format(ling_pred or ml_pred))

    def _evaluate_tagged_data(self):
        """Train via ``LinguisticModel.main`` and score on the tagged data."""
        model, vectorizer = self.model.main()
        tagged_data_df = self.read_data.prepare_tagged_data()
        features = vectorizer.transform(tagged_data_df["data"])
        labels = tagged_data_df["labels"]
        predictions = model.predict(features)

        acc_score, cr_report, cnf_matrix = self.metrics.metrics(
            y_true=labels, y_predicted=predictions)
        self.helper.plot_save_cnf_matrix(cnf_matrix=cnf_matrix,
                                         model_name="satwik",
                                         flag="test",
                                         image_path=self._image_path())

    def main(self, test=False):
        """Entry point.

        :param test: when true, classify a sentence typed on stdin with the
                     saved model; otherwise train the models and evaluate
                     the best one on the pre-tagged data.
        :return: None
        """
        if test:
            self._predict_sentence()
        else:
            self._evaluate_tagged_data()
class ReadData:
    """Load the raw e-mail CSV and the pre-tagged sentences as DataFrames."""

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.process_data = ProcessData()
        self.helper = Helper()
        self.ps = PorterStemmer()
        # STOP_WORDS extended with extra tokens to drop from this corpus.
        self.stop_words = set.union(STOP_WORDS, {
            'ect', 'hou', 'com', 'recipient', 'na', 'ou', 'cn', 'enron',
            'zdnet'
        })

    def prepare_data(self, n_rows):
        """Read up to ``n_rows`` rows of ``emails.csv`` in chunks and process them.

        ``int(n_rows / chunksize)`` chunks are requested. If the file holds
        fewer chunks, reading stops at the end of the file instead of
        raising ``StopIteration``.

        :param n_rows: upper bound on the number of rows to read.
        :return: concatenation of the processed chunks.
        """
        data_path = self.config["data_path"]
        chunksize = self.config["chunksize"]
        n_partitions = self.config["n_partitions"]
        reader = read_csv(data_path + "/" + "emails.csv", chunksize=chunksize)
        n_chunks = int(n_rows / chunksize)

        df_list = []
        # zip() ends on whichever runs out first: the chunk budget or the file.
        for i, df in zip(range(n_chunks), reader):
            self.log.info("Iter : {}".format(i))
            df = self.process_data.get_sub_message(df)
            # Return value unused; presumably clean_df mutates df in place
            # (TODO confirm against ProcessData).
            self.process_data.clean_df(df, n_partitions)
            df_list.append(df)
            self.log.info("Label Stats : {}".format(Counter(df["labels"])))
        return concat(df_list)

    def prepare_tagged_data(self):
        """Load ``actions.csv`` and tokenise its first column.

        Each sentence is tokenised with ``transform_sentence``, so stop
        words are filtered per token. The old code compared the whole
        sentence against the stop-word set and therefore never removed
        anything.

        :return: DataFrame with ``data`` (token lists) and ``labels``
                 (all ``True``).
        """
        tagged_path = self.config["tagged_path"]
        tagged_data = read_csv(tagged_path + "/" + "actions.csv", header=None)
        tokens = tagged_data[0].apply(self.transform_sentence)
        return DataFrame.from_dict({
            "data": tokens.values,
            "labels": ones(len(tagged_data), dtype=bool)
        })

    def transform_sentence(self, sentence):
        """Split on whitespace, lowercase/strip each token, drop stop words."""
        normalised = (item.lower().strip() for item in sentence.split())
        return [token for token in normalised if token not in self.stop_words]
class CheckPreTaggedData:
    """Apply the rule-based logic to the pre-tagged sentences and log the counts."""

    def __init__(self):
        logger_name = self.__class__.__name__
        self.log = LoggerUtil(logger_name).get()
        self.config = ConfigUtil.get_config_instance()
        self.logic = Logic()

    def main(self):
        """Tokenise ``actions.csv``, label each row with ``Logic.apply_rules``
        and log how often each label value occurs."""
        nlp = load("en_core_web_sm")
        extra_stop_words = {
            'ect', 'hou', 'com', 'recipient', 'na', 'ou', 'cn', 'enron',
            'zdnet'
        }
        stop_words = set.union(STOP_WORDS, extra_stop_words)

        def tokenize(sentence):
            # Lowercase/strip every whitespace-separated word, skip stop words.
            kept = []
            for word in sentence.split():
                token = word.lower().strip()
                if token not in stop_words:
                    kept.append(token)
            return kept

        csv_path = self.config["tagged_path"] + "/" + "actions.csv"
        tagged_df = read_csv(csv_path, header=None)
        tagged_df[0] = tagged_df[0].apply(tokenize)
        tagged_df["labels"] = tagged_df[0].apply(
            lambda tokens: self.logic.apply_rules(tokens, nlp))

        label_counts = tagged_df.labels.value_counts()
        self.log.info(
            "Values detected by Model in Pre-Tagged Sentences : {}".format(
                label_counts))
class LinguisticModel:
    """Train several classifiers on the e-mail data, save them, pick the best."""

    # Model names save_models() will persist; others are logged as errors.
    _STANDARD_MODELS = ("naive_bayes", "logistic_regression", "decision_tree",
                        "random_forest", "extra_tree", "adaboost", "mlp")

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.read_data = ReadData()
        self.bayes = NaiveBayes()
        self.lr = LRModel()
        self.dt = DecisionTree()
        self.rf = RandomForest()
        self.et = ExtraTree()
        self.adaboost = Adaboost()
        self.mlp = MLP()

    @staticmethod
    def split_data(df, labels):
        """Shuffled, label-stratified train/test split.

        :return: ``(x_train, x_test, y_train, y_test)``
        """
        x_train, x_test, y_train, y_test = train_test_split(df,
                                                            labels,
                                                            stratify=labels,
                                                            shuffle=True)
        return x_train, x_test, y_train, y_test

    @staticmethod
    def custom_word_tokenizer(tokens):
        """Identity tokenizer: the input is returned unchanged."""
        return tokens

    def vectorize_data(self, data):
        """Fit and return a TF-IDF vectorizer (at most 2500 features) on ``data``.

        The tokenizer is the identity function and lowercasing is off, so
        each document in ``data`` is presumably already a sequence of
        normalised tokens (TODO confirm against ProcessData).

        :return: the fitted ``TfidfVectorizer``
        """
        vectorizer = TfidfVectorizer(tokenizer=self.custom_word_tokenizer,
                                     sublinear_tf=True,
                                     analyzer='word',
                                     lowercase=False,
                                     max_features=2500)

        vectorizer.fit(data)
        return vectorizer

    @staticmethod
    def get_tagged_dataset(df, n=1250):
        """Return up to ``n`` negatively-labelled messages and matching labels.

        The label array has exactly as many entries as rows returned, so the
        two stay aligned even when fewer than ``n`` negatives exist.

        :param df: DataFrame with a boolean ``labels`` and a ``message`` column.
        :param n: maximum number of rows to return (default 1250).
        :return: ``(messages, all-False boolean array)``
        """
        data = df[~df["labels"]]["message"].head(n=n)
        return data, zeros(len(data), dtype=bool)

    def train_models(self, x_train_features, x_test_features, y_train, y_test):
        """Run every model's ``main`` on the same split, in a fixed order.

        :return: dict mapping model name to whatever that model's ``main``
                 returns (``save_models`` and ``get_best_model`` expect a
                 dict with ``"model"`` and ``"metrics"`` entries).
        """
        trainers = (
            ("naive_bayes", self.bayes),
            ("logistic_regression", self.lr),
            ("decision_tree", self.dt),
            ("random_forest", self.rf),
            ("extra_tree", self.et),
            ("adaboost", self.adaboost),
            ("mlp", self.mlp),
        )
        return {
            name: trainer.main(x_train=x_train_features,
                               y_train=y_train,
                               x_test=x_test_features,
                               y_test=y_test)
            for name, trainer in trainers
        }

    def save_models(self, model_dict, models_path, vectorizer):
        """Dump the vectorizer and every standard model under ``models_path``.

        Files are named ``vectorizer.mdl`` and ``<model_name>.mdl``.
        """
        joblib.dump(vectorizer, models_path + "/" + "vectorizer.mdl")
        for model_name, stat_dict in model_dict.items():
            if model_name in self._STANDARD_MODELS:
                joblib.dump(stat_dict["model"],
                            models_path + "/" + model_name + ".mdl")
            else:
                self.log.error("Non-Standard Model referenced")

    def get_best_model(self, model_dict):
        """Return the model with the highest accuracy.

        Entries without a ``"metrics"`` key are ignored. On a tie the first
        model in ``model_dict`` order wins. The previous loop never updated
        its running best and so returned the last model with a non-zero
        accuracy.

        :raises ValueError: if no entry carries metrics.
        """
        accuracies = {
            name: stats["metrics"]["accuracy"]
            for name, stats in model_dict.items() if "metrics" in stats
        }
        if not accuracies:
            raise ValueError("No model with metrics to choose from")
        best_model_name = max(accuracies, key=accuracies.get)

        self.log.info("Best model determined is : {}".format(best_model_name))
        return model_dict[best_model_name]["model"]

    def main(self):
        """Prepare data, train and save all models, return the best one.

        :return: tuple ``(best_model, vectorizer)``
        """
        models_path = self.config["models_path"]
        untagged_data_df = self.read_data.prepare_data(n_rows=3000)
        untagged_data, untagged_labels = untagged_data_df[
            "message"], untagged_data_df["labels"]

        # NOTE(review): the vectorizer is fitted on the full data set before
        # the split, so test documents influence the vocabulary and IDF
        # weights. Left as is to keep the saved artefacts unchanged.
        vectorizer = self.vectorize_data(untagged_data)
        x_train, x_test, y_train, y_test = self.split_data(
            untagged_data, untagged_labels)
        x_train_features = vectorizer.transform(x_train)
        x_test_features = vectorizer.transform(x_test)

        model_dict = self.train_models(x_train_features, x_test_features,
                                       y_train, y_test)
        self.save_models(model_dict=model_dict,
                         models_path=models_path,
                         vectorizer=vectorizer)
        best_model = self.get_best_model(model_dict)
        return best_model, vectorizer