Ejemplo n.º 1
0
    def start(self):
        """Crawl the page at ``self.path`` and every other page it links to.

        Builds ``self.url`` from ``self.urlTemplate``, downloads it (through
        ``self.proxies`` when ``self.inCompany`` is truthy), stores the books
        found on the page and then repeats the extraction for each remaining
        page returned by ``parseAllPage``, pausing between requests.

        Nothing is stored when the first request does not answer with HTTP 200.

        :return: None
        """
        print("handle page : " + self.path)
        self.url = str.format(self.urlTemplate, host=self.host, path=self.path)

        # Build the request once; the proxy is only added when inCompany is set.
        # NOTE(review): no timeout is passed, so a stalled server blocks forever.
        request_kwargs = {"headers": self.headers}
        if self.inCompany:
            request_kwargs["proxies"] = self.proxies
        res = requests.get(self.url, **request_kwargs)

        if res.status_code != 200:
            # Make the failure visible instead of returning silently.
            print("request failed : %s (status %s)" % (self.url, res.status_code))
            return

        res.encoding = 'UTF-8'
        pageDom = BeautifulSoup(res.text, features="lxml-xml")
        dbName = "NovelMongo"
        books = self.parsePageBook(pageDom)
        self.dataAccess.store_novel_base_infos(books, dbName)

        allPage = self.parseAllPage(pageDom)
        # The current page was handled above; it may legitimately be absent
        # from the pager, in which case there is nothing to remove.
        if self.path in allPage:
            allPage.remove(self.path)
        Helper.sleep_while(scends=20)

        total = len(allPage)
        # enumerate keeps the progress correct even if a page is listed twice
        # and avoids a linear index() lookup on every iteration.
        for done, page in enumerate(allPage, start=1):
            pageBooks = self.getPageBook(page)
            self.dataAccess.store_novel_base_infos(pageBooks, dbName)
            print("Process : %.2f%%" % (done / total * 100))
            if done < total:
                # No need to wait after the last page.
                Helper.sleep_while(scends=20)

        self.dataAccess.find_all(dbName, "BookBaseInfo")
class DecisionTree:
    """Fit a ``DecisionTreeClassifier`` and report how it performs on test data."""

    def __init__(self):
        """Create the logger, load the configuration and build the helpers."""
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.helper = Helper()
        self.metrics = Metrics()

    @staticmethod
    def train_model(x_train, y_train):
        """Fit a fresh ``DecisionTreeClassifier`` on the training data.

        :param x_train: training features
        :param y_train: training labels
        :return: the fitted classifier
        """
        classifier = DecisionTreeClassifier()
        classifier.fit(x_train, y_train)
        return classifier

    @staticmethod
    def test_model(model, x_test):
        """Return ``model.predict(x_test)``.

        :param model: any object exposing ``predict``
        :param x_test: features to predict on
        """
        predictions = model.predict(x_test)
        return predictions

    def main(self, x_train, x_test, y_train, y_test):
        """Train on the training split, evaluate on the test split.

        The confusion matrix is handed to ``Helper.plot_save_cnf_matrix``
        together with ``config["image_path"]``.

        :return: dict with the fitted ``model`` and a ``metrics`` dict holding
            ``accuracy``, ``classification_report`` and ``confusion_matrix``.
        """
        image_path = self.config["image_path"]
        model_name = self.__class__.__name__
        fitted = self.train_model(x_train, y_train)

        self.log.info("{} Model performance on test data".format(model_name))
        predicted = self.test_model(fitted, x_test)
        accuracy, report, confusion = self.metrics.metrics(
            y_true=y_test, y_predicted=predicted)
        self.helper.plot_save_cnf_matrix(
            confusion,
            flag="test",
            model_name=model_name,
            image_path=image_path)

        evaluation = {
            "accuracy": accuracy,
            "classification_report": report,
            "confusion_matrix": confusion,
        }
        return {"model": fitted, "metrics": evaluation}
 def __init__(self):
     """Create the logger, load the configuration and build the collaborators."""
     # The concrete class name is handed to LoggerUtil.
     self.log = LoggerUtil(self.__class__.__name__).get()
     self.config = ConfigUtil.get_config_instance()
     # Collaborator objects; each one is created with its default constructor.
     self.model = LinguisticModel()
     self.read_data = ReadData()
     self.metrics = Metrics()
     self.helper = Helper()
     self.logic = Logic()
class TestModel:
    """Run the trained models on one typed sentence or on the tagged data set."""

    def __init__(self):
        """Create the logger, load the configuration and build the collaborators."""
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.model = LinguisticModel()
        self.read_data = ReadData()
        self.metrics = Metrics()
        self.helper = Helper()
        self.logic = Logic()

    def check_if_trained(self):
        """Load the persisted model and vectorizer from ``config["models_path"]``.

        :return: ``(vectorizer, model)`` -- note the order.
        """
        models_dir = self.config["models_path"]
        mlp = joblib.load(models_dir + "/" + "mlp.mdl")
        vectorizer = joblib.load(models_dir + "/" + "vectorizer.mdl")
        return vectorizer, mlp

    def main(self, test=False):
        """Evaluate the models.

        :param test: when truthy, read one sentence from stdin and log the
            linguistic, ML and combined predictions; otherwise score the model
            on the tagged data and save the confusion matrix plot.
        """
        if not test:
            model, vectorizer = self.model.main()
            tagged = self.read_data.prepare_tagged_data()
            features = vectorizer.transform(tagged["data"])
            labels = tagged["labels"]
            predictions = model.predict(features)

            _, _, cnf_matrix = self.metrics.metrics(
                y_true=labels, y_predicted=predictions)
            # NOTE(review): model name and image path are hard-coded here
            # rather than read from self.config -- confirm this is intended.
            self.helper.plot_save_cnf_matrix(
                cnf_matrix=cnf_matrix,
                model_name="satwik",
                flag="test",
                image_path="/home/satwik/Documents/Hiring/huddl_assignment/Images/")
            return

        nlp = load("en_core_web_sm")
        vectorizer, model = self.check_if_trained()
        self.log.info("Please enter the sentence")
        sentence = str(input())
        tokens = self.read_data.transform_sentence(sentence)
        features = vectorizer.transform(tokens)
        predictions = model.predict(features)
        ling_pred = self.logic.apply_rules(text_tokens=tokens, nlp=nlp)
        # The ML side counts as positive if any per-token prediction is truthy.
        ml_pred = any(predictions)

        self.log.info("Given sentence : {}".format(sentence))
        self.log.info("Prediction of Linguistic Model : {}".format(ling_pred))
        self.log.info("Prediction of ML Model : {}".format(ml_pred))
        self.log.info("Final Prediction : {}".format(ling_pred or ml_pred))
Ejemplo n.º 5
0
 def __init__(self):
     """Create the logger, configuration, helpers, stemmer and stop-word set."""
     # The concrete class name is handed to LoggerUtil.
     self.log = LoggerUtil(self.__class__.__name__).get()
     self.config = ConfigUtil.get_config_instance()
     self.process_data = ProcessData()
     self.helper = Helper()
     self.ps = PorterStemmer()
     # STOP_WORDS extended with extra tokens (presumably noise specific to the
     # corpus being processed -- TODO confirm).
     self.stop_words = set.union(STOP_WORDS, {
         'ect', 'hou', 'com', 'recipient', 'na', 'ou', 'cn', 'enron',
         'zdnet'
     })
Ejemplo n.º 6
0
    def store_novel_base_infos(self, books: list, db_name: str):
        """Insert every book not yet present in the ``BookBaseInfo`` collection.

        A book counts as present when a document with the same ``bookId``
        (MD5 of the book name) already exists; such books are skipped with a
        warning.

        :param books: objects exposing ``name`` whose ``__dict__`` is stored
        :param db_name: name of the database to connect to
        :return: None
        """
        db = self._connect_(db_name)
        try:
            bookTable = db.BookBaseInfo
            for book in books:
                try:
                    condition = {'bookId': Helper.md5_hash(book.name, False).MD5}
                    if bookTable.find_one(condition) is None:
                        bookTable.insert_one(book.__dict__)  # convert to dict
                        self.logger.info("Insert Success:" + book.name)
                    else:
                        self.logger.warning("Insert Failed: Duplicate id --" + book.name)
                except DuplicateKeyError as dke:
                    # str() is required: concatenating the exception object
                    # itself raises TypeError inside the error handler.
                    self.logger.error("Insert Failed: DuplicateKeyError objectId --\n" + str(dke))
        finally:
            # Release the connection even when an unexpected error escapes.
            self._close_()
Ejemplo n.º 7
0
 def parsePageBook(self, dom: BeautifulSoup):
     """
     Build one book per ``ul.ditu > li`` entry of the page.

     Each book gets the link and text of the entry's first anchor, plus the
     MD5 digest and salt that ``Helper.md5_hash`` returns for the name.
     :param dom: BeautifulSoup Dom
     :return: Book list
     """
     collected = []
     for entry in dom.select('ul.ditu > li'):
         book = BookBase()
         # The first anchor of the entry carries both the link and the title.
         anchor = entry.select('a')[0]
         book.link = anchor['href']
         book.name = anchor.text
         digest = Helper.md5_hash(book.name, False)
         book.bookId = digest.MD5
         book.salt = bson.binary.Binary(digest.Salt)
         collected.append(book)
     return collected
Ejemplo n.º 8
0
 def store_novel_base_info(self, book: BookBase, db_name: str):
     """Insert ``book`` into ``BookBaseInfo`` or replace its stored document.

     The document is looked up by ``bookId`` (MD5 of the book name). When it
     is missing the book is inserted, otherwise the stored document is
     replaced with ``book.__dict__``.

     :param book: book whose ``__dict__`` is persisted
     :param db_name: name of the database to connect to
     :return: None
     """
     db = self._connect_(db_name)
     try:
         bookTable = db.BookBaseInfo
         condition = {'bookId': Helper.md5_hash(book.name, False).MD5}
         self.logger.debug(str(condition))
         record = bookTable.find_one(condition)
         self.logger.debug(str(record))
         if record is None:
             # insert
             self.logger.debug(str(book.__dict__))
             insertRes = bookTable.insert_one(book.__dict__)
             self.logger.info("Insert Success: " + str(insertRes.inserted_id))
         else:
             # update: Collection.update() is deprecated in PyMongo 3 and
             # removed in 4; replace_one() is the equivalent for a whole
             # document replacement of a single match.
             updateRes = bookTable.replace_one(condition, book.__dict__)
             self.logger.info("Update Success: " + str(updateRes.raw_result))
     finally:
         # Release the connection even when the database call fails.
         self._close_()
    def train_mlp(sess, model, X_train, y_train, num_epochs=100):
        """Train ``model`` for ``num_epochs`` epochs in mini-batches of 128.

        :param sess: session object; ``sess.run`` executes the model ops
        :param model: object exposing ``init_op``, ``train_op``, ``cost``,
            ``tf_x`` and ``tf_y``
        :param X_train: training features
        :param y_train: training targets
        :param num_epochs: number of passes over the training data
        :return: list with the per-batch costs of the LAST epoch (empty when
            ``num_epochs`` is 0)
        """
        sess.run(model.init_op)

        # Bound up front so the final return is valid when num_epochs is 0.
        training_costs = []

        for epoch in range(num_epochs):
            # Costs are reset per epoch; only the last epoch is returned.
            training_costs = []

            batch_generator = Helper.create_batch_generator(X_train,
                                                            y_train,
                                                            batch_size=128)

            for batch_X, batch_y in batch_generator:
                feed = {model.tf_x: batch_X, model.tf_y: batch_y}
                _, batch_cost = sess.run([model.train_op, model.cost],
                                         feed_dict=feed)
                training_costs.append(batch_cost)

            print('-- Epoch %2d '
                  'Avg Training Loss: %4f' %
                  (epoch + 1, np.mean(training_costs)))
        return training_costs
Ejemplo n.º 10
0
class HeuristicRules:
    """
    Rule-based detection of actionable sentences.

    These rules are adapted from the paper
    "Identifying Business Tasks and Commitments from Email and Chat Conversations"

    Throughout the class ``pos`` is a sliceable sequence of ``(token, tag)``
    pairs and ``text_tokens`` is a sequence of token strings.

    TODO: Parsing based on POS tags. Can we use NLTK Regular Expressions for better efficiency?
    """

    # POS tag groups shared by the rules below.
    _VERB_TAGS = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
    _PRONOUN_TAGS = ("PRP", "PRP$")
    _NOUN_TAGS = ("NN", "NNS", "NNP", "NNPS")
    _MODAL = ("MD",)
    _ADVERB = ("RB",)

    # Abbreviations looked up by rule_6 (compared as-is, i.e. case-sensitive).
    _ABBREVIATIONS = ('asap', 'a.s.a.p', 'rsvp', 'r.s.v.p', 'eta', 'e.t.a',
                      'etd', 'e.t.d', 'et', 'e.t')

    def __init__(self):
        """Create the logger and the helper used by the POS based rules."""
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.helper = Helper()

    @staticmethod
    def _match(pos, *tags):
        """Return the first run of consecutive tokens tagged ``tags``, or ""."""
        # One shifted view of ``pos`` per tag yields the sliding windows.
        for window in zip(*(pos[offset:] for offset in range(len(tags)))):
            if all(item[1] == tag for item, tag in zip(window, tags)):
                return " ".join([item[0] for item in window])
        return ""

    @staticmethod
    def match2(pos, tag1, tag2):
        """Return the first two consecutive tokens tagged ``tag1 tag2``, or ""."""
        return HeuristicRules._match(pos, tag1, tag2)

    @staticmethod
    def match3(pos, tag1, tag2, tag3):
        """Return the first three consecutive tokens tagged ``tag1 tag2 tag3``, or ""."""
        return HeuristicRules._match(pos, tag1, tag2, tag3)

    @staticmethod
    def match4(pos, tag1, tag2, tag3, tag4):
        """Return the first four consecutive tokens tagged ``tag1..tag4``, or "".

        The fourth element is compared by its tag like the other three; it
        used to be compared as a whole ``(token, tag)`` pair, which could
        never equal a tag string.
        """
        return HeuristicRules._match(pos, tag1, tag2, tag3, tag4)

    def _any_match(self, matcher, pos, *tag_groups):
        """Return True if ``matcher`` hits for any tag combination.

        One tag is drawn from each group in ``tag_groups`` (cartesian
        product). The match result is passed through
        ``helper.is_list_not_empty`` exactly as a list of characters, as the
        original hand-written checks did. Evaluation stops at the first hit.
        """
        from itertools import product

        return any(
            self.helper.is_list_not_empty(list(matcher(pos, *tags)))
            for tags in product(*tag_groups))

    def rule_1(self, pos):
        """
        The negative verb indicates the presence of a cancelled action.
        For example, in the sentence "Please do not pass the cheque",
        do not is a negative verb but its an actionable sentence none the less.

        (PRP/PRP$) (MD) (RB) (VB/VBD/VBG/VBN/VBP/VBZ)
        (MD) (RB) (PRP/PRP$) (VB/VBD/VBG/VBN/VBP/VBZ)
        :param pos: sequence of (token, tag) pairs
        :return: True if either pattern occurs
        """
        return (
            self._any_match(self.match4, pos, self._PRONOUN_TAGS, self._MODAL,
                            self._ADVERB, self._VERB_TAGS)
            or self._any_match(self.match4, pos, self._MODAL, self._ADVERB,
                               self._PRONOUN_TAGS, self._VERB_TAGS))

    def rule_2(self, pos):
        """
        A modal verb signals the creation of a commitment.
        For example, the sentence "He will handle the issuance of the LC". contains a modal verb "will"
        It indicates the creation of a commitment.

        (MD) (PRP/PRP$) (VB/VBD/VBG/VBN/VBP/VBZ)
        (PRP/PRP$) (MD) (VB/VBD/VBG/VBN/VBP/VBZ)
        :param pos: sequence of (token, tag) pairs
        :return: True if either pattern occurs
        """
        return (
            self._any_match(self.match3, pos, self._MODAL, self._PRONOUN_TAGS,
                            self._VERB_TAGS)
            # The pronoun-first pattern is listed above (and is the one the
            # example sentence needs) but was previously never checked.
            or self._any_match(self.match3, pos, self._PRONOUN_TAGS,
                               self._MODAL, self._VERB_TAGS))

    def rule_3(self, pos, text_tokens):
        """
        The bi-gram of "please" and an action verb indicates a directive.
        For example, in the sentence "Please review and send along to your attorney as soon as possible",
        the bi-gram please review indicates a directive commitment creation.

        As implemented, the rule fires when a noun tag (NN/NNS/NNP/NNPS) is
        directly followed by a verb tag, or when the token 'please' is present.
        :param pos: sequence of (token, tag) pairs
        :param text_tokens: sequence of token strings
        :return: True if the rule fires
        """
        return (
            self._any_match(self.match2, pos, self._NOUN_TAGS, self._VERB_TAGS)
            or 'please' in text_tokens)

    def rule_4(self, pos):
        """
        A verb followed by pronoun is an action statement
        For example, consider the sentence "Call him"
        :param pos: sequence of (token, tag) pairs
        :return: True if a verb tag is directly followed by PRP or PRP$
        """
        return self._any_match(self.match2, pos, self._VERB_TAGS,
                               self._PRONOUN_TAGS)

    @staticmethod
    def rule_5(text_tokens):
        """
        A question mark in a sentence indicates a directive commitment creation.
        :param text_tokens: sequence of token strings
        :return: True if "?" is one of the tokens
        """
        return "?" in text_tokens

    @staticmethod
    def rule_6(text_tokens):
        """
        Common abbreviations and acronyms such as ASAP, RSVP, ETA, ETD, ET.

        Tokens are compared against the lower-case spellings only (tokens are
        presumably lower-cased by the caller -- TODO confirm).
        :param text_tokens: sequence of token strings
        :return: True if any of the abbreviations is one of the tokens
        """
        # Each membership test must contribute a plain bool; wrapping it in a
        # list (as before) is always truthy and made the rule fire every time.
        return any(
            abbreviation in text_tokens
            for abbreviation in HeuristicRules._ABBREVIATIONS)

    @staticmethod
    def rule_7(nlp, text_tokens):
        """
        Anything which involves completion of a task in the future is an action sentence.
        For examples, in the sentence "It will be posted today and the policy should be drafted by Friday".
        Here today and Friday are the deadlines.

        NOTE(review): only entities typed "TIME" are accepted; words such as
        "today"/"Friday" may be typed differently by the model -- confirm.
        :param nlp: callable turning a string into an iterable of tokens that
            expose ``ent_type_``
        :param text_tokens: sequence of token strings
        :return: True if any token belongs to a "TIME" entity
        """
        doc = nlp(" ".join(text_tokens))
        return any(token.ent_type_ == "TIME" for token in doc)
Ejemplo n.º 11
0
 def __init__(self):
     """Create the logger and the helper object used by this class."""
     # The concrete class name is handed to LoggerUtil.
     self.log = LoggerUtil(self.__class__.__name__).get()
     self.helper = Helper()
Ejemplo n.º 12
0
 def __init__(self):
     """Create the logger, load the configuration and build the helpers."""
     # The concrete class name is handed to LoggerUtil.
     self.log = LoggerUtil(self.__class__.__name__).get()
     self.config = ConfigUtil.get_config_instance()
     self.helper = Helper()
     self.metrics = Metrics()
from Models.Tensorflow import LayersMultiLayerPerceptron2_50
from Utils.Helper import Helper
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    targets = get_bcgw_targets()

    data = rb("data", "data_img", "data_bcgw", targets)
    X = data.Combined.Data()
    y = data.Onehot.Data()
    X = X.reshape(164410, 23)

    # split and normalize data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    mean_vals, std_val = Helper.mean_center_normalize(X_train)

    X_train_centered = (X_train - mean_vals) / std_val
    X_test_centered = (X_test - mean_vals) / std_val

    del X_train, X_test

    print("Training Samples")
    print(X_train_centered.shape, y_train.shape)
    print()
    print('Testing Samples')
    print(X_test_centered.shape, y_test.shape)
    print()

    mlpmodel = LayersMultiLayerPerceptron2_50(X_test_centered.shape[1],
                                              len(targets),