def start(self):
    """Crawl the listing page at ``self.path`` and the other pages it links to.

    Builds ``self.url`` from ``self.urlTemplate``, fetches it (through
    ``self.proxies`` when ``self.inCompany`` is truthy) and, on an HTTP 200
    response, stores the books parsed from that page and then from every
    remaining page reported by ``self.parseAllPage``. Any other status code
    is ignored and nothing is stored.

    :return: None
    """
    print("handle page : " + self.path)
    self.url = str.format(self.urlTemplate, host=self.host, path=self.path)

    request_options = {"headers": self.headers}
    if self.inCompany:
        request_options["proxies"] = self.proxies
    res = requests.get(self.url, **request_options)
    if res.status_code != 200:
        return

    res.encoding = 'UTF-8'
    pageDom = BeautifulSoup(res.text, features="lxml-xml")
    dbName = "NovelMongo"

    books = self.parsePageBook(pageDom)
    self.dataAccess.store_novel_base_infos(books, dbName)

    allPage = self.parseAllPage(pageDom)
    # The current page was handled above; tolerate it being absent from the
    # page list instead of failing with ValueError.
    if self.path in allPage:
        allPage.remove(self.path)
    # presumably a politeness delay between requests -- confirm in Helper
    Helper.sleep_while(scends=20)

    total = len(allPage)
    # enumerate() keeps the progress figure correct even when the same page
    # path occurs twice, and avoids an O(n) index() lookup per iteration.
    for position, page in enumerate(allPage, start=1):
        pageBooks = self.getPageBook(page)
        self.dataAccess.store_novel_base_infos(pageBooks, dbName)
        percent = (position / total) * 100
        print("Process : %.2f%%" % percent)
        if position < total:
            # No need to wait after the last page.
            Helper.sleep_while(scends=20)
    self.dataAccess.find_all(dbName, "BookBaseInfo")
class DecisionTree:
    """Fit a ``DecisionTreeClassifier`` and report its scores on test data."""

    def __init__(self):
        """Create the logger, configuration, helper and metrics collaborators."""
        class_name = self.__class__.__name__
        self.log = LoggerUtil(class_name).get()
        self.config = ConfigUtil.get_config_instance()
        self.helper = Helper()
        self.metrics = Metrics()

    @staticmethod
    def train_model(x_train, y_train):
        """Return a ``DecisionTreeClassifier`` fitted on the given training data."""
        classifier = DecisionTreeClassifier()
        classifier.fit(x_train, y_train)
        return classifier

    @staticmethod
    def test_model(model, x_test):
        """Return ``model.predict(x_test)``."""
        predictions = model.predict(x_test)
        return predictions

    def main(self, x_train, x_test, y_train, y_test):
        """Train on the training split, score on the test split, save the plot.

        The confusion matrix is handed to ``self.helper.plot_save_cnf_matrix``
        together with the ``image_path`` read from the configuration.

        :return: dict with the fitted ``model`` and a ``metrics`` dict holding
                 ``accuracy``, ``classification_report`` and ``confusion_matrix``
        """
        image_path = self.config["image_path"]
        class_name = self.__class__.__name__

        model = self.train_model(x_train, y_train)
        self.log.info("{} Model performance on test data".format(class_name))

        y_pred = self.test_model(model, x_test)
        acc_score, cr_report, cnf_matrix = self.metrics.metrics(
            y_true=y_test, y_predicted=y_pred)
        self.helper.plot_save_cnf_matrix(
            cnf_matrix,
            flag="test",
            model_name=class_name,
            image_path=image_path)

        scores = {
            "accuracy": acc_score,
            "classification_report": cr_report,
            "confusion_matrix": cnf_matrix,
        }
        return {"model": model, "metrics": scores}
def __init__(self):
    """Create the logger, configuration and collaborator objects for this instance."""
    class_name = self.__class__.__name__
    # Logger is named after the concrete class.
    self.log = LoggerUtil(class_name).get()
    self.config = ConfigUtil.get_config_instance()
    # Collaborators, created in the same order as before.
    self.model = LinguisticModel()
    self.read_data = ReadData()
    self.metrics = Metrics()
    self.helper = Helper()
    self.logic = Logic()
class TestModel:
    """Run the trained models on one typed sentence or on the tagged data set."""

    # Legacy output directory; used only when the configuration has no
    # "image_path" entry.
    _FALLBACK_IMAGE_PATH = "/home/satwik/Documents/Hiring/huddl_assignment/Images/"

    def __init__(self):
        """Create the logger, configuration and collaborator objects."""
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.model = LinguisticModel()
        self.read_data = ReadData()
        self.metrics = Metrics()
        self.helper = Helper()
        self.logic = Logic()

    def check_if_trained(self):
        """Load the persisted vectorizer and MLP model.

        Both files are read with ``joblib.load`` from the directory named by
        the ``models_path`` configuration entry.

        :return: tuple ``(vectorizer, model)`` -- note the order
        """
        models_path = self.config["models_path"]
        model = joblib.load(models_path + "/" + "mlp.mdl")
        vectorizer = joblib.load(models_path + "/" + "vectorizer.mdl")
        return vectorizer, model

    def _image_path(self):
        """Return the configured image directory, or the legacy default."""
        try:
            return self.config["image_path"]
        except (KeyError, TypeError):
            return self._FALLBACK_IMAGE_PATH

    def main(self, test=False):
        """Classify one sentence read from stdin, or evaluate on tagged data.

        :param test: when truthy, read a sentence with ``input()`` and log the
                     rule-based prediction, the ML prediction and their
                     logical OR; otherwise score the model on the tagged data
                     and save the confusion-matrix plot.
        :return: None
        """
        if test:
            nlp = load("en_core_web_sm")
            vectorizer, model = self.check_if_trained()

            self.log.info("Please enter the sentence")
            sentence = str(input())

            tokens = self.read_data.transform_sentence(sentence)
            features = vectorizer.transform(tokens)
            predictions = model.predict(features)
            ling_pred = self.logic.apply_rules(text_tokens=tokens, nlp=nlp)

            self.log.info("Given sentence : {}".format(sentence))
            self.log.info(
                "Prediction of Linguistic Model : {}".format(ling_pred))
            self.log.info("Prediction of ML Model : {}".format(
                any(predictions)))
            self.log.info("Final Prediction : {}".format(ling_pred
                                                         or any(predictions)))
        else:
            # NOTE(review): unpacked as (model, vectorizer), the reverse of
            # check_if_trained() -- confirm against LinguisticModel.main().
            model, vectorizer = self.model.main()
            tagged_data_df = self.read_data.prepare_tagged_data()
            features = vectorizer.transform(tagged_data_df["data"])
            labels = tagged_data_df["labels"]
            predictions = model.predict(features)
            acc_score, cr_report, cnf_matrix = self.metrics.metrics(
                y_true=labels, y_predicted=predictions)
            # Save where the configuration says, not into one developer's
            # home directory.
            self.helper.plot_save_cnf_matrix(
                cnf_matrix=cnf_matrix,
                model_name="satwik",
                flag="test",
                image_path=self._image_path())
def __init__(self):
    """Create the logger, configuration, helpers, stemmer and stop-word set."""
    # Extra tokens treated as stop words on top of STOP_WORDS.
    extra_stop_words = {
        'ect', 'hou', 'com', 'recipient', 'na', 'ou', 'cn', 'enron', 'zdnet'
    }
    class_name = self.__class__.__name__

    self.log = LoggerUtil(class_name).get()
    self.config = ConfigUtil.get_config_instance()
    self.process_data = ProcessData()
    self.helper = Helper()
    self.ps = PorterStemmer()
    self.stop_words = set.union(STOP_WORDS, extra_stop_words)
def store_novel_base_infos(self, books: list, db_name: str):
    """Insert every book that is not yet in the ``BookBaseInfo`` collection.

    A book counts as already stored when a record exists whose ``bookId``
    equals ``Helper.md5_hash(book.name, False).MD5``; such books are skipped
    with a warning. The connection opened by ``self._connect_`` is always
    released through ``self._close_``, even when a lookup or insert fails.

    :param books: objects exposing ``name``; their ``__dict__`` is stored
    :param db_name: name handed to ``self._connect_``
    :return: None
    """
    db = self._connect_(db_name)
    try:
        bookTable = db.BookBaseInfo
        for book in books:
            try:
                condition = {'bookId': Helper.md5_hash(book.name, False).MD5}
                record = bookTable.find_one(condition)
                if record is None:
                    bookTable.insert_one(book.__dict__)  # convert to dict
                    self.logger.info("Insert Success:" + book.name)
                else:
                    self.logger.warning("Insert Failed: Duplicate id --" + book.name)
            except DuplicateKeyError as dke:
                # str() is required: concatenating the exception object itself
                # raises TypeError and hides the real error.
                self.logger.error(
                    "Insert Failed: DuplicateKeyError objectId --\n" + str(dke))
    finally:
        self._close_()
def parsePageBook(self, dom: BeautifulSoup):
    """
    Collect the books (name, link) listed on one page.

    Each ``ul.ditu > li`` element yields one ``BookBase`` whose link and name
    come from the element's first ``<a>``; ``bookId`` and ``salt`` are taken
    from ``Helper.md5_hash`` of the name.

    :param dom: BeautifulSoup Dom
    :return: Book list
    """
    collected = []
    for item in dom.select('ul.ditu > li'):
        entry = BookBase()
        anchor = item.select('a')[0]  # first link inside the list item
        entry.link = anchor['href']
        entry.name = anchor.text
        digest = Helper.md5_hash(entry.name, False)
        entry.bookId = digest.MD5
        entry.salt = bson.binary.Binary(digest.Salt)
        collected.append(entry)
    return collected
def store_novel_base_info(self, book: BookBase, db_name: str):
    """Insert one book into ``BookBaseInfo``, or replace its existing record.

    The record is looked up by ``bookId`` equal to
    ``Helper.md5_hash(book.name, False).MD5``. The connection opened by
    ``self._connect_`` is always released through ``self._close_``.

    :param book: object exposing ``name``; its ``__dict__`` is stored
    :param db_name: name handed to ``self._connect_``
    :return: None
    """
    db = self._connect_(db_name)
    try:
        bookTable = db.BookBaseInfo
        condition = {'bookId': Helper.md5_hash(book.name, False).MD5}
        self.logger.debug("Lookup condition: " + str(condition))
        record = bookTable.find_one(condition)
        self.logger.debug("Existing record: " + str(record))
        if record is None:
            # insert
            self.logger.debug("Inserting: " + str(book.__dict__))
            insertRes = bookTable.insert_one(book.__dict__)
            self.logger.info("Insert Success: " + str(insertRes.inserted_id))
        else:
            # update -- Collection.update() is deprecated and gone in
            # PyMongo 4; replace_one() performs the same whole-document
            # replacement.
            updateRes = bookTable.replace_one(condition, book.__dict__)
            self.logger.info("Update Success: " + str(updateRes.raw_result))
    finally:
        self._close_()
def train_mlp(sess, model, X_train, y_train, num_epochs=100):
    """Run ``model.train_op`` over mini-batches for ``num_epochs`` epochs.

    ``model.init_op`` is run once first. Every epoch draws batches of 128
    from ``Helper.create_batch_generator`` and prints the mean batch cost.

    :param sess: object with a ``run`` method (presumably a TensorFlow
                 session -- confirm with the caller)
    :param model: exposes ``init_op``, ``train_op``, ``cost``, ``tf_x``, ``tf_y``
    :param X_train: training inputs passed to the batch generator
    :param y_train: training targets passed to the batch generator
    :param num_epochs: number of passes over the data
    :return: list of the batch costs of the LAST epoch; an empty list when
             ``num_epochs`` is zero or negative
    """
    sess.run(model.init_op)
    # Bound up front so a zero-epoch run returns [] instead of raising
    # UnboundLocalError.
    training_costs = []
    for epoch in range(num_epochs):
        training_costs = []
        batch_generator = Helper.create_batch_generator(
            X_train, y_train, batch_size=128)
        for batch_X, batch_y in batch_generator:
            feed = {model.tf_x: batch_X, model.tf_y: batch_y}
            _, batch_cost = sess.run([model.train_op, model.cost],
                                     feed_dict=feed)
            training_costs.append(batch_cost)
        print('-- Epoch %2d '
              'Avg Training Loss: %4f' % (epoch + 1, np.mean(training_costs)))
    return training_costs
class HeuristicRules:
    """
    These rules are adapted from the paper "Identifying Business Tasks and
    Commitments from Email and Chat Conversations"

    Every ``pos`` argument is a sequence of ``(token, tag)`` pairs: the
    matchers compare the element at index 1 with the requested tags and join
    the elements at index 0 of the first matching window.

    TODO: Parsing based on POS tags. Can we use NLTK Regular Expressions for
    better efficiency?
    """

    # Tag groups shared by the rules below.
    _VERB_TAGS = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
    _PRONOUN_TAGS = ("PRP", "PRP$")
    _NOUN_TAGS = ("NN", "NNS", "NNP", "NNPS")

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.helper = Helper()

    @staticmethod
    def match2(pos, tag1, tag2):
        """Return the first two adjacent tokens tagged ``tag1 tag2``, or ""."""
        for i, j in zip(pos, pos[1:]):
            if i[1] == tag1 and j[1] == tag2:
                return " ".join([i[0], j[0]])
        return ""

    @staticmethod
    def match3(pos, tag1, tag2, tag3):
        """Return the first three adjacent tokens tagged ``tag1 tag2 tag3``, or ""."""
        for x, y, z in zip(pos, pos[1:], pos[2:]):
            if x[1] == tag1 and y[1] == tag2 and z[1] == tag3:
                return " ".join([x[0], y[0], z[0]])
        return ""

    @staticmethod
    def match4(pos, tag1, tag2, tag3, tag4):
        """Return the first four adjacent tokens tagged ``tag1 .. tag4``, or ""."""
        for p, q, r, s in zip(pos, pos[1:], pos[2:], pos[3:]):
            # The fourth comparison must look at the tag (s[1]); comparing
            # the whole (token, tag) pair never matched.
            if (p[1] == tag1 and q[1] == tag2 and r[1] == tag3
                    and s[1] == tag4):
                return " ".join([p[0], q[0], r[0], s[0]])
        return ""

    def _matches_any(self, matcher, pos, patterns):
        """Return True when ``matcher`` finds at least one of ``patterns`` in ``pos``.

        Every pattern is evaluated (no short-circuit) and each result goes
        through ``self.helper.is_list_not_empty`` as a list of characters.
        """
        outcomes = [
            self.helper.is_list_not_empty(list(matcher(pos, *tags)))
            for tags in patterns
        ]
        return any(outcomes)

    def rule_1(self, pos):
        """
        The negative verb indicates the presence of a cancelled action.
        For example, in the sentence "Please do not pass the cheque",
        do not is a negative verb but its an actionable sentence none the less.

        (PRP/PRP$) (MD) (RB) (VB/VBD/VBG/VBN/VBP/VBZ)
        (MD) (RB) (PRP/PRP$) (VB/VBD/VBG/VBN/VBP/VBZ)

        :param pos: sequence of (token, tag) pairs
        :return: True when any of the patterns above occurs in ``pos``
        """
        patterns = [(pronoun, "MD", "RB", verb)
                    for pronoun in self._PRONOUN_TAGS
                    for verb in self._VERB_TAGS]
        patterns += [("MD", "RB", pronoun, verb)
                     for pronoun in self._PRONOUN_TAGS
                     for verb in self._VERB_TAGS]
        return self._matches_any(self.match4, pos, patterns)

    def rule_2(self, pos):
        """
        A modal verb signals the creation of a commitment.
        For example, the sentence "He will handle the issuance of the LC".
        contains a modal verb "will" It indicates the creation of a commitment.

        (MD) (PRP/PRP$) (VB/VBD/VBG/VBN/VBP/VBZ)
        (PRP/PRP$) (MD) (VB/VBD/VBG/VBN/VBP/VBZ)

        NOTE(review): only the first pattern (MD, pronoun, verb) is checked;
        the second one listed above is not implemented.

        :param pos: sequence of (token, tag) pairs
        :return: True when the first pattern occurs in ``pos``
        """
        patterns = [("MD", pronoun, verb)
                    for pronoun in self._PRONOUN_TAGS
                    for verb in self._VERB_TAGS]
        return self._matches_any(self.match3, pos, patterns)

    def rule_3(self, pos, text_tokens):
        """
        The bi-gram of "please" and an action verb indicates a directive.
        For example, in the sentence "Please review and send along to your
        attorney as soon as possible", the bi-gram please review indicates a
        directive commitment creation.

        As implemented: True when a noun tag (NN/NNS/NNP/NNPS) is directly
        followed by a verb tag, or when 'please' is one of ``text_tokens``.

        :param pos: sequence of (token, tag) pairs
        :param text_tokens: tokens of the sentence
        :return: bool
        """
        patterns = [(noun, verb)
                    for noun in self._NOUN_TAGS
                    for verb in self._VERB_TAGS]
        noun_then_verb = self._matches_any(self.match2, pos, patterns)
        return noun_then_verb or 'please' in text_tokens

    def rule_4(self, pos):
        """
        A verb followed by pronoun is an action statement
        For example, consider the sentence "Call him"

        :param pos: sequence of (token, tag) pairs
        :return: True when a verb tag is directly followed by PRP or PRP$
        """
        patterns = [(verb, pronoun)
                    for pronoun in self._PRONOUN_TAGS
                    for verb in self._VERB_TAGS]
        return self._matches_any(self.match2, pos, patterns)

    @staticmethod
    def rule_5(text_tokens):
        """
        A question mark in a sentence indicates a directive commitment creation.

        :return: True when "?" is one of ``text_tokens``
        """
        return "?" in text_tokens

    @staticmethod
    def rule_6(text_tokens):
        """
        Common abbreviations and acronyms such as ASAP, RSVP, ETA, ETD, ET.

        Matching is exact against the lower-case spellings below.

        :return: True when any of the abbreviations is one of ``text_tokens``
        """
        abbreviations = ('asap', 'a.s.a.p', 'rsvp', 'r.s.v.p', 'eta',
                         'e.t.a', 'etd', 'e.t.d', 'et', 'e.t')
        # Plain booleans, not one-element lists: a non-empty list is always
        # truthy and made this rule fire for every sentence.
        return any(abbreviation in text_tokens
                   for abbreviation in abbreviations)

    @staticmethod
    def rule_7(nlp, text_tokens):
        """
        Anything which involves completion of a task in the future is an action
        sentence. For examples, in the sentence "It will be posted today and the
        policy should be drafted by Friday". Here today and Friday are the
        deadlines.

        :param nlp: callable applied to the space-joined tokens; the items it
                    yields must expose ``ent_type_`` (presumably a spaCy
                    pipeline -- confirm with the caller)
        :param text_tokens: tokens of the sentence
        :return: True when any item has ``ent_type_ == "TIME"``
        """
        doc = nlp(" ".join(text_tokens))
        return any(token.ent_type_ == "TIME" for token in doc)
def __init__(self):
    """Create the class-named logger and the shared helper."""
    class_name = self.__class__.__name__
    self.log = LoggerUtil(class_name).get()
    self.helper = Helper()
def __init__(self):
    """Create the logger, configuration, helper and metrics collaborators."""
    class_name = self.__class__.__name__
    # Logger is named after the concrete class.
    self.log = LoggerUtil(class_name).get()
    self.config = ConfigUtil.get_config_instance()
    self.helper = Helper()
    self.metrics = Metrics()
from Models.Tensorflow import LayersMultiLayerPerceptron2_50 from Utils.Helper import Helper from sklearn.model_selection import train_test_split if __name__ == "__main__": targets = get_bcgw_targets() data = rb("data", "data_img", "data_bcgw", targets) X = data.Combined.Data() y = data.Onehot.Data() X = X.reshape(164410, 23) # split and normalize data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) mean_vals, std_val = Helper.mean_center_normalize(X_train) X_train_centered = (X_train - mean_vals) / std_val X_test_centered = (X_test - mean_vals) / std_val del X_train, X_test print("Training Samples") print(X_train_centered.shape, y_train.shape) print() print('Testing Samples') print(X_test_centered.shape, y_test.shape) print() mlpmodel = LayersMultiLayerPerceptron2_50(X_test_centered.shape[1], len(targets),