def init(self, name, removeStopWords=True, faq_gsheet=None, hash_featurez=False):
    super().init(name)
    src = 'RSNNModel.init'

    # these are dict( class_cat : response | question )
    self.faq_responses_db, self.faq_classify_phrases_db = dataSource.doGSheet_FAQ(faq_gsheet, dataSource.zGSHEET_FAQ)
    # self.num_classes = len( self.faq_responses_db.keys() )

    # force labelz into numerics  TODO: refine reuse @priors
    self.class_categoriez = {}
    for i, v in enumerate(self.faq_responses_db.keys()):
        self.class_categoriez[v] = i
    self.class_categoriez["I don't know yet. Will learn more"] = len(self.class_categoriez)
    self.num_classes = len(self.class_categoriez)

    zlogger.log(src, self.showParams())

    ## feature mappers
    feature_col_mapper = [
        tf.feature_column.embedding_column(
            categorical_column=tf.feature_column.categorical_column_with_hash_bucket(key='user_que', hash_bucket_size=100)
                if hash_featurez
                # identity columns require num_buckets (and integer-encoded input)
                else tf.feature_column.categorical_column_with_identity('user_que', num_buckets=self.num_classes),
            dimension=int(self.num_classes ** 0.25)
        )
    ]

    self.model = tf.estimator.DNNClassifier(
        feature_columns=feature_col_mapper,
        hidden_units=[256, 128],   # hyper params?? TODO: ref and param
        model_dir=self.getModelFPath(),
        n_classes=self.num_classes,
        # label_vocabulary=list(self.class_categoriez.keys())  # if defined here then labelz can be string in map_input_fn below
    )
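# --------------------------------------------------------------------------
# Illustrative sketch (not part of RSNNModel): the embedding dimension above
# follows the common "fourth root of the category count" rule of thumb. The
# names and counts below are hypothetical, assuming the TF 1.x-style
# tf.feature_column APIs used in init().
import tensorflow as tf

num_classes = 30
user_que_col = tf.feature_column.categorical_column_with_hash_bucket(
    key='user_que', hash_bucket_size=100)         # raw strings -> 100 hash buckets
embedded = tf.feature_column.embedding_column(
    categorical_column=user_que_col,
    dimension=max(1, int(num_classes ** 0.25)))   # int(30 ** 0.25) == 2 dims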
def dump(self, fpath=None):
    fpath = self.model_fpath if fpath is None else fpath
    try:
        with open(fpath, "wb") as fd:
            pickle.dump(self.model, fd)
        zlogger.log("{}.model.dump".format(self.__class__), "Model saved to file successfully")
    except (OSError, pickle.PicklingError):
        zlogger.logError("{}.model.dump".format(self.__class__), "Failed to pickle to file - {}".format(fpath))
def predict(self, input_text):
    clean_encoded_text = self.preprocessText(input_text)
    zlogger.log('mlp.predict', "IN: {}".format(repr(clean_encoded_text)))
    idx = self.model.predict(clean_encoded_text)
    zlogger.log('mlp.predict', "ANS: {}".format(idx))
    return idx
def dumpSave(self, fpath=None):
    fpath = self.getModelFPath(fpath)
    try:
        with open(fpath, "wb") as fd:
            pickle.dump(self.persist, fd)
        zlogger.log("{}.model.dump".format(self.__class__), "Model saved to file successfully")
    except (OSError, pickle.PicklingError):
        zlogger.logError("{}.model.dump".format(self.__class__), "Failed to pickle to file - {}".format(fpath))
def load(self, fpath=None):
    # 1. model definition
    super().load(fpath)

    # 2. training data
    fpath = "{}.dat".format(self.model_fpath if fpath is None else fpath)
    try:
        with open(fpath, "rb") as fd:
            self.dataset = pickle.load(fd)
        zlogger.log("{}.dataset.load".format(self.__class__), "Dataset loaded from file successfully")
    except (OSError, pickle.UnpicklingError):
        zlogger.logError("{}.dataset.load".format(self.__class__), "Failed to unpickle from file - {}".format(fpath))
def load(self, fpath=None):
    self.name = getClassName() if fpath is None else re.search(r'(.*)\.zmd', fpath)[1]
    fpath = self.getModelFPath() if fpath is None else fpath
    try:
        with open(fpath, "rb") as fd:
            self.model = pickle.load(fd)
        zlogger.log("{}.model.load".format(self.__class__), "Model loaded from file successfully")
    except (OSError, pickle.UnpicklingError):
        zlogger.logError("{}.model.load".format(self.__class__), "Failed to unpickle from file - {}".format(fpath))
def dump(self, fpath=None):
    # 1. model definition
    super().dump(fpath)

    # 2. training data
    fpath = "{}.dat".format(self.model_fpath if fpath is None else fpath)
    try:
        with open(fpath, "wb") as fd:
            pickle.dump(self.dataset, fd)
        zlogger.log("{}.dataset.dump".format(self.__class__), "Dataset dumped to file successfully")
    except (OSError, pickle.PicklingError):
        zlogger.logError("{}.dataset.dump".format(self.__class__), "Failed to pickle to file - {}".format(fpath))
def predict(self, input_text):
    zlogger.log('cosine.predict', "IN: {}".format(repr(input_text)))
    zlogger.log('cosine.predict', "IN.PREPROC: {}".format(repr(self.preprocessor)))
    clean_input_text = self.preprocessText(input_text)
    zlogger.log('cosine.predict', "IN.CLEAN: {}".format(repr(clean_input_text)))
    # if not isinstance(clean_input_text, str):
    #     clean_input_text = " ".join(list(clean_input_text))

    input_vec = self.model.transform(clean_input_text)
    valz = cosine_similarity(input_vec, self.trained_matrix)
    idx = valz.argsort()[0][-1]      # index of the most similar trained sentence
    resp = valz[0][idx]              # its similarity score

    if resp <= self._predict_threshold:   ## TODO threshold it at .5
        idx = None
    zlogger.log('CosSimilarity.Predict', "idx = {}, resp = {}".format(idx, resp))
    return idx
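# --------------------------------------------------------------------------
# Standalone sketch of the retrieval idea in predict() above: vectorize a
# corpus, transform the query, take the argmax cosine similarity, and return
# None below a threshold. The corpus and threshold here are illustrative,
# not the class's actual members.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = ["covid spreads through droplets", "wash your hands often"]
vec = TfidfVectorizer()
trained_matrix = vec.fit_transform(corpus)

query_vec = vec.transform(["how does covid spread"])
valz = cosine_similarity(query_vec, trained_matrix)   # shape (1, len(corpus))
idx = valz.argsort()[0][-1]
print(idx if valz[0][idx] > 0.1 else None)            # 0 -> first corpus sentence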
def splitTrainTest(clean_data, test_prop=0.2):
    the_data = np.array(clean_data)
    zlogger.log('splitTrainTest', "Provided data size = {}\n{}".format(len(the_data), the_data[0]))
    n_recs = len(the_data)
    n_test = math.trunc(test_prop * n_recs)   # honour test_prop (was hard-coded 0.2)

    # shuffle
    np.random.shuffle(the_data)

    # split
    train_data, test_data = the_data[:(n_recs - n_test)], the_data[(n_recs - n_test):]

    # TODO: should we flatten and when
    return list(train_data), list(test_data)
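# --------------------------------------------------------------------------
# Hypothetical usage sketch for splitTrainTest on toy data, assuming numpy,
# math, and zlogger are available as in the function above.
toy = ["sentence {}".format(i) for i in range(10)]
train, test = splitTrainTest(toy, test_prop=0.2)
assert len(train) == 8 and len(test) == 2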
def predict(self, observation):
    src = 'RSNNModel.predict'

    def map_input_predict_data():
        return {'user_que': np.array([observation])}

    def fetchResponse(pred_class_id):
        # class_categoriez maps class_cat -> id, so invert it to recover the category
        id_to_cat = {i: cat for cat, i in self.class_categoriez.items()}
        pclass = id_to_cat.get(pred_class_id)
        return self.faq_responses_db.get(pclass, "I don't seem to know about that yet. I'll find out more")

    # DNNClassifier.predict returns a generator of per-example result dicts
    pred = next(self.model.predict(input_fn=map_input_predict_data))
    pred_class_id = int(pred['class_ids'][0])
    zlogger.log(src, "predicted value = {} i.e. class id {}".format(pred, pred_class_id))
    return fetchResponse(pred_class_id)
def loadDump(self, fpath=None):
    ## 1. load other objects
    ZModel.loadDump(self, fpath)

    ## 2. load keras model
    krs_path = self.persist['model']
    zlogger.log('NgramMLP.loadDump', "Loading From: {}".format(krs_path))
    self.model = keras.models.load_model(krs_path)

    # ## load keras model as json
    # with open(krs_path, 'r') as fd:
    #     self.model = keras.models.model_from_json(fd.read())
    # # and the weights
    # self.model.load_weights("{}.h5".format(krs_path))

    zlogger.log('NgramMLP.loadDump', "FIN: {}".format(self.model.summary()))
def loadDump(self, fpath=None):
    fpath = self.getModelFPath(fpath)

    def unpackPersist():
        if self.persist is not None:
            for k, v in self.persist.items():
                setattr(self, k, v)

    try:
        with open(fpath, "rb") as fd:
            self.persist = pickle.load(fd)
        zlogger.log("{}.model.load".format(self.__class__), "Model loaded from file successfully")
    except (OSError, pickle.UnpicklingError):
        zlogger.logError("{}.model.load".format(self.__class__), "Failed to unpickle from file - {}".format(fpath))

    unpackPersist()
    zlogger.log("{}.model.load".format(self.__class__), "Persist unpacked successfully")
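# --------------------------------------------------------------------------
# Sketch of the persist round trip assumed by dumpSave/loadDump: attributes
# are packed into a dict, pickled, and later unpacked back onto the instance
# via setattr, mirroring unpackPersist(). Class and field names here are
# illustrative.
import pickle

class _Toy:
    pass

blob = pickle.dumps({"name": "faq_bot", "_predict_threshold": 0.5})

t = _Toy()
for k, v in pickle.loads(blob).items():
    setattr(t, k, v)
print(t.name, t._predict_threshold)   # faq_bot 0.5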
def dumpLoad(self, data_path=None, data_type=None):
    self.initz()   ## TODO: reconcile self.data_path = data_path
    dpath = self.data_path if data_path is None else data_path
    dtype = self.data_type if data_type is None else data_type

    filez = self.getDumpLoadItems()
    for ext, db in filez.items():
        tf = "{}.{}".format(dpath, ext)
        if os.path.exists(tf):
            setattr(self, db, zdata_source.readFrom(tf, dtype=dtype))
            zlogger.log('zdataset.dumpLoad', "Loaded {} of size {}".format(tf, len(getattr(self, db))))
        else:
            zlogger.log('zdataset.dumpLoad', "Not Found: {}".format(tf))

    self.data = self.clean_data
    self.updateXIndex()
    self.updateYIndex()
def runBot(isGsheetDB=False, model_type=botLogic.MODEL_TFIDF):
    src = "nCoV19.runBot"
    zlogger.log(src, "Starting")

    # 1. setup bot
    bot = BotLogicFlow()
    bot.initializeModel(model_type, "{}.zmd".format(app_name))

    # 2. run bot
    while 1:
        user_input = input(colored("Talk to me: ", "yellow"))
        prompt = colored(">>>: ", "green")
        response, rcode = bot.getResponse(user_input)

        if isGsheetDB and response and rcode == BotLogicFlow.RCODE_LEARNT_RESPONSE:
            idx = gsheet_faq_training_set_db.get(response, None)   # fetch class name
            zlogger.log(src, idx)
            response = gsheet_faq_db.get(idx, "I don't know that yet. I'll find out more")

        print("{} {}\n".format(prompt, "I don't understand. Try that again" if response is None else response))

        if rcode == BotLogicFlow.RCODE_EXIT_RESPONSE:
            break

    zlogger.log(src, "Finished")
def init_app(config_obj=conf):
    zlogger.log("app.init_app", f"{config_obj}")
    zlogger.log("app_pkg.py", f": {__name__}")

    app = Flask(__name__)
    app.config.from_object(config_obj)

    db.init_app(app)
    login_manager.init_app(app)
    bcrypt.init_app(app)

    from <app_pkg_name>.errors.handlers import errors
    from <app_pkg_name>.faq.routes import faqs
    app.register_blueprint(errors)
    app.register_blueprint(faqs)

    ## TODO: disable if not in use
    with app.app_context():
        db.create_all()
        db.session.commit()

    return app
def initializeBotEnv(src_path, src_type=dataSource.zFILE, model_type=botLogic.MODEL_TFIDF, nostopwords=True):
    global model, gsheet_faq_db, gsheet_faq_training_set_db
    src = "nCoV19_bot.initialize"

    # 1. fetch data text
    list_sentz = None
    if src_type == dataSource.zGSHEET_FAQ:
        gsheet_faq_db, gsheet_faq_training_set_db = dataSource.doGSheet_FAQ(src_path, src_type)
        list_sentz = list(gsheet_faq_training_set_db.keys())
    else:
        list_sentz = dataSource.readFrom(src_path, src_type)
    zlogger.log(src, "Loaded data text of size {}".format(len(list_sentz)))

    # 2. initialize and train model
    model = botLogic.AVAILABLE_MODELZ.get(model_type, TfidfModel)
    model = model()
    model.init(app_name, removeStopWords=nostopwords)
    model.train(list_sentz)
    zlogger.log(src, "Initialized & Trained TF-IDF Model {}".format(model))

    # 3. save model
    model.dump()
def getResponse(self, user_input_text):
    response = None
    rcode = self.RCODE_KNOWN_RESPONSE
    key_words = cleanup_and_lemmatize(user_input_text)

    was_que = True
    for word in key_words:
        if word in self.GREETINGZ_INPUT:
            response = random.choice(self.GREETINGZ_RESPONSE)
            was_que = False
            break
        elif word in self.THANKS_INPUT:
            response = random.choice(self.THANKS_RESPONSE)
            was_que = False
            break
        elif word in self.EXIT_INPUT:
            response = random.choice(self.THANKS_RESPONSE) + ". " + random.choice(self.EXIT_RESPONSE)
            rcode = self.RCODE_EXIT_RESPONSE
            return response, rcode

    if was_que:
        pred_cat = self.model.predict(user_input_text)
        zlogger.log("bot.Predicted", "IN = {}".format(repr(pred_cat)))
        if isinstance(pred_cat, list):
            pred_cat = pred_cat[0]
        pred_cat, response = self.dset.getPredictedAtIndex(pred_cat)
        zlogger.log("bot.Predicted", "Class = {}".format(repr(pred_cat)))
        if isinstance(response, list):
            response, response_src, response_link, *_ = response
        rcode = self.RCODE_LEARNT_RESPONSE

    return response, rcode
    def predict(self, observation):
        raise NotImplementedError

    '''
    Input:
        train_x : a list of sentences
    '''
    def train(self, train_x, train_y=None, test_x=None, test_y=None):
        raise NotImplementedError


if __name__ == "__main__":
    src = 'rnnModel.main'
    dpath = [('1EuvcPe9WXSQTsmSqhq0LWJG4xz2ZRQ1FEdnQ_LQ-_Ks', 'FAQ responses!A1:G1000'),
             ('1EuvcPe9WXSQTsmSqhq0LWJG4xz2ZRQ1FEdnQ_LQ-_Ks', 'Classify_Phrases!A1:G1000')]

    zlogger.log(src, 'STARTING')

    rnn = RetrievalSupervisedNNModel()
    rnn.init('DNNClassifier', faq_gsheet=dpath, hash_featurez=True)
    zlogger.log(src, "Model is:\n{}".format(rnn))

    # train
    rnn.train()
    zlogger.log(src, 'Done Training. Moving on to predict using trained model')

    # predict (sample queries, including misspellings)
    sentz = ["Is my cat sick",
             "Is my cat sick with the virus",
             "Can an insect infect me",
             "What is corana",
             "What is corana virus",
             "What is covid-19"]
    for s in sentz:
        r = rnn.predict(s)
        zlogger.log("{}.predict".format(src), "\n{} ==> {}".format(s, r))
def train(self, train_x=None, train_y=None, test_x=None, test_y=None, epochs=25000):
    # NOTE: tf.estimator's `steps` counts training steps (batches), not epochs
    self.model.train(input_fn=self.map_input_train_data, steps=epochs)
    zlogger.log('RSNNModel.train', "FINISHED: {} epochs".format(epochs))
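# --------------------------------------------------------------------------
# map_input_train_data is defined elsewhere in the class; a minimal input_fn
# for tf.estimator typically returns a tf.data.Dataset of (features, labels).
# This is a hypothetical sketch with made-up data, not the project's actual
# implementation.
import numpy as np
import tensorflow as tf

def example_input_fn():
    features = {'user_que': np.array(["what is covid", "can cats get sick"])}
    labels = np.array([0, 1])
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    return ds.shuffle(10).repeat().batch(2)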
        else:
            # return "{}\n\t{}".format(sent_tokenz[idx], sent_tokenz[idx + 1])
            return "{}".format(sent_tokenz[idx])

    '''
    Input:
        train_x : a list of sentences
    '''
    def train(self, train_x, train_y=None, test_x=None, test_y=None):
        self.dataset = train_x if isinstance(train_x, list) else nltk.sent_tokenize(train_x)


if __name__ == "__main__":
    zlogger.log("tfidfModel.main", "Starting")
    src = "tfidfModel.main.test"
    named = "TFIDF_ChatBot"
    st = ("The quick brown fox jumped over the lazy dogs. This is an account of a lost dog. "
          "His name was Jazzy and he had 7 bones. Hey there! Okay, bye.")

    for ist in [True, False]:
        wt = "Without" if ist else "With"
        zlogger.log(src, "\n\n{0} {1} Stop Words {0}".format("-" * 7, wt))
        m = TfidfModel()
        m.init(named, removeStopWords=ist)
        m.train(st)
        zlogger.log(src, "Data: {}\nModel: {}\n".format(st, m))
            if v in ln.lower():
                tmp[i] = 1
        vect.append(tmp)
    return vocab, np.array(vect)

'''
Input: dataset to be operated on
Output: dataset after operation
'''
if __name__ == "__main__":
    st = ("The quick brown fox jumped over the lazy dogs. This is an account of a lost dog. "
          "His name was Jazzy and he had 7 bones. Hey there! Okay, bye.")
    zlogger.log("dataSet.main", "Starting")
    src = "dataSet.main.example"

    df = initiateDataset(st)
    zlogger.log(src, "Dataset of {} lines".format(len(df)))
    print(df, "\n")

    tokenz = wordTokenizeWithoutPunctuations(df)
    vocab = getVocabList(df)
    zlogger.log(src, "There are {} words and {} vocab".format(len(tokenz), len(vocab)))
    print("{}\n{}\n".format(tokenz, vocab))

    vocab, matrix = oneHotEncode_LemmaBagOfWords(st)
    zlogger.log(src, "Vocab = {} Matrix = {}".format(len(vocab), matrix.shape))
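# --------------------------------------------------------------------------
# The head of oneHotEncode_LemmaBagOfWords is cut off above; this standalone
# sketch shows the same one-hot bag-of-words idea end to end (hypothetical
# names, lemmatization omitted).
import numpy as np

def one_hot_bow(lines, vocab):
    vect = []
    for ln in lines:
        tmp = [0] * len(vocab)
        for i, v in enumerate(vocab):
            if v in ln.lower():
                tmp[i] = 1
        vect.append(tmp)
    return vocab, np.array(vect)

_, matrix = one_hot_bow(["The quick brown fox", "He had 7 bones"],
                        ["quick", "fox", "bones"])
print(matrix)   # [[1 1 0]
                #  [0 0 1]]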
        }
        return {**tmp, **tmp2}

    def getPredictedAtIndex(self, y_index):
        if y_index is None:
            return None, None
        # zlogger.log('zdataset.get_y-at', "IN: {}".format(y_index))
        class_cat = self.y_labelz[y_index]
        # zlogger.log('zdataset.get_y-at', "CAT: {}".format(class_cat))
        return class_cat, self.faq_db.get(class_cat, None)


###########################################################
if __name__ == "__main__":
    src = "dataset.main"
    zlogger.log(src, ">>>>> STARTING\n")

    st = ("The quick brown fox jumped over the lazy dogs. This is an account of a lost dog. "
          "His name was Jazzy and he had 7 bones. Hey there! Okay, bye.")
    # st = nltk.sent_tokenize(st)
    ds = ["The quick brown fox", "He had 7 bones"]
    ps = "The brown bones"   # predict text

    tokz = lemmatizeTokens(st)
    print("Tokens len: {}\n{}\n".format(len(tokz), tokz))

    dset = ZDataset()
    dset.initFromSeq(ds)
    dset.preprocess()
    Return:
'''
def writeTo(content, dpath, dtype=zFILE, mode=MODE_WRITE):
    res = STREAMZ.get(dtype, doFile)
    zlogger.log("dataSource.writeTo", "dpath = {}".format(dpath))
    res(dpath, mode=mode, content=content)


if __name__ == "__main__":
    zlogger.log("dataSource.main", "Starting")

    arange = 'FAQ responses!A1:G1000'
    gsheet_id = '1EuvcPe9WXSQTsmSqhq0LWJG4xz2ZRQ1FEdnQ_LQ-_Ks'   # covid_19_faq
    # gsheet_id = 'covid_19_faq'

    etype = ['Text File', 'PDF', 'Article', 'Site', 'Serialized', 'GSheet']
    etype_i = [zFILE, zPDF, zARTICLE, zNESTED_ARTICLES, zSERIALIZED, zGSHEET]
    epath = [
        'example.txt',
        'example.pdf',
        'https://www.nation.co.ke/counties/nairobi/Police-kill-ATM-heist-mastermind/1954174-5503356-aodphx/index.html',
        'https://www.standardmedia.co.ke/corporate/news',
        'example.byt',
        (gsheet_id, arange)
    ]
    econtent = [
        'The quick brown fox jumped over the lazy dogs.' * 7,
        None,
        None,
        None,
            elif word in self.EXIT_INPUT:
                response = random.choice(self.THANKS_RESPONSE) + ". " + random.choice(self.EXIT_RESPONSE)
                rcode = self.RCODE_EXIT_RESPONSE
                return response, rcode

        if was_que:
            response = self.model.predict(user_input_text)
            rcode = self.RCODE_LEARNT_RESPONSE

        return response, rcode


if __name__ == "__main__":
    zlogger.log("botLogic.main", "Starting")

    bot = BotLogicFlow()
    bot.initializeModel(BotLogicFlow.MODEL_TFIDF, "TFIDF_ChatBot.zmd")

    while 1:
        user_input = input(colored("Talk to me: ", "yellow"))
        prompt = colored(">>>: ", "green")
        response, rcode = bot.getResponse(user_input)
        print("{} {}\n".format(prompt, "I don't understand. Try that again" if response is None else response))
        if rcode == BotLogicFlow.RCODE_EXIT_RESPONSE:   # was the magic number -99
            break
import <app_pkg_name>
from <app_pkg_name>.bin import zlogger

app = <app_pkg_name>.init_app()
zlogger.startLogger("<app_pkg_name>")

if __name__ == "__main__":
    '''
    TODO: populate dummy data, setup zlogger
    '''
    zlogger.log("run.py", f"starting {__name__}")
    app.run(debug=True)