def scrape_url(
        self,
        url,
        parser='html.parser',
        tag_to_find='p',
):
    """
    Fetch a web page and collect the sentences found in the requested tags.

    The text of every ``tag_to_find`` element is trimmed and split on the
    ideographic full stop '。'. Non-empty pieces are trimmed and appended to
    the result.

    :param url:         address passed to requests.get()
    :param parser:      parser name passed to BeautifulSoup
    :param tag_to_find: tag name passed to find_all()
    :return:            list of sentence strings. When any exception occurs
                        it is logged and the method falls through, returning
                        None implicitly.
    """
    try:
        response = requests.get(url=url)
        page = BeautifulSoup(response.content, parser)
        sentences = []
        for element in page.find_all(tag_to_find):
            txt = StringUtils.trim(element.get_text())
            # Empty pieces are filtered out before trimming, not after
            pieces = [StringUtils.trim(piece) for piece in txt.split('。') if piece]
            if not pieces:
                continue
            sentences += pieces
            Log.debug('Split "' + str(txt) + '" into:' + str(pieces))
        return sentences
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error scraping url "' + str(url) + '", exception: ' + str(ex)
        )
def verify_totp_style(
        self,
        # We test for <tolerance_secs> back
        tolerance_secs=30
):
    """
    Verify the test challenge against TOTP-style challenges of recent seconds.

    A challenge response is computed for "now" and for each whole second
    back, up to ``tolerance_secs - 1`` seconds, and compared with the test
    challenge. The first successful comparison ends the search.

    :param tolerance_secs: number of one-second steps to try
    :return: the result of the first comparison that equals True, otherwise
             False. Any exception is logged and also yields False.
    """
    now = datetime.now()
    try:
        for secs_back in range(tolerance_secs):
            t_test = now - timedelta(seconds=secs_back)
            Log.debugdebug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Trying ' + str(t_test.strftime('%Y-%m-%d %H:%M:%S'))
            )
            test_challenge_calc = AccessTokenSharedsecretChallenge.create_totp_style_challenge_response(
                shared_secret=self.shared_secret,
                datetime_val=t_test,
                algo_hash=self.algo_hash
            )
            res = self.__compare_test_challenge(
                test_challenge_calc=test_challenge_calc
            )
            # Kept as an equality test so the accepted values are unchanged
            if res == True:
                return res
        return False
    except Exception as ex:
        # The shared secret is deliberately not written to the log
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Exception for totp style test challenge "'
            + str(self.test_challenge) + '": ' + str(ex)
        )
        return False
def __pre_process_training_data(
        self
):
    """
    Make sure self.training_data is loaded, correctly typed and filtered.

    If the training data is not flagged as ready it is fetched from
    self.training_data_source. The data must be exactly a
    tdm.TrainingDataModel. When self.y_id is set, the data is filtered down
    to that single y/label ID.

    :raises Exception: if fetching fails or the data has the wrong type
    """
    if not self.is_training_data_ready:
        source = self.training_data_source
        try:
            self.training_data = source.fetch_data()
        except Exception as ex:
            message = (
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Exception calling external object type "' + str(type(self.training_data_source))
                + '" method fetch_data(), exception msg: ' + str(ex)
            )
            Log.error(message)
            raise Exception(message)

    data_type = type(self.training_data)
    if data_type is not tdm.TrainingDataModel:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': "' + str(self.identifier_string)
            + '": Wrong training data type "' + str(data_type) + '".'
        )

    # A single y/label ID is trained regardless of train mode
    if self.y_id is None:
        return
    self.training_data.filter_by_y_id(y_id=self.y_id)
    return
def send(self, user, password, recipients_list, message):
    """
    Send a prepared message through self.server and close the connection.

    Login is only attempted when a non-empty password is given. On success
    the server connection is closed. On any failure the connection is also
    closed (best effort) before the exception is re-raised, so a failed
    login or send no longer leaves the connection open.

    :param user:            login name, also used as the from-address
    :param password:        login password; None or '' skips the login
    :param recipients_list: recipient addresses passed to sendmail()
    :param message:         message passed to sendmail()
    :raises Exception: if login, sending or closing fails
    """
    try:
        if password not in [None, '']:
            self.server.login(user=user, password=password)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Login for user "' + str(user) + '" successful.'
            )
        else:
            # If no password passed in, no need to do login
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not doing login for user "' + str(user)
                + '", no password given "' + str(password) + '"'
            )
        self.server.sendmail(from_addr=user, to_addrs=recipients_list, msg=message)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Message from ' + str(user) + ' to ' + str(recipients_list)
            + ' sent successfully. Closing server..'
        )
        self.server.close()
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Mail server "' + str(self.mail_server_url) + '" closed'
        )
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                 + ': Exception sending mail from ' + str(user) + ' to ' + str(recipients_list)\
                 + '. Got exception ' + str(ex) + '.'
        Log.error(errmsg)
        # Release the connection on the failure path too. A failure to
        # close must not hide the original error, so it is only logged.
        try:
            self.server.close()
        except Exception as ex_close:
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Could not close mail server "' + str(self.mail_server_url)
                + '" after failure: ' + str(ex_close)
            )
        raise Exception(errmsg) from ex
def decode(self, ciphertext):
    """
    Decrypt a base64 encoded ciphertext string with AES.

    Supports AES.MODE_EAX and AES.MODE_CBC. In CBC mode the last byte of
    the decrypted data holds the padding length, and that many trailing
    bytes are removed.

    NOTE(review): in EAX mode the data is decrypted without verifying an
    authentication tag (none is passed in) - confirm this is intended.

    :param ciphertext: base64 text, encoded with self.text_encoding before
                       base64 decoding
    :return: decrypted data as a string decoded with STR_ENCODING
    :raises Exception: on unsupported mode, invalid CBC padding or any
                       decryption/decoding error
    """
    try:
        if self.cipher_mode == AES.MODE_EAX:
            cipher = AES.new(key=self.key, mode=self.cipher_mode, nonce=self.nonce)
            cipherbytes = b64decode(ciphertext.encode(self.text_encoding))
            data = cipher.decrypt(cipherbytes)
        elif self.cipher_mode == AES.MODE_CBC:
            cipher = AES.new(key=self.key, mode=self.cipher_mode, iv=self.nonce)
            cipherbytes = b64decode(ciphertext.encode(self.text_encoding))
            data = cipher.decrypt(cipherbytes)
            # Fix: the logged value is now really the length modulo 16
            Log.debugdebug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Decrypted data length = ' + str(len(data))
                + ', modulo 16 = ' + str(len(data) % 16)
            )
            # Remove last x bytes encoded in the padded bytes
            pad_length = data[-1]
            # A pad byte of 0 would make data[:-0] silently return nothing,
            # and one longer than the data cannot be valid padding either
            if pad_length < 1 or pad_length > len(data):
                raise Exception(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Invalid padding length ' + str(pad_length) + '.'
                )
            data = data[:-pad_length]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Unsupported mode "' + str(self.cipher_mode) + '".'
            )
        return str(data, encoding=STR_ENCODING)
    except Exception as ex:
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error decoding data "' + str(ciphertext) + '" using AES ". Exception: ' + str(ex)
        Log.error(errmsg)
        raise Exception(errmsg) from ex
def prepare_message(from_addr, to_addrs_list, subject, text, files=None):
    """
    Build a multipart email with optional attachments, returned as a string.

    :param from_addr:     value of the From header
    :param to_addrs_list: recipients, joined with SendMail.COMMASPACE
    :param subject:       value of the Subject header
    :param text:          body text, attached as MIMEText
    :param files:         optional list of file paths; only those accepted
                          by the validity/size check are attached
    :return: the message serialized with as_string()
    :raises Exception: if anything fails while building the message
    """
    try:
        msg = MIMEMultipart()
        headers = (
            ('From', from_addr),
            ('To', SendMail.COMMASPACE.join(to_addrs_list)),
            ('Date', formatdate(localtime=True)),
            ('Subject', subject),
        )
        for header_name, header_value in headers:
            msg[header_name] = header_value
        msg.attach(MIMEText(text))

        files_allowed = SendMail.__attach_file_check_validity_and_size(
            files_attachment_list=files,
            max_total_files_size=SendMail.MAX_TOTAL_FILES_SIZE_MB_EMAIL_ATTCH
        )
        for attachment_path in files_allowed or []:
            attachment_name = os.path.basename(attachment_path)
            with open(attachment_path, "rb") as attachment_file:
                part = MIMEApplication(attachment_file.read(), Name=attachment_name)
            # The header is set after the file is closed
            part['Content-Disposition'] = 'attachment; filename="%s"' % attachment_name
            msg.attach(part)
        return msg.as_string()
    except Exception as ex:
        message = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                  + ': Error creating email message: ' + str(ex)
        Log.error(message)
        raise Exception(message)
def get_model_file_prefix(dir_path_model, model_name, identifier_string, is_partial_training):
    """
    Return the model path prefix, creating it as a directory when needed.

    The path is "<dir_path_model>/<model_name>.<identifier_string>". For
    partial training it is used as a directory and is created if it does
    not exist yet. Otherwise it is only a file path prefix.

    :return: the prefix/directory path
    :raises Exception: if the directory cannot be created
    """
    # Prefix or dir
    prefix_or_dir = dir_path_model + '/' + model_name + '.' + identifier_string

    if not is_partial_training:
        Log.important(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Using path prefix "' + str(prefix_or_dir) + '"'
        )
        return prefix_or_dir

    # Partial training: nothing to do when the directory is already there
    if os.path.isdir(prefix_or_dir):
        return prefix_or_dir

    Log.important(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Path "' + str(prefix_or_dir) + '" does not exist. Trying to create this directory...'
    )
    try:
        os.mkdir(path=prefix_or_dir)
        Log.important(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Path "' + str(prefix_or_dir) + '" successfully created.'
        )
    except Exception as ex:
        message = \
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
            + ': Error creating directory "' + str(prefix_or_dir) + '". Exception ' + str(ex) + '.'
        Log.error(message)
        raise Exception(message)
    return prefix_or_dir
def read_text_file(
        filepath,
        encoding='utf-8',
        throw_exception=False,
):
    """
    Read a text file and return its lines, line endings included.

    :param filepath:        path of the file to read
    :param encoding:        text encoding used to open the file
    :param throw_exception: when True, a failure to open the file raises an
                            Exception; when False it is logged and an empty
                            list is returned
    :return: list of lines as read from the file (newlines are kept)
    :raises Exception: if the file cannot be opened and throw_exception is
                       True. Errors raised while reading (e.g. a decoding
                       error) propagate unchanged.
    """
    try:
        fh = open(filepath, 'r', encoding=encoding)
    except IOError as e:
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Cannot open file [' + str(filepath) + ']. ' + str(e)
        Log.error(errmsg)
        if throw_exception:
            raise Exception(errmsg) from e
        return []

    # The context manager closes the handle even if reading raises, which
    # the former explicit close() after the loop did not
    with fh:
        return list(fh)
def process_common_words(self, word_split_token=' '):
    """
    Turn self.raw_words into the sorted, de-duplicated self.common_words.

    The raw text is trimmed, has non-breaking spaces, tabs and line breaks
    replaced by the split token, and is lower-cased. It is then split on
    the token, empty items are dropped, the word stems returned by
    add_word_stems() (if any) are added, and the result is stored as a
    sorted list of unique words.

    :param word_split_token: separator used both as the replacement for
                             whitespace characters and for splitting
    :raises Exception: if either processing stage fails
    """
    try:
        self.raw_words = StringUtils.trim(self.raw_words)
        self.raw_words = re.sub(
            pattern='[\xa0\t\n\r]',
            repl=word_split_token,
            string=self.raw_words
        )
        self.raw_words = self.raw_words.lower()
    except Exception as ex:
        message = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                  + ': Error processing raw words. Exception: ' + str(ex)
        Log.error(message)
        raise Exception(message)

    try:
        # Split and remove None, '', {}, etc. in one go
        self.common_words = [
            word for word in self.raw_words.split(word_split_token) if word
        ]

        stems = self.add_word_stems()
        if stems:
            self.common_words = stems + self.common_words

        self.common_words = sorted(set(self.common_words))
        Log.info(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Loaded ' + str(len(self.common_words)) + ' common words of lang "' + str(self.lang) + '".'
        )
    except Exception as ex:
        message = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                  + ': Error processing common words. Exception: ' + str(ex)
        Log.error(message)
        raise Exception(message)
    return
def import_form_fields(
        list_json,
        mex_form_model
):
    """
    Build form field objects from field JSON objects and their mex expressions.

    Each JSON field gets its trimmed mex expression stored under
    ffld.FormField.KEY_MEX_EXPR (the passed-in object is modified in place)
    and is then imported with ffld.FormField.import_form_field().

    :param list_json:      field JSON objects
    :param mex_form_model: mex expressions, one per field, in the same order
    :return: list of imported form fields
    :raises Exception: if the two inputs differ in length or a field cannot
                       be imported
    """
    if len(list_json) != len(mex_form_model):
        raise Exception(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': List of fields must be same length with mex expr list.'
            + ' Fields: ' + str(list_json) + ', Mex Expr List: ' + str(mex_form_model)
        )

    imported_fields = []
    for json_field, mex_expr in zip(list_json, mex_form_model):
        json_field[ffld.FormField.KEY_MEX_EXPR] = StringUtils.trim(mex_expr)
        try:
            field = ffld.FormField.import_form_field(json_obj=json_field)
            imported_fields.append(field)
        except Exception as ex_field:
            message = \
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                + ': Error importing field: ' + str(json_field) \
                + '. Exception: ' + str(ex_field)
            Log.error(message)
            raise Exception(message)
    return imported_fields
def __recognize_file(self):
    """
    Run speech recognition on self.audio_file with the configured engine.

    Files whose extension is not "wav" are first converted with
    AudioUtils().convert_format().

    :return: the recognized text. When recognition fails (including an
             unsupported engine) the error is logged and the method falls
             through, returning None implicitly.
    """
    file_extension = re.sub(
        pattern='(.*[.])([a-zA-Z0-9]+$)',
        repl='\\2',
        string=self.audio_file
    ).lower()

    wav_filepath = self.audio_file
    if file_extension != 'wav':
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converting "' + str(self.audio_file) + '" to wav format..'
        )
        wav_filepath = AudioUtils().convert_format(filepath=self.audio_file)

    # Recognizer object that performs the speech recognition
    recognizer = sr.Recognizer()

    # Read the audio file as the source and capture its content
    with sr.AudioFile(wav_filepath) as source:
        audio = recognizer.listen(source)

    # The recognize_*() methods throw a request error if the API is
    # unreachable, hence the exception handling
    try:
        engine = self.engine
        if engine == SpeechRecognition.ENGINE_GOOGLE:
            text = recognizer.recognize_google(audio, language=self.lang)
        elif engine == SpeechRecognition.ENGINE_GOOGLE_CLOUD:
            text = recognizer.recognize_google_cloud(
                audio,
                credentials_json=self.auth_info,
                language=self.lang
            )
        elif engine == SpeechRecognition.ENGINE_BING:
            text = recognizer.recognize_bing(audio, key=self.auth_info, language=self.lang)
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Unsuported engine "' + str(self.engine) + '".'
            )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converting audio transcripts into text ...'
        )
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Recognized "' + str(self.lang) + '" text "' + str(text)
            + '" from audio file "' + str(self.audio_file) + '"'
        )
        return text
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Exception converting audio transcript from "' + str(self.audio_file)
            + '": ' + str(ex)
        )
def convert_to_simplified_chinese(self, text):
    """
    Convert text to simplified Chinese with hzc.HanziConv.toSimplified().

    :param text: text to convert
    :return: the converted text, or the unchanged input when the conversion
             raises (the error is logged)
    """
    try:
        return hzc.HanziConv.toSimplified(text)
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error converting "' + str(text) + '" to simplified chinese. Exception ' + str(ex) + '.'
        )
    # Only reached after a failed conversion
    return text
def preprocess_training_data(self):
    """
    Fetch, preprocess and convert the training data, then validate it.

    If the training data is not flagged as ready, the external source must
    return two values from fetch_and_preprocess_data(): a DataFrame of
    preprocessed training data and the embedding layer params. These are
    converted into self.training_data. The data must be exactly a
    tdm.TrainingDataModel. When self.y_id is set, the data is filtered down
    to that single y/label ID.

    :raises Exception: if fetching/converting fails or the data has the
                       wrong type
    """
    if not self.is_training_data_ready:
        try:
            # The external interface passes back 2 values: the preprocessed
            # training DataFrame and the Embedding Layer params
            fetched = self.training_data_source.fetch_and_preprocess_data()
            self.df_training_data_pp, self.embedding_params = fetched

            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Successfully preprocessed training data. Max label val = '
                + str(self.embedding_params.max_label_val)
                + ', max sentence length = ' + str(self.embedding_params.max_sent_len)
                + ', vocabulary size = ' + str(self.embedding_params.vocab_size)
                + ', x one hot dict: ' + str(self.embedding_params.x_one_hot_dict)
            )

            self.training_data = TextTrainer.convert_preprocessed_text_to_training_data_model(
                model_name=self.model_name,
                training_dataframe=self.df_training_data_pp,
                embedding_x=self.embedding_params.x,
                embedding_y=self.embedding_params.y,
                embedding_x_one_hot_dict=self.embedding_params.x_one_hot_dict,
                embedding_y_one_hot_dict=self.embedding_params.y_one_hot_dict,
                word_freq_model=self.word_freq_model,
            )
        except Exception as ex:
            message = \
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                + ': Exception calling external object type "' + str(type(self.training_data_source)) \
                + '" method fetch_and_preprocess_data(), exception msg: ' + str(ex)
            Log.error(message)
            raise Exception(message)

    data_type = type(self.training_data)
    if data_type is not tdm.TrainingDataModel:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': "' + str(self.identifier_string)
            + '": Wrong training data type "' + str(data_type) + '".'
        )

    # A single y/label ID is trained regardless of train mode
    if self.y_id is None:
        return
    self.training_data.filter_by_y_id(y_id=self.y_id)
    return
def __init__(self, noun_case_endings=NOUN_PARTICLES, verb_case_endings=()):
    """
    Initialize the parent class and check that the jamo library is importable.

    :param noun_case_endings: passed on to the parent constructor
    :param verb_case_endings: passed on to the parent constructor
    :raises Exception: if the jamo library cannot be imported
    """
    super().__init__(
        noun_case_endings=noun_case_endings,
        verb_case_endings=verb_case_endings
    )
    try:
        # Split Hangul (한글) syllables into letters (자모)
        # https://github.com/JDongian/python-jamo, https://python-jamo.readthedocs.io/en/latest/
        from jamo import h2j, j2hcj
    except Exception as ex:
        message = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': Error importing jamo library: ' + str(ex)
        Log.error(message)
        raise Exception(message)
def __send_email(self, text_subject, text_msg, files, ignore_limit):
    """
    Send (or fake-send) an alert email, subject to an hourly send limit.

    The per-hour counter is reset whenever the clock hour differs from the
    stored one. Unless ignore_limit is set, nothing is sent once the
    counter has reached self.limit_per_hour. Errors while sending are
    logged, not raised. Errors while preparing the message are raised by
    SendMail.prepare_message().

    :param text_subject: email subject
    :param text_msg:     email body text
    :param files:        attachment file paths passed to prepare_message()
    :param ignore_limit: when true, the hourly limit is not enforced
    """
    email_msg = SendMail.prepare_message(
        from_addr=self.from_addr,
        to_addrs_list=self.alert_recipients,
        subject=text_subject,
        text=text_msg,
        files=files
    )
    try:
        # Start a fresh count when a new clock hour has begun
        if datetime.now().hour != self.current_hour:
            self.current_hour = datetime.now().hour
            self.emails_sent_this_hour = 0

        if ignore_limit:
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Ignoring send limit of ' + str(self.limit_per_hour) + ' per hour.'
            )
        elif self.emails_sent_this_hour >= self.limit_per_hour:
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Send email alert limit ' + str(self.limit_per_hour)
                + ' per hour hit. Not sending subject: "' + str(text_subject)
                + '", message: ' + str(text_msg)
            )
            return

        if not self.fake_send:
            mailer = SendMail(
                mode=self.mail_mode,
                mail_server_url=self.mail_server_url,
                mail_server_port=self.mail_server_port
            )
            mailer.send(
                user=self.from_addr,
                password=self.password,
                recipients_list=self.alert_recipients,
                message=email_msg
            )
        else:
            print(
                'Fake send email from "' + str(self.from_addr) + '" to: '
                + str(self.alert_recipients) + ' Message:\n\r' + str(email_msg)
            )
        # Fake sends count towards the hourly limit as well
        self.emails_sent_this_hour += 1
    except Exception as ex_mail:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error sending email: ' + str(ex_mail)
            + '. Could not send message: ' + str(email_msg)
        )
def encode(
        self,
        # bytes format
        data
):
    """
    Encrypt bytes with AES in EAX or CBC mode.

    In CBC mode the data is padded up to the block size with 1 to
    block-size bytes, each holding the padding length, so that decryption
    can read the padding length from the last byte.

    NOTE(review): the error message below embeds the (possibly padded)
    plaintext - confirm that this is acceptable for the log destination.

    :param data: plaintext in bytes format
    :return: AES_Encrypt.EncryptRetClass holding the base64 encoded
             ciphertext, nonce and (EAX only) tag
    :raises Exception: on unsupported mode or any encryption error
    """
    try:
        mode = self.cipher_mode
        if mode == AES.MODE_EAX:
            cipher = AES.new(key=self.key, mode=self.cipher_mode, nonce=self.nonce)
            cipherbytes, tag = cipher.encrypt_and_digest(data)
            return AES_Encrypt.EncryptRetClass(
                cipher_mode=self.cipher_mode_str,
                ciphertext_b64=b64encode(cipherbytes).decode(self.text_encoding),
                plaintext_b64=None,
                tag_b64=b64encode(tag).decode(self.text_encoding),
                nonce_b64=b64encode(self.nonce).decode(self.text_encoding)
            )

        if mode == AES.MODE_CBC:
            block_size = AES_Encrypt.DEFAULT_BLOCK_SIZE_AES_CBC
            # 1-16, never 0, otherwise the last byte would not hold the
            # padding length
            length = block_size - (len(data) % block_size)
            # Every pad byte carries the padding length, so decryption can
            # simply take data[-1] as the number of bytes to strip
            data += bytes(chr(length), encoding=STR_ENCODING) * length
            Log.debugdebug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Padded length = ' + str(length)
            )
            cipher = AES.new(key=self.key, mode=self.cipher_mode, iv=self.nonce)
            cipherbytes = cipher.encrypt(data)
            return AES_Encrypt.EncryptRetClass(
                cipher_mode=self.cipher_mode_str,
                ciphertext_b64=b64encode(cipherbytes).decode(self.text_encoding),
                plaintext_b64=None,
                tag_b64=None,
                nonce_b64=b64encode(self.nonce).decode(self.text_encoding)
            )

        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unsupported mode "' + str(self.cipher_mode) + '".'
        )
    except Exception as ex:
        message = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': Error encoding data "' + str(data) + '" using AES ". Exception: ' + str(ex)
        Log.error(message)
        raise Exception(message)
def verify(self):
    """
    Verify the test challenge against one computed from the shared secret.

    A test challenge string is created from self.shared_secret,
    self.challenge and self.algo_hash, and compared with the stored test
    challenge.

    :return: the comparison result; False when any exception occurs (it is
             logged, without the shared secret)
    """
    try:
        test_challenge_calc = AccessTokenSharedsecretChallenge.create_test_challenge_string(
            shared_secret=self.shared_secret,
            challenge_string=self.challenge,
            algo_hash=self.algo_hash
        )
        return self.__compare_test_challenge(
            test_challenge_calc=test_challenge_calc
        )
    except Exception as ex:
        # The shared secret is deliberately not written to the log
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Exception verifying challenge "' + str(self.challenge) + '": ' + str(ex)
        )
        return False
def stop_model_thread(self):
    """
    Ask the model's background job to stop by setting its stop request.

    Any exception is logged and swallowed.
    """
    try:
        Log.info(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno)
            + ': "' + str(self.identifier_string) + '" Stopping model background job..'
        )
        # Signals the background job through its stop request
        self.model.stoprequest.set()
    except Exception as ex:
        Log.error(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno)
            + ': "' + str(self.identifier_string)
            + '" Stop model background job exception: ' + str(ex)
        )
def verify_totp_otp(self, valid_window=1):
    """
    Verify self.test_challenge as a TOTP one-time password using pyotp.

    The shared secret is padded to a multiple of 8 characters by repeating
    its last character before it is handed to pyotp.TOTP.

    :param valid_window: passed to pyotp's verify() as valid_window
    :return: result of the verification; False when any exception occurs
             (it is logged)
    """
    try:
        import pyotp
        secret = str(self.shared_secret)
        # Number of characters missing to the next multiple of 8 (0..7)
        missing = -len(secret) % 8
        padded_secret = secret + secret[-1] * missing
        totp = pyotp.TOTP(padded_secret)
        return totp.verify(otp=self.test_challenge, valid_window=valid_window)
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error TOTP authentication, exception: ' + str(ex)
        )
        return False
def segment_ko_ja(
        self,
        text,
        return_array_of_split_words = False
):
    """
    Segment Japanese or Korean text into words with an external library.

    Japanese uses nagisa.tagging(). Korean uses self.kkma.pos() after
    calling self.warn_korean().

    :param text: text to segment
    :param return_array_of_split_words: when True the list of words is
           returned, otherwise the words joined with the language's word
           separator
    :return: list of words, or the joined string
    :raises Exception: for any other language or any segmentation error
    """
    try:
        if self.lang in (lf.LangFeatures.LANG_JA,):
            words_postags = nagisa.tagging(text)
            txt_sym_tok = words_postags.words
            txt_sym_postags = words_postags.postags
            debug_intro = ': Japanese segmentation "'
        elif self.lang in (lf.LangFeatures.LANG_KO,):
            self.warn_korean()
            words_postags = self.kkma.pos(phrase=text)
            txt_sym_tok = [word_postag[0] for word_postag in words_postags]
            txt_sym_postags = [word_postag[1] for word_postag in words_postags]
            debug_intro = ': Korean segmentation "'
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': No external library supported for language "' + str(self.lang) + '"'
            )

        # From here on both languages are handled the same way
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + debug_intro + str(txt_sym_tok)
            + '", word & POS tags: ' + str(words_postags)
        )
        if return_array_of_split_words:
            return txt_sym_tok
        separator = BasicPreprocessor.get_word_separator(lang=self.lang)
        return separator.join(txt_sym_tok)
    except Exception as ex:
        message = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': Error segmenting lang "' + str(self.lang) + '", text "' + str(text) \
                  + '", exception: ' + str(ex)
        Log.error(message)
        raise Exception(message)
def __init__(self, lang=LangFeatures.LANG_EN):
    """
    Store the ISO 639-1 language code and download the NLTK comtrans corpus.

    The SSL check is disabled through Ssl.disable_ssl_check() before the
    download is attempted.

    :param lang: language code, mapped with map_to_lang_code_iso639_1()
    :raises Exception: if the download raises or returns a false value
    """
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    Ssl.disable_ssl_check()
    try:
        download_ok = nltk.download(Corpora.NLTK_COMTRANS)
        if not download_ok:
            raise Exception('Download "' + str(Corpora.NLTK_COMTRANS) + '" returned False')
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': NLTK download of "' + Corpora.NLTK_COMTRANS + '" OK.'
        )
    except Exception as ex:
        message = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': NLTK download of "' + str(Corpora.NLTK_COMTRANS) + '" exception: ' \
                  + str(ex) + '.'
        Log.error(message)
        raise Exception(message)
def __attach_file_check_validity_and_size(
        files_attachment_list,
        max_total_files_size=MAX_TOTAL_FILES_SIZE_MB_EMAIL_ATTCH
):
    """
    Select the attachment files that exist and fit into the total size limit.

    Files are considered in the given order. A file is accepted when it is
    an existing regular file and its size (in MB, rounded to 2 decimals)
    plus the size accepted so far stays below the limit. A file that is too
    big is skipped, and later, smaller files may still be accepted.

    :param files_attachment_list: file paths to check, or None
    :param max_total_files_size:  total size limit in MB
    :return: list of accepted file paths ([] when the input is None)
    """
    if files_attachment_list is None:
        return []

    accepted_files = []
    total_mb = 0.0
    for filepath in files_attachment_list:
        if not os.path.isfile(filepath):
            Log.error('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                      + ': Invalid attachment file "' + str(filepath) + '", not attaching to email')
            continue
        Log.info('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                 + ': Attachment file path "' + str(filepath) + '" OK')

        fsize_mb = round(os.path.getsize(filepath) / (1024 * 1024), 2)
        if fsize_mb + total_mb >= max_total_files_size:
            Log.warning('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                        + ': File "' + str(filepath) + '" too big ' + str(fsize_mb)
                        + 'MB. Cumulative = ' + str(fsize_mb + total_mb)
                        + ' Not attaching to email')
            continue

        accepted_files.append(filepath)
        total_mb += fsize_mb
        Log.info('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                 + ': Appended file "' + str(filepath)
                 + '" as email attachment size ' + str(fsize_mb)
                 + 'MB, total cumulative ' + str(total_mb) + 'MB')
    return accepted_files
def __recognize_mic(self):
    """
    Record from the microphone and recognize the speech with Google.

    :return: the recognized text. When recognition fails the error is
             logged and the method falls through, returning None implicitly.
    """
    # Recognizer object that performs the speech recognition
    recognizer = sr.Recognizer()

    with sr.Microphone() as mic:
        print('Start talking')
        audio = recognizer.listen(mic)
        print('Done')

    try:
        # Google speech recognition
        text = recognizer.recognize_google(audio, language=self.lang)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Recognized "' + str(self.lang) + '" text "' + str(text)
            + '" from mic "' + str(self.audio_file) + '"'
        )
        return text
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Exception: ' + str(ex)
        )
def hash(string, algo=ALGO_SHA1):
    """
    Return the hex digest of a string for the chosen hash algorithm.

    NOTE(review): the error message includes the string being hashed -
    confirm that callers never pass secrets here, or drop it from the log.

    :param string: text to hash; encoded with Hash.STR_ENCODING first (an
                   encoding failure happens outside the try and is raised
                   to the caller)
    :param algo:   one of Hash.ALGO_SHA1, ALGO_SHA256, ALGO_SHA512,
                   ALGO_SHA3_256, ALGO_SHA3_512
    :return: hex digest string, or None when the algorithm is unsupported
             or hashing fails (the error is logged)
    """
    str_encode = string.encode(encoding=Hash.STR_ENCODING)
    try:
        # Algorithm constant -> hashlib constructor
        hash_constructors = {
            Hash.ALGO_SHA1: hashlib.sha1,
            Hash.ALGO_SHA256: hashlib.sha256,
            Hash.ALGO_SHA512: hashlib.sha512,
            Hash.ALGO_SHA3_256: hashlib.sha3_256,
            Hash.ALGO_SHA3_512: hashlib.sha3_512,
        }
        constructor = hash_constructors.get(algo)
        if constructor is None:
            raise Exception('Unsupported hash algo "' + str(algo) + '".')
        return constructor(str_encode).hexdigest()
    except Exception as ex:
        # Imported locally because the original block did not use these names
        from inspect import currentframe, getframeinfo
        # Fix: the empty str() is replaced by the line number and ': '
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error hashing string "' + str(string) + '" using algo "' + str(algo)\
                 + '". Exception: ' + str(ex)
        Log.error(errmsg)
        return None
def retrieve_corpora(self, corpora_name):
    """
    Load aligned sentence pairs from the comtrans corpus.

    The pairs are filtered with filter_pair_sentence_length() using a
    maximum length of 20.

    :param corpora_name: name passed to comtrans.aligned_sents()
    :return: tuple (sentences of language 1, sentences of language 2)
    :raises Exception: if the aligned sentences cannot be loaded
    """
    try:
        aligned = comtrans.aligned_sents(corpora_name)
    except Exception as ex:
        message = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': Comtrans download of corpora "' + str(corpora_name) + '" exception: ' \
                  + str(ex) + '.'
        Log.error(message)
        raise Exception(message)

    # .words and .mots hold the two sides of every aligned pair
    sentences_l1 = [pair.words for pair in aligned]
    sentences_l2 = [pair.mots for pair in aligned]
    Log.info('Sentences length = ' + str(len(sentences_l1)))

    # Filter length
    filtered = self.filter_pair_sentence_length(
        sentences_arr_l1=sentences_l1,
        sentences_arr_l2=sentences_l2,
        max_len=20
    )
    (sentences_l1, sentences_l2) = filtered
    Log.info('Sentences length after filtering = ' + str(len(sentences_l1)))
    assert len(sentences_l1) == len(sentences_l2)
    return (sentences_l1, sentences_l2)
def convert_datetime_to_number(
        x,
        datetime_format,
        round_to_integer = False,
        # If relative date is given, total days from this date is returned
        relative_date = None
):
    """
    Convert a datetime, or a datetime string, to a number.

    Strings are parsed with ``datetime_format`` first; any other value is
    handed to DataPreprocessor.date_to_number() as it is.

    :param x:                datetime object or datetime string
    :param datetime_format:  strptime format used when x is a string
    :param round_to_integer: passed on to date_to_number()
    :param relative_date:    passed on to date_to_number()
    :return: the number from date_to_number(), or 0 when any exception
             occurs (it is logged)
    """
    try:
        # Exact type check: only plain str values are parsed
        dtime = datetime.strptime(str(x), datetime_format) if type(x) is str else x
        return DataPreprocessor.date_to_number(
            x = dtime,
            round_to_integer = round_to_integer,
            relative_date = relative_date
        )
    except Exception as ex:
        Log.error(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error converting "' + str(x) + '" type "' + str(type(x)) + '": ' + str(ex)
        )
        return 0
def persist_model_to_storage(
        self,
        network = None
):
    """
    Save the network, the one hot dictionaries and the "last updated" file.

    The "last updated" file is written last, so that applications watching
    it only see the update after everything else has been saved.

    :param network: network to save. When None (the default) self.network
                    is saved. Previously this argument was ignored and
                    self.network was always saved.
    :raises Exception: if any part of the save fails
    """
    try:
        network_to_save = self.network if network is None else network

        #
        # From Python 3.8 the Keras save target needs to be a directory
        #
        if sys.version_info[:2] >= (3, 8):
            if not os.path.isdir(self.fpath_model):
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Path "' + str(self.fpath_model) + '" does not exist. Trying to create this directory...'
                )
                os.mkdir(path=self.fpath_model)

        network_to_save.save(self.fpath_model)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Saved network to file/directory "' + str(self.fpath_model) + '".'
        )

        if self.x_one_hot_dict:
            ModelInterface.safe_dataframe_write(
                df = pd.DataFrame({
                    'code': list(self.x_one_hot_dict.keys()),
                    'word': list(self.x_one_hot_dict.values()),
                }),
                name_df = 'x_one_hot_dict',
                include_index = True,
                index_label = 'INDEX',
                filepath = self.fpath_model_x_one_hot,
                log_training = self.logs_training
            )
        if self.y_one_hot_dict:
            ModelInterface.safe_dataframe_write(
                df = pd.DataFrame({
                    'code': list(self.y_one_hot_dict.keys()),
                    'label': list(self.y_one_hot_dict.values()),
                }),
                name_df = 'y_one_hot_dict',
                include_index = True,
                index_label = 'INDEX',
                filepath = self.fpath_model_y_one_hot,
                log_training = self.logs_training
            )

        # To allow applications to check if model updated
        # It is important to do it last (and fast), after everything is done
        ModelInterface.safe_file_write(
            dict_obj = {'timenow': str(datetime.now())},
            name_dict_obj = 'model last updated time',
            filepath = self.fpath_updated_file,
            write_as_json = False,
            log_training = self.logs_training
        )
        return
    except Exception as ex_save:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error saving model "' + str(self.identifier_string) + '": ' + str(ex_save)
        Log.error(errmsg)
        raise Exception(errmsg) from ex_save
def load_model_parameters(
        self
):
    """
    Load the network and its one hot dictionaries from storage.

    The whole load runs while holding self.mutex_training. The "last
    updated" file must exist, otherwise loading fails. A missing or
    unreadable x/y one hot dictionary file is not an error: it is logged
    and the dictionary is set to None.

    :raises Exception: if the "last updated" file is missing or the network
                       cannot be loaded
    """
    # Acquired before the try, so the finally below can never release a
    # lock that was not acquired
    self.mutex_training.acquire()
    try:
        # First check the existence of the files
        if not os.path.isfile(self.fpath_updated_file):
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Last update file "' + str(self.fpath_updated_file) \
                     + '" for model "' + str(self.identifier_string) + '" not found!'
            Log.error(errmsg)
            raise Exception(errmsg)

        self.network = load_model(self.fpath_model)

        try:
            df_x_one_hot_dict = pd.read_csv(
                filepath_or_buffer = self.fpath_model_x_one_hot,
                sep = ',',
                index_col = 'INDEX'
            )
            self.x_one_hot_dict = {code: word for code, word in df_x_one_hot_dict.values}
            # Form the inverse for convenience of transforming user input
            self.x_one_hot_dict_inverse = {word: code for code, word in self.x_one_hot_dict.items()}
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string)
                + '" x one hot dict loaded: ' + str(self.x_one_hot_dict)
            )
        except Exception as ex_x_one_hot_dict:
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string)
                + '" no x_one_hot_dict. ' + str(ex_x_one_hot_dict)
            )
            self.x_one_hot_dict = None

        try:
            df_y_one_hot_dict = pd.read_csv(
                filepath_or_buffer = self.fpath_model_y_one_hot,
                sep = ',',
                index_col = 'INDEX'
            )
            self.y_one_hot_dict = {code: lbl for code, lbl in df_y_one_hot_dict.values}
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string)
                + '" y one hot dict loaded: ' + str(self.y_one_hot_dict)
            )
        except Exception as ex_y_one_hot_dict:
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string)
                + '" no y_one_hot_dict. ' + str(ex_y_one_hot_dict)
            )
            self.y_one_hot_dict = None

        self.model_loaded = True
        self.model_updated_time = os.path.getmtime(self.fpath_updated_file)

        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model "' + str(self.identifier_string)
            + '" trained at "' + str(self.model_updated_time) + '" successfully loaded.'
        )
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                 + ': Model "' + str(self.identifier_string)\
                 + '" failed to load from file "' + str(self.fpath_model)\
                 + '". Got exception ' + str(ex) + '.'
        Log.error(errmsg)
        raise Exception(errmsg) from ex
    finally:
        self.mutex_training.release()
def train(
        self,
        write_model_to_storage = True,
        write_training_data_to_storage = False,
        # Option to train a single y ID/label
        y_id = None,
        # To keep training logs here for caller's reference
        log_list_to_populate = None,
        #
        # Transform train labels to categorical or not
        #
        convert_train_labels_to_categorical = True
):
    """
    Build, compile and fit the network on self.training_data.

    The steps run while holding self.mutex_training: create the network
    layers from self.model_params, compile, fit, then optionally persist
    the model and the training data. Whether the categorical (one-hot)
    labels or the raw labels are used for fitting is decided by
    self.model_params.require_label_to_categorical.

    NOTE(review): ``y_id`` and ``convert_train_labels_to_categorical`` are
    accepted but not read anywhere in this method.

    :param write_model_to_storage:         persist the model after fitting
    :param write_training_data_to_storage: persist the training data too
    :param log_list_to_populate:           list that receives training
                                           logs; a new list is used when
                                           this is not a list
    :raises Exception: if there is no training data, self.model_params has
                       the wrong type, or any training step fails
    """
    if self.training_data is None:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Cannot train without training data for identifier "' + self.identifier_string + '"'
        )

    if type(self.model_params) is not nwdesign.NetworkDesign:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Cannot train without network for identifier "' + self.identifier_string
            + '". Got wrong type "' + str(type(self.model_params))
        )

    self.mutex_training.acquire()
    try:
        self.model_loaded = False
        Log.info(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno)
            + ': Training for data, x shape ' + str(self.training_data.get_x().shape)
            + ', train labels with shape ' + str(self.training_data.get_y().shape)
        )

        # Use the caller's list for the training logs when one was given
        self.logs_training = log_list_to_populate if type(log_list_to_populate) is list else []

        x = self.training_data.get_x().copy()
        y = self.training_data.get_y().copy()
        self.x_one_hot_dict = self.training_data.get_x_one_hot_dict()
        # Form the inverse for convenience of transforming user input
        if type(self.x_one_hot_dict) is dict:
            self.x_one_hot_dict_inverse = {
                word: code for code, word in self.x_one_hot_dict.items()
            }
        self.y_one_hot_dict = self.training_data.get_y_one_hot_dict()

        # Convert labels to categorical one-hot encoding
        train_labels_categorical = to_categorical(y)

        n_labels = len(set(y.tolist()))
        Log.info(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno)
            + ': Total unique labels = ' + str(n_labels) + '.',
            log_list = self.logs_training
        )

        # Step 1: create the network layers
        try:
            self.network_layer_config = self.model_params.get_network_config()
            Log.info(
                str(self.__class__) + str(getframeinfo(currentframe()).lineno)
                + ': Start creating network layers from config: ' + str(self.network_layer_config)
            )
            network = self.model_params.get_network()
            Log.info(
                str(self.__class__) + str(getframeinfo(currentframe()).lineno)
                + ': Successfully created network layers from config: ' + str(self.network_layer_config)
            )
        except Exception as ex_layers:
            message = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                      + ': Error creating network layers for config: ' + str(self.network_layer_config) \
                      + '. Exception: ' + str(ex_layers)
            Log.error(
                s = message,
                log_list = self.logs_training
            )
            raise Exception(message)

        # Step 2: compile
        try:
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Start compiling network "' + str(self.identifier_string) + '"..'
            )
            network.compile(
                optimizer = self.train_optimizer,
                loss = self.train_loss,
                metrics = self.evaluate_metrics
            )
        except Exception as ex_compile:
            message = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                      + ': Error compiling network for config: ' + str(self.network_layer_config) \
                      + '. Exception: ' + str(ex_compile)
            Log.error(message)
            raise Exception(message)

        # Log model summary
        network.summary(print_fn=Log.info)

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + 'Categorical Train label shape "' + str(train_labels_categorical.shape)
            + '":\n\r' + str(train_labels_categorical)
        )

        # Step 3: fit
        try:
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Start fitting network "' + str(self.identifier_string) + '"..'
            )
            if self.model_params.require_label_to_categorical:
                train_labels = train_labels_categorical
            else:
                train_labels = y

            # batch_size is only passed on when one is configured
            fit_options = {'epochs': self.train_epochs}
            if self.train_batch_size is not None:
                fit_options['batch_size'] = self.train_batch_size
            network.fit(x, train_labels, **fit_options)

            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Successfully fitted network "' + str(self.identifier_string) + '"..'
            )
        except Exception as ex_fit:
            message = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                      + ': Error training/fitting network for config: ' + str(self.network_layer_config) \
                      + '. Exception: ' + str(ex_fit)
            Log.error(message)
            raise Exception(message)

        self.network = network

        if write_model_to_storage:
            self.persist_model_to_storage(network=network)
        if write_training_data_to_storage:
            self.persist_training_data_to_storage(td=self.training_data)

        self.model_loaded = True
    except Exception as ex_train:
        message = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': Train error for identifier "' + str(self.identifier_string)\
                  + '". Exception: ' + str(ex_train)
        Log.error(
            s = message,
            log_list = self.logs_training
        )
        raise Exception(message)
    finally:
        self.mutex_training.release()

    return
def safe_file_write(dict_obj, filepath, name_dict_obj=None, write_as_json=False, log_training=None, file_encoding='utf-8'):
    """
    Write an object to file in two stages: first to "<filepath>.tmp", then,
    once that write has succeeded, move any existing file at <filepath> to
    its backup path and move the tmp file into place.

    :param dict_obj:        object exposing keys() and item access by key. Written
                            with json.dump() if write_as_json is True, otherwise
                            one str(value) per key, each followed by '\\n\\r'.
    :param filepath:        final destination path
    :param name_dict_obj:   label for the object, used in log/error messages only
    :param write_as_json:   dump as JSON (indent=2) instead of line by line
    :param log_training:    passed to the Log calls as log_list
    :param file_encoding:   text encoding used to open the tmp file
    :return:                None
    :raises Exception:      if the tmp file cannot be written, or if the
                            backup/rename step fails. The original exception is
                            chained as the cause.
    """
    #
    # Write to tmp file first
    #
    filepath_tmp = str(filepath) + '.tmp'
    # We backup the previous model file just in case
    filepath_old = ModelInterface.get_backup_filepath(filepath=filepath)

    try:
        # Context manager guarantees the handle is closed even when the dump/write fails
        with open(file=filepath_tmp, mode='w', encoding=file_encoding) as f:
            if write_as_json:
                json.dump(dict_obj, f, indent=2)
            else:
                for key in dict_obj.keys():
                    # Line terminator kept exactly as before, existing readers may rely on it
                    f.write(str(dict_obj[key]) + '\n\r')
        Log.important(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': TMP File: Saved "' + str(name_dict_obj)
            + '" with ' + str(len(dict_obj.keys())) + ' lines,'
            + ' filepath "' + str(filepath_tmp) + '"',
            log_list=log_training
        )
    except Exception as ex:
        errmsg = \
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
            + ': TMP File: Could not create tmp "' + str(name_dict_obj) \
            + '" file "' + str(filepath_tmp) + '". ' + str(ex)
        Log.error(s=errmsg, log_list=log_training)
        raise Exception(errmsg) from ex

    #
    # TODO Now try to read it back
    #
    # Pause kept from the original implementation
    time.sleep(0.2)

    #
    # Finally rename the .tmp file
    #
    try:
        # If old model file exists, backup the file
        if os.path.isfile(filepath):
            # os.replace() overwrites an existing backup on every platform, whereas
            # os.rename() fails on Windows when the destination already exists
            os.replace(filepath, filepath_old)
            Log.important(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': BACKUP File: Successfully backed up old model "' + str(name_dict_obj)
                + '" to filepath "' + str(filepath_old) + '"',
                log_list=log_training
            )
        os.replace(filepath_tmp, filepath)
        Log.important(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': REAL File: Saved "' + str(name_dict_obj)
            + '" with ' + str(len(dict_obj.keys())) + ' lines,'
            + ' filepath "' + str(filepath) + '"',
            log_list=log_training
        )
    except Exception as ex:
        errmsg = \
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
            + ': REAL File: For object "' + str(name_dict_obj) \
            + '" could not rename tmp file "' + str(filepath_tmp) \
            + '" to file "' + str(filepath) \
            + '". ' + str(ex)
        Log.error(s=errmsg, log_list=log_training)
        raise Exception(errmsg) from ex