# NOTE: import block reconstructed from usage in the classes below; Logger,
# ExceptionsHandler, PathService, Configurator, Lemmatizer, DatabaseCursor,
# Singleton and ClassificationDataContainer are project-local modules, and the
# Qt widgets are assumed to come from PyQt5 (they may come from PySide2 instead).
import json
import os
import re
import sqlite3
from datetime import datetime
from threading import Thread

import chardet
import gensim
import joblib
import pymorphy2
import requests
import speech_recognition as sr
from PyQt5.QtWidgets import QFileDialog, QWidget


class FileReader(QWidget):
    def __init__(self):
        super().__init__()

        self.__logger = Logger()
        self.__file_dialog = QFileDialog()

        self.__logger.info('FileReader was successfully initialized.', __name__)

    def _detect_encoding(self, filename):
        with open(filename, 'rb') as byte_file:
            byte_string = byte_file.read()

        encoding = chardet.detect(byte_string)['encoding']
        self.__logger.info(f"File's encoding: {encoding}", __name__)

        return encoding

    def get_file_content(self):
        try:
            filename = self.__file_dialog.getOpenFileName(self, 'Open file', '/home')[0]
            self.__logger.info(f'Filename: {filename}', __name__)

            if filename:
                with open(filename, 'r', encoding=self._detect_encoding(filename)) as file:
                    return file.read()
        except Exception as exception:
            self.__logger.error(str(exception), __name__)
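
# Usage sketch for FileReader (hypothetical; a QFileDialog needs a running
# QApplication, so the widget can only be used inside a Qt event loop):
#
#     app = QApplication(sys.argv)
#     reader = FileReader()
#     content = reader.get_file_content()  # None if the dialog is cancelled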
class SpeechRecognizer:
    def __init__(self):
        # Services
        self.__recognizer = sr.Recognizer()
        self.__logger = Logger()
        self._exceptions_handler = ExceptionsHandler()

        self.__logger.info('SpeechRecognizer was successfully initialized.', __name__)

    def recognize_speech(self):
        try:
            with sr.Microphone() as source:
                speech = self.__recognizer.listen(source)
        except Exception as exception:
            error_message = self._exceptions_handler.get_error_message(exception)
            self.__logger.error(error_message, __name__)
            return error_message

        try:
            return self.__recognizer.recognize_google(speech, language='ru-RU').lower().strip()
        except Exception as exception:
            error_message = self._exceptions_handler.get_error_message(exception)

            if isinstance(exception, sr.WaitTimeoutError):
                self.__logger.warning(error_message, __name__)
            else:
                self.__logger.error(error_message, __name__)

            return error_message
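
# Usage sketch for SpeechRecognizer (hypothetical; requires a working
# microphone and network access for the Google recognizer backend):
#
#     recognizer = SpeechRecognizer()
#     text = recognizer.recognize_speech()  # recognized text, or an error message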
class SpellChecker:
    def __init__(self):
        # Services
        self.__logger = Logger()
        self._exceptions_handler = ExceptionsHandler()

        self.__logger.info('SpellChecker was successfully initialized.', __name__)

    def check_spelling(self, text):
        self.__logger.info(f'Start text: {text}', __name__)

        try:
            response = requests.get('https://speller.yandex.net/services/spellservice.json/checkText',
                                    params={'text': text}).json()
        except Exception as exception:
            self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
            return text

        for word in response:
            if word['s']:  # the speller may return an empty suggestion list
                text = text.replace(word['word'], word['s'][0])

        self.__logger.info(f'Checked text: {text}', __name__)

        return text
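
# Usage sketch for SpellChecker (hypothetical; each call hits the public
# Yandex.Speller HTTP API, so network access is required):
#
#     checker = SpellChecker()
#     fixed = checker.check_spelling('превет мир')  # first suggestion substituted, e.g. 'привет мир'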
class DatabaseCursor:
    def __init__(self):
        # Services
        self.__logger = Logger()
        self._path_service = PathService()
        self._configurator = Configurator()
        self._exceptions_handler = ExceptionsHandler()

        # Data
        self._wd = os.getcwd()
        self._request_url = None
        self.databases_public_keys = None

        self.__logger.info('DatabaseCursor was successfully initialized.', __name__)

    def _load_config(self):
        path_to_config = os.path.join(self._path_service.path_to_configs, 'database_cursor.json')

        if os.path.exists(path_to_config):
            with open(path_to_config, 'r', encoding='utf-8') as file:
                config = json.load(file)

            self._request_url = config['request_url']
            self.databases_public_keys = config['database_public_keys']
        else:
            self.__logger.error("Can't load config for DatabaseCursor (doesn't exist).", __name__)

    def __update_connection(self, ngram):
        path_to_db = None

        if ngram.count(' ') == 0:
            path_to_db = self._path_service.get_path_to_database('unigrams.db')
        elif ngram.count(' ') == 1:
            path_to_db = self._path_service.get_path_to_database('bigrams.db')
        elif ngram.count(' ') == 2:
            path_to_db = self._path_service.get_path_to_database('trigrams.db')

        if path_to_db and os.path.exists(path_to_db):
            self.__logger.info(f'Connected to database: {path_to_db}', __name__)
            return sqlite3.connect(path_to_db)

        self.__logger.warning(f'Database lost: {path_to_db}', __name__)
        self.__logger.info('Trying to download database from cloud...', __name__)

        self._configurator.download_database(path_to_db)

        if os.path.exists(path_to_db):
            self.__logger.info(f'Connected to database: {path_to_db}', __name__)
            return sqlite3.connect(path_to_db)

        self.__logger.fatal("Database doesn't exist.", __name__)

    def get_entry(self, ngram):
        connection = self.__update_connection(ngram)
        cursor = connection.cursor()

        # Parameterized query: interpolating the ngram into the SQL string
        # would be vulnerable to SQL injection.
        request = 'SELECT * FROM Data WHERE Ngram = ?'
        self.__logger.info(f'Request to DB: {request} [{ngram}]', __name__)

        try:
            cursor.execute(request, (ngram,))
            self.__logger.info('Request is OK.', __name__)
        except Exception as exception:
            connection.close()
            self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
            return

        result = cursor.fetchone()
        connection.close()
        self.__logger.info(f'Received data: {str(result)}', __name__)

        if result:
            return result[1], result[2]

    def entry_exists(self, ngram):
        connection = self.__update_connection(ngram)
        cursor = connection.cursor()

        request = 'SELECT * FROM Data WHERE Ngram = ?'
        self.__logger.info(f'Request to DB: {request} [{ngram}]', __name__)

        try:
            cursor.execute(request, (ngram,))
            self.__logger.info('Request is OK.', __name__)
        except Exception as exception:
            connection.close()
            self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
            return False

        entry = cursor.fetchone()
        connection.close()

        if entry:
            self.__logger.info('Entry exists.', __name__)
            return True

        self.__logger.info("Entry doesn't exist.", __name__)
        return False
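
# Usage sketch for DatabaseCursor (hypothetical; the target database is chosen
# by the number of spaces in the key, so a one-word key hits unigrams.db):
#
#     cursor = DatabaseCursor()
#     if cursor.entry_exists('слово'):
#         pos_docs, neg_docs = cursor.get_entry('слово')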
class PathService(metaclass=Singleton):
    def __init__(self):
        # Services
        self.__logger = Logger()

        # Data
        self._wd = os.getcwd()

        self.path_to_databases = None
        self.path_to_configs = None

        self._valid_classifiers = None
        self._valid_model_types = None
        self._valid_databases = None
        self._valid_test_results_modes = None
        self._valid_datasets = None

        self.path_to_stop_words = None
        self._path_to_main_directory = None
        self.path_to_vector_model = None
        self._path_to_classifier_models = None
        self._path_to_test_results = None

        self.configure()

        self.__logger.info('PathService was successfully configured.', __name__)

    def _find_main_directory(self):
        max_nesting_level = 5
        nesting_level = 0

        while not os.getcwd().endswith('Python'):
            if os.getcwd().endswith('Databases'):
                os.chdir(os.path.join('..', 'Python'))
                break
            else:
                os.chdir('..')

            nesting_level += 1
            if nesting_level > max_nesting_level:
                self.__logger.fatal("Can't find main directory (exceeded maximum nesting level).", __name__)
                exit(-1)

        self._path_to_main_directory = os.getcwd()
        self.path_to_configs = os.path.join(self._path_to_main_directory, 'Services', 'Configs')
        self.path_to_databases = os.path.abspath(os.path.join('..', 'Databases'))

        os.chdir(self._wd)

    def _check_paths_existing(self):
        if not os.path.exists(self.path_to_configs):
            self.__logger.fatal("Directory with config files doesn't exist.", __name__)
            exit(-1)
        elif not os.path.exists(self.path_to_databases):
            self.__logger.fatal("Directory with databases doesn't exist.", __name__)
            exit(-1)
        elif not os.path.exists(self._path_to_classifier_models):
            self.__logger.fatal("Directory with classifier models doesn't exist.", __name__)
            exit(-1)

        if not os.path.exists(self.path_to_vector_model):
            self.path_to_vector_model = None
            self.__logger.error("Vector model doesn't exist.", __name__)

        if not os.path.exists(self.path_to_stop_words):
            self.path_to_stop_words = None
            self.__logger.error("File with stop-words doesn't exist.", __name__)

        if not os.path.exists(self._path_to_test_results):
            self._path_to_test_results = None
            self.__logger.warning("Directory with tests reports doesn't exist.", __name__)

    def _load_config(self):
        path_to_config = os.path.join(self.path_to_configs, 'path_service.json')

        if not os.path.exists(path_to_config):
            self.__logger.error("Can't find config-file for PathService.", __name__)
            return

        with open(path_to_config, 'r', encoding='utf-8') as file:
            config = json.load(file)

        self._valid_classifiers = config['valid_classifiers']
        self._valid_databases = config['valid_databases']
        self._valid_datasets = config['valid_datasets']
        self._valid_test_results_modes = config['valid_test_results_modes']
        self._valid_model_types = config['valid_model_types']

    def configure(self):
        self._find_main_directory()
        self._load_config()

        self.path_to_vector_model = os.path.join(self.path_to_databases,
                                                 'ruscorpora_upos_skipgram_300_10_2017.bin.gz')
        self.path_to_stop_words = os.path.join(self._path_to_main_directory,
                                               'Services', 'Lemmatizer', 'stop_words.json')
        self._path_to_classifier_models = os.path.join(self.path_to_databases, 'Models')
        self._path_to_test_results = os.path.join(self._path_to_main_directory,
                                                  'Tests', 'System', 'Reports')

        self._check_paths_existing()

    def get_path_to_test_results(self, mode='classifier', classifier_name='NBC'):
        if classifier_name not in self._valid_classifiers:
            self.__logger.warning('Got incorrect classifier name.', __name__)
            classifier_name = 'NBC'

        mode = mode.lower().strip()

        if mode not in self._valid_test_results_modes:
            self.__logger.warning('Got incorrect mode.', __name__)
            return self._path_to_test_results

        if mode == 'vec_model':
            return os.path.join(self._path_to_test_results, 'VectorModel')
        elif mode == 'classifier_main':
            return os.path.join(self._path_to_test_results, '..', '..',
                                'MainReports', 'Classifier', classifier_name)
        elif mode == 'classifier':
            return self._path_to_test_results

    def get_path_to_model(self, model='unigrams', classifier_name='NBC'):
        if classifier_name not in self._valid_classifiers:
            self.__logger.warning('Got incorrect classifier name.', __name__)
            classifier_name = 'NBC'

        if model not in self._valid_model_types:
            self.__logger.warning('Got incorrect model type.', __name__)
            model = 'unigrams'

        path_to_models = os.path.join(self._path_to_classifier_models, classifier_name)

        if os.path.exists(path_to_models):
            return os.path.join(path_to_models, f'model_{model}.pkl')

        self.__logger.error("Required model wasn't found.", __name__)

    def get_path_to_database(self, database_name='unigrams.db'):
        if database_name not in self._valid_databases:
            self.__logger.warning('Got incorrect database name.', __name__)
            database_name = 'unigrams.db'

        return os.path.join(self.path_to_databases, database_name)

    def get_path_to_dataset(self, dataset):
        if dataset not in self._valid_datasets:
            self.__logger.warning('Got incorrect dataset name.', __name__)
            dataset = 'dataset_with_unigrams.csv'

        return os.path.join(self.path_to_databases, dataset)

    def set_path_to_vector_model(self, path_to_vector_model):
        self.path_to_vector_model = path_to_vector_model
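
# Usage sketch for PathService (hypothetical; the Singleton metaclass means
# every instantiation returns the same configured instance):
#
#     paths = PathService()
#     db_path = paths.get_path_to_database('bigrams.db')
#     model_path = paths.get_path_to_model(model='bigrams', classifier_name='NBC')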
class NgramAnalyzer:
    def __init__(self):
        # Services
        self._database_cursor = DatabaseCursor()
        self.__logger = Logger()
        self._exceptions_handler = ExceptionsHandler()
        self._lemmatizer = Lemmatizer()
        self._path_service = PathService()
        self._configurator = Configurator()
        self._morph_analyzer = pymorphy2.MorphAnalyzer()

        # Data
        self._vec_model = None
        self._load_vec_model()

        self.__logger.info('NgramAnalyzer was successfully initialized.', __name__)

    def _load_vec_model(self):
        if not self._path_service.path_to_vector_model:
            self.__logger.warning("Vector model doesn't exist.", __name__)

            self._configurator.download_vector_model()
            self._path_service.set_path_to_vector_model(
                os.path.join(self._path_service.path_to_databases,
                             'ruscorpora_upos_skipgram_300_10_2017.bin.gz'))
            self.__logger.info('Vector model was successfully downloaded.', __name__)

        if self._path_service.path_to_vector_model:
            self._vec_model = gensim.models.KeyedVectors.load_word2vec_format(
                self._path_service.path_to_vector_model, binary=True)
        else:
            self.__logger.error("Vector model doesn't exist.", __name__)

    def _part_of_speech_detect(self, word):
        if not word:
            return

        part_of_speech = self._morph_analyzer.parse(word)[0].tag.POS

        if part_of_speech:
            if re.match(r'ADJ', part_of_speech):
                return 'ADJ'
            elif re.match(r'PRT', part_of_speech):
                return 'PRT'
            elif part_of_speech == 'INFN':
                return 'VERB'
            elif part_of_speech in ('ADVB', 'PRED'):
                return 'ADV'
            elif part_of_speech == 'PRCL':
                return 'PART'

            return part_of_speech

    @staticmethod
    def _detect_ngram_type(ngram):
        if not ngram:
            return

        if ngram.count(' ') == 0:
            return 'unigram'
        elif ngram.count(' ') == 1:
            return 'bigram'
        elif ngram.count(' ') == 2:
            return 'trigram'

    def _nearest_synonyms_find(self, word, topn):
        if not self._vec_model or not word or topn <= 0:
            return

        nearest_synonyms = list()
        part_of_speech = self._part_of_speech_detect(word)
        ngram_type = self._detect_ngram_type(word)

        if part_of_speech:
            # The vector model keys words as 'word_POS' (UPoS-tagged corpus).
            word = f'{word}_{part_of_speech}'

        try:
            for synonym in self._vec_model.most_similar(positive=[word], topn=topn * 10):
                found_synonym = self._lemmatizer.get_text_initial_form(synonym[0].split('_')[0])

                if found_synonym and self._detect_ngram_type(found_synonym) == ngram_type:
                    nearest_synonyms.append({'word': found_synonym, 'cosine proximity': synonym[1]})

                if len(nearest_synonyms) == topn:
                    break
        except Exception as exception:
            self.__logger.warning(self._exceptions_handler.get_error_message(exception), __name__)
            return

        return nearest_synonyms

    def relevant_ngram_find(self, ngram):
        if not ngram:
            return

        self.__logger.info(f'Start ngram: {ngram}', __name__)

        response = {'synonym_found': False, 'content': dict()}

        if self._detect_ngram_type(ngram) == 'unigram':
            synonyms_count = 10
            nearest_synonyms = self._nearest_synonyms_find(ngram, synonyms_count)

            if not nearest_synonyms:
                return response

            for nearest_synonym in nearest_synonyms:
                data = self._database_cursor.get_entry(nearest_synonym['word'])

                if data and data[0]:
                    self.__logger.info(f'Relevant ngram: {nearest_synonym["word"]}', __name__)

                    response['synonym_found'] = True
                    response['content']['synonym'] = nearest_synonym['word']
                    response['content']['pos_docs'] = data[0]
                    response['content']['neg_docs'] = data[1]

                    return response

        return response
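
# Usage sketch for NgramAnalyzer (hypothetical; loading the word2vec model is
# slow, and the model is downloaded on first run if missing):
#
#     analyzer = NgramAnalyzer()
#     result = analyzer.relevant_ngram_find('хороший')
#     if result['synonym_found']:
#         print(result['content']['synonym'], result['content']['pos_docs'])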
class Configurator(metaclass=Singleton):
    def __init__(self):
        # Services
        self.__logger = Logger()
        self._path_service = PathService()
        self._exceptions_handler = ExceptionsHandler()

        # Data
        self._config = dict()
        self._wd = os.getcwd()
        self._path_to_databases = None
        self._request_url = None
        self._vector_model_public_key = None
        self._databases_public_keys = None

        self._load_public_keys()

        self.__logger.info('Configurator was successfully initialized.', __name__)

    def _load_public_keys(self):
        path_to_config = os.path.join(self._path_service.path_to_configs, 'configurator.json')

        if os.path.exists(path_to_config):
            with open(path_to_config, 'r', encoding='utf-8') as file:
                config = json.load(file)

            self._request_url = config['request_url']
            self._vector_model_public_key = config['vector_model_public_key']
            self._databases_public_keys = config['databases_public_keys']
        else:
            self.__logger.error("Can't load config for Configurator (doesn't exist).", __name__)

    def download_database(self, path_to_db):
        database_name = os.path.split(path_to_db)[1]

        if database_name:
            try:
                download_url = requests.get(self._request_url, params={
                    'public_key': self._databases_public_keys[database_name]
                }).json()['href']

                with open(path_to_db, 'wb') as database_file:
                    database_file.write(requests.get(download_url).content)

                self._config[path_to_db] = 'downloaded'
            except Exception as exception:
                self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
                self._config[path_to_db] = 'error'

    def download_vector_model(self):
        self._path_service.set_path_to_vector_model(
            os.path.join(self._path_service.path_to_databases,
                         'ruscorpora_upos_skipgram_300_10_2017.bin.gz'))

        try:
            download_url = requests.get(self._request_url, params={
                'public_key': self._vector_model_public_key
            }).json()['href']

            with open(self._path_service.path_to_vector_model, 'wb') as vec_model:
                vec_model.write(requests.get(download_url).content)

            self._config['ruscorpora_upos_skipgram_300_10_2017.bin.gz'] = 'downloaded'
        except Exception as exception:
            self.__logger.error(self._exceptions_handler.get_error_message(exception), __name__)
            self._config['ruscorpora_upos_skipgram_300_10_2017.bin.gz'] = 'error'

    def configure_system(self):
        self._config['datetime'] = str(datetime.now())

        for database in ['unigrams.db', 'bigrams.db', 'trigrams.db']:
            path_to_database = self._path_service.get_path_to_database(database)

            if not path_to_database or not os.path.exists(path_to_database):
                self.__logger.warning(f'Database not found: {database}', __name__)
                self.download_database(os.path.join(self._path_service.path_to_databases, database))
            else:
                self._config[database] = 'exists'

        if not self._path_service.path_to_vector_model or \
                not os.path.exists(self._path_service.path_to_vector_model):
            self.__logger.warning('Vector model not found.', __name__)
            self.download_vector_model()
        else:
            self._config['ruscorpora_upos_skipgram_300_10_2017.bin.gz'] = 'exists'

        self._create_config()

    def _create_config(self):
        with open(os.path.join('Logs', 'config.json'), 'w', encoding='utf-8') as config:
            json.dump(self._config, config, indent=4)
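
# Usage sketch for Configurator (hypothetical; downloads any missing n-gram
# databases and the vector model, then writes Logs/config.json):
#
#     configurator = Configurator()
#     configurator.configure_system()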
class Classifier:
    def __init__(self):
        # Services
        self.__logger = Logger()
        self._path_service = PathService()
        self._exceptions_handler = ExceptionsHandler()

        # Data
        self._container = ClassificationDataContainer()
        self._possible_classifiers = ['NBC', 'LogisticRegression', 'KNN']

        self.__logger.info('Classifier was successfully initialized.', __name__)

    def _load_config(self):
        path_to_config = os.path.join(self._path_service.path_to_configs, 'classifier.json')

        if os.path.exists(path_to_config):
            with open(path_to_config, 'r', encoding='utf-8') as file:
                config = json.load(file)

            self._possible_classifiers = config['possible_classifiers']
        else:
            self.__logger.error("Can't load Classifier configuration.", __name__)

    def customize(self, unigrams_weight, bigrams_weight, trigrams_weight, classifier_name='NBC'):
        self._container.clear()

        if classifier_name in self._possible_classifiers:
            self._container.classifiers['name'] = classifier_name
        else:
            self._container.classifiers['name'] = 'NBC'
            self.__logger.error('Got unknown classifier, set default (NBC).', __name__)

        self._container.weights['unigrams'] = unigrams_weight
        self._container.weights['bigrams'] = bigrams_weight
        self._container.weights['trigrams'] = trigrams_weight

        try:
            if self._container.weights['unigrams']:
                self._container.classifiers['unigrams'] = joblib.load(
                    self._path_service.get_path_to_model('unigrams', self._container.classifiers['name']))

            if self._container.weights['bigrams']:
                self._container.classifiers['bigrams'] = joblib.load(
                    self._path_service.get_path_to_model('bigrams', self._container.classifiers['name']))

            if self._container.weights['trigrams']:
                self._container.classifiers['trigrams'] = joblib.load(
                    self._path_service.get_path_to_model('trigrams', self._container.classifiers['name']))

            self.__logger.info('Models were successfully loaded.', __name__)
            self.__logger.info('Classifier was successfully configured.', __name__)
        except Exception as exception:
            self.__logger.fatal(self._exceptions_handler.get_error_message(exception), __name__)

    def _predict_tonal_by_unigrams(self):
        # scikit-learn estimators expect a 2D feature array, hence [[...]].
        features = [[self._container.weights['unigrams']]]

        self._container.tonalities['unigrams'] = \
            self._container.classifiers['unigrams'].predict(features)[0]
        self._container.probabilities['unigrams'] = \
            max(self._container.classifiers['unigrams'].predict_proba(features)[0])

        self.__logger.info(f'Unigrams tonal: {self._container.tonalities["unigrams"]}', __name__)
        self.__logger.info(f'Unigrams probability: {self._container.probabilities["unigrams"]}', __name__)

    def _predict_tonal_by_unigrams_bigrams(self):
        features = [[self._container.weights['unigrams'],
                     self._container.weights['bigrams']]]

        self._container.tonalities['bigrams'] = \
            self._container.classifiers['bigrams'].predict(features)[0]
        self._container.probabilities['bigrams'] = \
            max(self._container.classifiers['bigrams'].predict_proba(features)[0])

        self.__logger.info(f'Bigrams tonal: {self._container.tonalities["bigrams"]}', __name__)
        self.__logger.info(f'Bigrams probability: {self._container.probabilities["bigrams"]}', __name__)

    def _predict_tonal_by_unigrams_bigrams_trigrams(self):
        features = [[self._container.weights['unigrams'],
                     self._container.weights['bigrams'],
                     self._container.weights['trigrams']]]

        self._container.tonalities['trigrams'] = \
            self._container.classifiers['trigrams'].predict(features)[0]
        self._container.probabilities['trigrams'] = \
            max(self._container.classifiers['trigrams'].predict_proba(features)[0])

        self.__logger.info(f'Trigrams tonal: {self._container.tonalities["trigrams"]}', __name__)
        self.__logger.info(f'Trigrams probability: {self._container.probabilities["trigrams"]}', __name__)

    def _predict_intermediate_tonalities(self):
        threads = list()

        if self._container.weights['unigrams']:
            threads.append(Thread(target=self._predict_tonal_by_unigrams))
        if self._container.weights['bigrams']:
            threads.append(Thread(target=self._predict_tonal_by_unigrams_bigrams))
        if self._container.weights['trigrams']:
            threads.append(Thread(target=self._predict_tonal_by_unigrams_bigrams_trigrams))

        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()  # join() already blocks until the thread finishes

    def _select_final_tonal(self):
        tonalities = self._container.tonalities
        probabilities = self._container.probabilities

        if tonalities['unigrams'] and tonalities['bigrams'] and tonalities['trigrams']:
            # Majority vote: any two agreeing levels decide the final tonality.
            if tonalities['unigrams'] == tonalities['bigrams']:
                tonalities['final'] = tonalities['unigrams']
                probabilities['final'] = max(probabilities['unigrams'], probabilities['bigrams'])
            elif tonalities['unigrams'] == tonalities['trigrams']:
                tonalities['final'] = tonalities['unigrams']
                probabilities['final'] = max(probabilities['unigrams'], probabilities['trigrams'])
            elif tonalities['bigrams'] == tonalities['trigrams']:
                tonalities['final'] = tonalities['bigrams']
                probabilities['final'] = max(probabilities['bigrams'], probabilities['trigrams'])
        elif tonalities['unigrams'] and tonalities['bigrams']:
            if tonalities['unigrams'] == tonalities['bigrams']:
                tonalities['final'] = tonalities['unigrams']
                probabilities['final'] = max(probabilities['unigrams'], probabilities['bigrams'])
            elif probabilities['unigrams'] >= probabilities['bigrams']:
                tonalities['final'] = tonalities['unigrams']
                probabilities['final'] = probabilities['unigrams']
            else:
                tonalities['final'] = tonalities['bigrams']
                probabilities['final'] = probabilities['bigrams']
        elif tonalities['unigrams']:
            tonalities['final'] = tonalities['unigrams']
            probabilities['final'] = probabilities['unigrams']

    def predict_tonal(self):
        self._predict_intermediate_tonalities()
        self._select_final_tonal()

        self.__logger.info(f'Final tonal: {self._container.tonalities["final"]}', __name__)
        self.__logger.info(f'Final probability: {self._container.probabilities["final"]}', __name__)

        return self._container.tonalities['final'], self._container.probabilities['final']
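
# Usage sketch for Classifier (hypothetical weights; customize() loads a
# pickled model for every non-zero weight, predict_tonal() runs the models in
# parallel threads and merges their votes, preferring agreement and falling
# back to the higher-probability prediction):
#
#     classifier = Classifier()
#     classifier.customize(0.7, 0.2, 0.1, classifier_name='NBC')
#     tonal, probability = classifier.predict_tonal()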