def create_session_configs(configs_location=None, delete_old_configs=1):
    if configs_location is None:
        configs_location = ConfigReader.get_configs_location()
    if delete_old_configs:
        Storage.delete_location(configs_location)
    configs = [SessionConfigReader.get_config_template()]
    configs = SessionConfigBuilderCustom1.add_all_config_info(configs)
    n_configs = len(configs)
    SessionLogger.log('Constructed ' + str(n_configs) + ' new session configs from template: \'' + ConfigReader.get_config_template_id() + '\'.')
    config_ids = list()
    idx = 0
    for conf in configs:
        config_id = configs_location + '/' + SessionConfigBuilderCustom1.config_name + str(idx + 1)
        SessionConfigReader.set_config(conf, config_id)
        config_ids.append(config_id)
        idx = idx + 1
    SessionLogger.log('Stored ' + str(n_configs) + ' session configs in \'' + configs_location + '\'.')
    return config_ids
def interpret_output(data_frame, col_name=col_name_class_out, new_col_name=col_name_result, storage_level=0, storage_name='', log=1):
    df = data_frame.copy()
    category_list = CategoryListHandler.read_categories()
    category_vectors = Vectorizer.get_word_vectors(category_list)
    df[new_col_name] = df.apply(lambda x: [ClassificationInterpreterCustom1.get_highest_similarity(x[col_name], category_list, category_vectors)], axis=1)
    log_text = 'Categories have been determined (' + str(len(df.index)) + ' entries).'
    if storage_level >= 1 and storage_name != '':
        storage_name = storage_name + ClassificationInterpreterCustom1.ext_categorized
        Storage.store_pd_frame(df, storage_name)
        log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + new_col_name + '\').'
    if log:
        SessionLogger.log(log_text)
    return df
def add_stopwords(stopwords):
    db_type = ConfigReader.get_db_type()
    session_id = ConfigReader.get_session_id()
    if db_type == StopwordHandler.db_type_fs:
        DiskStorageStopwordHandler.add_stopwords(session_id, stopwords)
    SessionLogger.log(str(len(stopwords)) + ' stop words have been added.')
def create_out_vectors(data_frame, col_name=col_name_categories, new_col_name=new_col_name_cat_vec, storage_level=0, storage_name=''):
    classification_interpreter = SessionConfigReader.read_value(ClassificationInterpreter.classification_interpreter_key)
    if classification_interpreter == ClassificationInterpreter.classification_interpreter_custom1:
        return ClassificationInterpreterCustom1.create_out_vectors(data_frame, col_name=col_name, new_col_name=new_col_name, storage_level=storage_level, storage_name=storage_name)
    elif classification_interpreter == ClassificationInterpreter.classification_interpreter_custom2:
        return ClassificationInterpreterCustom2.create_out_vectors(data_frame, col_name=col_name, new_col_name=new_col_name, storage_level=storage_level, storage_name=storage_name)
    else:
        SessionLogger.log('Tried to create category vectors. Specified ClassificationInterpreter is not supported.', log_type='error')
        return pd.DataFrame()
def import_docs(csv_path=None):
    if csv_path is None:
        session_folder = os.path.join(TenKGnadImporter.sessions_folder, SessionConfigReader.get_session_id())
        corpus_id = SessionConfigReader.read_value(TenKGnadImporter.corpus_id_key)
        corpus_id = DiskStorageMisc.get_identifier_path(corpus_id)
        csv_path = os.path.join(session_folder, corpus_id + TenKGnadImporter.csv_ext)
    df = pd.read_csv(csv_path, sep=';', quotechar='\'', quoting=csv.QUOTE_MINIMAL, header=None, names=[TenKGnadImporter.category_name, TenKGnadImporter.text_name])
    category_list = df[TenKGnadImporter.category_name].tolist()
    df[TenKGnadImporter.category_name] = df.apply(lambda x: [x[TenKGnadImporter.category_name]], axis=1)
    head, f_name = os.path.split(csv_path)
    identifier = f_name.split('.')[0]
    Storage.store_pd_frame(df, identifier)
    SessionLogger.log('TenKGnad Corpus (' + str(len(df.index)) + ' entries) has been imported into \'' + identifier + '\' (columns: \'' + TenKGnadImporter.category_name + '\', \'' + TenKGnadImporter.text_name + '\').')
    category_set = set(category_list)
    category_list = list(category_set)
    CategoryListHandler.set_categories(category_list)
    return identifier
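# Hypothetical usage sketch for the importer above (not part of the original
# module): it assumes a header-less, semicolon-separated CSV with one
# "category;text" row per article, matching the read_csv call. The file name
# and module path are placeholders.
#
# from tenkgnad_importer import TenKGnadImporter           # assumed module path
# identifier = TenKGnadImporter.import_docs(csv_path='articles.csv')
# df = Storage.load_pd_frame(identifier)                   # imported data frame
# print(CategoryListHandler.read_categories())             # distinct categories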
def remove_stopwords(data_frame, custom_stop_words=None, download_live_stopwords=0, col_name=col_name, storage_level=0, storage_name='', log=1):
    df = data_frame.copy()
    stop_words = StopwordHandler.read_stopwords()
    if download_live_stopwords:
        stop_words = stop_words.union(StopwordDownloaderNLTK.get_stopwords(store=0))
    stop_words = StopWordRemoverCustom.capitalize_words(stop_words)
    if custom_stop_words is not None:
        stop_words = stop_words.union(custom_stop_words)
    df[StopWordRemoverCustom.new_col_name] = df.apply(lambda x: StopWordRemoverCustom.process_text(x[col_name], stop_words), axis=1)
    log_text = 'Removed stop words from documents (' + str(len(df.index)) + ' entries).'
    if storage_level >= 1 and storage_name != '':
        Storage.store_pd_frame(df, storage_name)
        log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + StopWordRemoverCustom.new_col_name + '\').'
    if log:
        SessionLogger.log(log_text)
    return df
def create_out_vectors(data_frame, col_name=col_name_categories, new_col_name=new_col_name_cat_vec, storage_level=0, storage_name=''):
    df = data_frame.copy()
    df[ClassificationInterpreterCustom1.one_word_cat] = df.apply(lambda x: ClassificationInterpreterCustom1.extract_one_word_cat(x[col_name]), axis=1)
    vectorized_df = Vectorizer.vectorize(df, col_name=ClassificationInterpreterCustom1.one_word_cat, new_col_name=new_col_name, storage_level=0, log=0)
    vectorized_df = vectorized_df.drop(columns=[ClassificationInterpreterCustom1.one_word_cat])
    vectorized_df[new_col_name] = vectorized_df.apply(lambda x: (x[new_col_name] + 1) / 2, axis=1)  # adjust to softmax codomain
    log_text = 'Category vectors for classifier training have been created (' + str(len(data_frame.index)) + ' entries).'
    if storage_level >= 1 and storage_name != '':
        storage_name = storage_name + ClassificationInterpreterCustom1.ext_out_vecs
        Storage.store_pd_frame(vectorized_df, storage_name)
        log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + new_col_name + '\').'
    SessionLogger.log(log_text)
    return vectorized_df
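# Worked note on the '(x + 1) / 2' adjustment above: word-vector components can
# lie roughly in [-1, 1], while a softmax output layer produces values in
# [0, 1]; shifting by 1 and halving maps the former range onto the latter so the
# category vectors can serve as training targets. Illustrative values only.
import numpy as np

cat_vec = np.array([-1.0, 0.0, 0.5, 1.0])
print((cat_vec + 1) / 2)    # -> [0.   0.5  0.75 1.  ]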
def remove_stopwords(data_frame, custom_stop_words=None, download_live_stopwords=0, col_name=col_name, storage_level=0, storage_name='', log=1):
    stopwordremover = SessionConfigReader.read_value(StopWordRemover.stopwordremover_key)
    if custom_stop_words is None:
        custom_stop_words = list()
    custom_stop_words.extend(SessionConfigReader.read_value(StopWordRemover.custom_sw_key))
    if stopwordremover == StopWordRemover.stopwordremover_custom:
        return StopWordRemoverCustom.remove_stopwords(data_frame, custom_stop_words=custom_stop_words, download_live_stopwords=download_live_stopwords, col_name=col_name, storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to remove stopwords from documents. Specified Stopword Remover not supported.', log_type='error')
        return pd.DataFrame()
def split_train_test(identifier=None, data_frame=None):
    if data_frame is None:
        data_frame = Storage.load_pd_frame(identifier)
    split_ratio = SessionConfigReader.read_value(TrainTestSplitterCustom1.split_ratio_key)
    if split_ratio > 1:
        split_ratio = 1
    random_state = SessionConfigReader.read_value(TrainTestSplitterCustom1.random_state_key)
    if isinstance(random_state, int):
        train = data_frame.sample(frac=split_ratio, random_state=random_state)
    else:
        train = data_frame.sample(frac=split_ratio)
    test = data_frame.drop(train.index)
    if identifier is None:
        identifier = SessionConfigReader.read_value(TrainTestSplitterCustom1.corpus_identifier_key)
    train_name = identifier + TrainTestSplitterCustom1.ext_train
    test_name = identifier + TrainTestSplitterCustom1.ext_test
    Storage.store_pd_frame(train, train_name)
    Storage.store_pd_frame(test, test_name)
    SessionLogger.log('Split \'' + identifier + '\' (' + str(len(data_frame.index)) + ' entries) into \'' + train_name + '\' (' + str(len(train.index)) + ' entries) and \'' + test_name + '\' (' + str(len(test.index)) + ' entries).')
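# Hypothetical call sequence for the splitter above; the identifier is a
# placeholder, and split_ratio / random_state are read from the session config.
#
# TrainTestSplitterCustom1.split_train_test(identifier='corpus_preprocessed')
# train_df = Storage.load_pd_frame('corpus_preprocessed' + TrainTestSplitterCustom1.ext_train)
# test_df = Storage.load_pd_frame('corpus_preprocessed' + TrainTestSplitterCustom1.ext_test)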
def interpret_output(data_frame, col_name=col_name_class_out, new_col_name=col_name_result, storage_level=0, storage_name='', log=1):
    classification_interpreter = SessionConfigReader.read_value(ClassificationInterpreter.classification_interpreter_key)
    if classification_interpreter == ClassificationInterpreter.classification_interpreter_custom1:
        return ClassificationInterpreterCustom1.interpret_output(data_frame, col_name=col_name, new_col_name=new_col_name, storage_level=storage_level, storage_name=storage_name, log=log)
    elif classification_interpreter == ClassificationInterpreter.classification_interpreter_custom2:
        return ClassificationInterpreterCustom2.interpret_output(data_frame, col_name=col_name, new_col_name=new_col_name, storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        SessionLogger.log('Tried to interpret output vectors. Specified ClassificationInterpreter is not supported.', log_type='error')
        return pd.DataFrame()
def preprocess_texts(data_frame, col_name=col_name, new_col_name=new_col_name, storage_level=0, storage_name='', log=1):
    preprocessor_type = SessionConfigReader.read_value(TextPreprocessor.preprocessor_key)
    if preprocessor_type == TextPreprocessor.preprocessor_custom:
        return TextPreprocessorCustom.preprocess_texts(data_frame, col_name=col_name, new_col_name=new_col_name, storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to preprocess texts. Specified Preprocessor is not supported.', log_type='error')
        return pd.DataFrame()
def normalize(data_frame, col_name=col_name, storage_level=0, storage_name='', log=1):
    lemmatizer = SessionConfigReader.read_value(Lemmatizer.lemmatizer_key)
    if lemmatizer == Lemmatizer.lemmatizer_spacy_german:
        return LemmatizerSpacyGerman.normalize(data_frame, col_name=col_name, storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to lemmatize documents. Specified Lemmatizer not supported.', log_type='error')
        return pd.DataFrame()
def get_stopwords(language=german, store=1, log=1):
    nltk.download('stopwords')
    sw = set(stopwords.words(language))
    log_text = str(len(sw)) + ' stop words downloaded from NLTK.'
    if log:
        SessionLogger.log(log_text)
    if store:
        StopwordHandler.set_stopwords(sw)
    return sw
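# Minimal standalone equivalent of the download above, independent of the
# session framework; runnable with only nltk installed.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')                          # fetch the stopword corpus
german_stopwords = set(stopwords.words('german'))
print(len(german_stopwords), 'German stop words downloaded from NLTK.')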
def get_model_id():
    classifier_type = SessionConfigReader.read_value(Classifier.classifier_key)
    if classifier_type == Classifier.classifier_keras_nn:
        return ClassifierKerasNN.get_model_id()
    else:
        SessionLogger.log('Tried to get classifier model id. Specified Classifier is not supported.', log_type='error')
        return ''
def remove_noise(data_frame, col_name=col_name, storage_level=0, storage_name='', log=1):
    df = data_frame.copy()
    df[NoiseRemoverCustom.new_col_name] = df.apply(lambda x: NoiseRemoverCustom.process_text(x[col_name]), axis=1)
    log_text = 'Removed noise from documents (' + str(len(df.index)) + ' entries).'
    if storage_level >= 1 and storage_name != '':
        Storage.store_pd_frame(df, storage_name)
        log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + NoiseRemoverCustom.new_col_name + '\').'
    if log:
        SessionLogger.log(log_text)
    return df
def import_docs():
    importer_type = SessionConfigReader.read_value(CorpusImporter.corpus_importer_key)
    if importer_type == CorpusImporter.tenkgnad_importer:
        return TenKGnadImporter.import_docs()
    else:
        SessionLogger.log('Tried to import corpus. Specified Corpus Importer is not supported.', log_type='error')
        return ''
def preprocess_texts(data_frame, col_name=col_name, new_col_name=col_name_preprocessed, storage_level=0, storage_name='', log=1):
    storage_name_ext = storage_name
    if storage_name != '':
        storage_name_ext = storage_name + TextPreprocessorCustom.ext_noise_removed
    noise_removed_df = NoiseRemover.remove_noise(data_frame, col_name=col_name, storage_level=storage_level - 1, storage_name=storage_name_ext, log=log)
    if storage_name != '':
        storage_name_ext = storage_name + TextPreprocessorCustom.ext_stops_removed
    stops_removed_df = StopWordRemover.remove_stopwords(noise_removed_df, col_name=TextPreprocessorCustom.col_name_noise_removed, storage_level=storage_level - 1, storage_name=storage_name_ext, log=log)
    if storage_name != '':
        storage_name_ext = storage_name + TextPreprocessorCustom.ext_lemmatized
    processed_texts_df = Lemmatizer.normalize(stops_removed_df, col_name=TextPreprocessorCustom.col_name_stops_removed, storage_level=storage_level - 1, storage_name=storage_name_ext, log=log)
    if storage_level <= 1:
        processed_texts_df = processed_texts_df.drop(columns=[TextPreprocessorCustom.col_name_noise_removed])
        processed_texts_df = processed_texts_df.drop(columns=[TextPreprocessorCustom.col_name_stops_removed])
    processed_texts_df = processed_texts_df.rename(columns={TextPreprocessorCustom.col_name_lemmatized: new_col_name})
    log_text = 'Documents have been preprocessed (' + str(len(data_frame.index)) + ' entries).'
    if storage_level >= 1 and storage_name != '':
        Storage.store_pd_frame(processed_texts_df, storage_name + TextPreprocessorCustom.ext_preprocessed)
        log_text = log_text + ' Stored in \'' + storage_name + TextPreprocessorCustom.ext_preprocessed + '\' (column: \'' + new_col_name + '\').'
    if log:
        SessionLogger.log(log_text)
    return processed_texts_df
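# Hypothetical end-to-end call of the preprocessing chain above; 'raw_corpus'
# is a placeholder identifier, not a name from the original project.
#
# corpus_df = Storage.load_pd_frame('raw_corpus')
# clean_df = TextPreprocessorCustom.preprocess_texts(corpus_df, storage_level=1,
#                                                    storage_name='raw_corpus')
# With storage_level=1 only the final frame ('raw_corpus' + ext_preprocessed) is
# persisted; the intermediate noise-removed / stops-removed columns are dropped.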
def create_model(data_frame, new_model_id=None, col_name=col_name):
    vectorizer_type = SessionConfigReader.read_value(Vectorizer.vectorizer_key)
    if vectorizer_type == Vectorizer.vectorizer_gensim_w2v:
        return VectorizerGensimWord2Vec.create_model(data_frame, new_model_id=new_model_id, col_name=col_name)
    else:
        SessionLogger.log('Tried to create vector model. Specified Vectorizer is not supported.', log_type='error')
        return ''
def split_train_test(identifier=None, data_frame=None):
    tt_splitter_type = SessionConfigReader.read_value(TrainTestSplitter.tt_splitter_key)
    if tt_splitter_type == TrainTestSplitter.tt_splitter_custom1:
        TrainTestSplitterCustom1.split_train_test(identifier=identifier, data_frame=data_frame)
    else:
        SessionLogger.log('Tried to split \'' + identifier + '\' into train and test set. Specified TrainTestSplitter is not supported.', log_type='error')
def delete_from_folder(path):
    if os.path.exists(path):
        for filename in os.listdir(path):
            file_path = os.path.join(path, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                SessionLogger.log('Failed to delete %s. Reason: %s' % (file_path, e))
def resume_config_tests_at_idx(idx, run_import=0, run_preprocessing=0, run_vectorization=0):
    SessionLogger.log('Resuming config tests at config # ' + str(idx) + ' ...')
    if idx > 0:
        idx = idx - 1
    configs_location = SessionConfigBuilder.get_configs_location()
    config_ids = Storage.list_ids(configs_location)
    config_ids = SetupRunner.sort_config_list(config_ids)
    config_ids_with_dir = list()
    for c_id in config_ids:
        config_ids_with_dir.append(configs_location + '/' + c_id)
    SessionLogger.log('Config ID list has been restored.')
    return SetupRunner.run_config_tests(run_import=run_import, run_preprocessing=run_preprocessing, run_vectorization=run_vectorization, config_ids=config_ids_with_dir, resume_at_idx=idx)
def normalize(data_frame, col_name=col_name, storage_level=0, storage_name='', log=1):
    df = data_frame.copy()
    df[LemmatizerSpacyGerman.new_col_name] = df.apply(lambda x: LemmatizerSpacyGerman.process_text(x[col_name]), axis=1)
    log_text = 'Documents lemmatized with spacy (' + str(len(df.index)) + ' entries).'
    if storage_level >= 1 and storage_name != '':
        Storage.store_pd_frame(df, storage_name)
        log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + LemmatizerSpacyGerman.new_col_name + '\').'
    if log:
        SessionLogger.log(log_text)
    return df
def train_model(data_frame, model_id=None, fv_col_name=fv_col_name, cat_v_col_name=cat_v_col_name):
    classifier_type = SessionConfigReader.read_value(Classifier.classifier_key)
    if classifier_type == Classifier.classifier_keras_nn:
        return ClassifierKerasNN.train_model(data_frame, model_id=model_id, fv_col_name=fv_col_name, cat_v_col_name=cat_v_col_name)
    else:
        SessionLogger.log('Tried to train classifier model. Specified Classifier is not supported.', log_type='error')
        return ''
def remove_noise(data_frame, col_name=col_name, storage_level=0, storage_name='', log=1):
    noiseremover_type = SessionConfigReader.read_value(NoiseRemover.noiseremover_key)
    if noiseremover_type == NoiseRemover.noiseremover_custom:
        return NoiseRemoverCustom.remove_noise(data_frame, col_name=col_name, storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to remove noise from documents. Specified Noise Remover not supported.', log_type='error')
        return pd.DataFrame()
def create_out_vectors(data_frame, col_name=col_name_categories, new_col_name=new_col_name_cat_vec, storage_level=0, storage_name=''):
    data_frame[new_col_name] = data_frame.apply(lambda x: ClassificationInterpreterCustom2.get_cat_vec(x[col_name]), axis=1)
    log_text = 'Category vectors for classifier training have been created (' + str(len(data_frame.index)) + ' entries).'
    if storage_level >= 1 and storage_name != '':
        storage_name = storage_name + ClassificationInterpreterCustom2.ext_out_vecs
        Storage.store_pd_frame(data_frame, storage_name)
        log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + new_col_name + '\').'
    SessionLogger.log(log_text)
    return data_frame
def evaluate_output(data_frame, col_name_categories=col_name_categories, col_name_outputs=col_name_result):
    classification_interpreter = SessionConfigReader.read_value(ClassificationInterpreter.classification_interpreter_key)
    if classification_interpreter == ClassificationInterpreter.classification_interpreter_custom1:
        return ClassificationInterpreterCustom1.evaluate_output(data_frame, col_name_categories=col_name_categories, col_name_outputs=col_name_outputs)
    elif classification_interpreter == ClassificationInterpreter.classification_interpreter_custom2:
        return ClassificationInterpreterCustom2.evaluate_output(data_frame, col_name_categories=col_name_categories, col_name_outputs=col_name_outputs)
    else:
        SessionLogger.log('Tried to evaluate classification. Specified ClassificationInterpreter is not supported.', log_type='error')
        return 0
def run_config_tests(run_import=0, run_preprocessing=0, run_vectorization=0, config_ids=None, resume_at_idx=0):
    if config_ids is None:
        config_ids = SessionConfigBuilder.create_session_configs()
    n_configs = len(config_ids)
    idx = resume_at_idx
    while idx < len(config_ids):
        config_id = config_ids[idx]
        ConfigReader.set_session_config_id(config_id)
        SetupRunner.run_setup(run_import=run_import, run_preprocessing=run_preprocessing, run_vectorization=run_vectorization, run_classification=0)
        res = SetupRunner.run_classification_test()
        score = ClassificationInterpreter.evaluate_output(res)
        idx = idx + 1
        SessionLogger.log('Evaluated config # ' + str(idx) + ' / ' + str(n_configs) + ' . Score: ' + str(score))
    EvaluationHandler.sort()
    evaluations = EvaluationHandler.load_evaluations()
    return evaluations
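# Hypothetical driver call for the config grid search above: every generated
# session config is activated in turn, the setup pipeline and classification
# test are run, and the collected evaluations are returned after sorting.
#
# evaluations = SetupRunner.run_config_tests(run_import=1, run_preprocessing=1,
#                                            run_vectorization=1)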
def classify(data_frame, model_id=None, col_name=fv_col_name, new_col_name=class_out_col_name, storage_level=0, storage_name='', log=1):
    df = data_frame.copy()
    if model_id is None:
        model_id = ClassifierKerasNN.get_model_id()
    model = Storage.load_h5_model(model_id)
    df[new_col_name] = df.apply(lambda x: model.predict(np.asarray([x[col_name]]))[0], axis=1)
    log_text = 'Classified documents (' + str(len(df.index)) + ' entries).'
    if storage_level >= 1 and storage_name != '':
        storage_name = storage_name + ClassifierKerasNN.ext_classified
        Storage.store_pd_frame(df, storage_name)
        log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + new_col_name + '\').'
    if log:
        SessionLogger.log(log_text)
    return df
def classify(data_frame, model_id=None, col_name=fv_col_name, new_col_name=class_out_col_name, storage_level=0, storage_name='', log=1):
    classifier_type = SessionConfigReader.read_value(Classifier.classifier_key)
    if classifier_type == Classifier.classifier_keras_nn:
        return ClassifierKerasNN.classify(data_frame, model_id=model_id, col_name=col_name, new_col_name=new_col_name, storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to classify data. Specified Classifier is not supported.', log_type='error')
        return pd.DataFrame()
def train_model(data_frame, model_id=None, fv_col_name=fv_col_name, cat_v_col_name=cat_v_col_name):
    # read config params
    config_keys = list()
    config_keys.append(ClassifierKerasNN.model_id_key)
    config_keys.append(ClassifierKerasNN.epochs_key)
    config_keys.append(ClassifierKerasNN.batch_size_key)
    config = SessionConfigReader.read_values(config_keys)
    if model_id is None:
        model_id = config[0]
    epochs = config[1]
    batch_size = config[2]
    # extract vector lists from data frame
    doc_vectors = data_frame[fv_col_name].tolist()
    cat_vectors = data_frame[cat_v_col_name].tolist()
    # load the model
    model = Storage.load_h5_model(model_id)
    # train the model
    model.fit(np.asarray(doc_vectors), np.asarray(cat_vectors), epochs=epochs, batch_size=batch_size)
    # store the model
    Storage.store_h5_model(model, model_id)
    # make log entry
    SessionLogger.log('Trained keras neural network \'' + model_id + '\' with ' + str(len(data_frame.index)) + ' new entries.')
    return model_id
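# Hedged sketch of a Keras model that would be compatible with train_model and
# classify above: document feature vectors in, one softmax unit per category
# out (matching the 'softmax codomain' targets built by the interpreter). The
# layer sizes, optimizer and dimensions are assumptions, not values from the
# original project; persisting would normally go through Storage.store_h5_model.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

vector_dim = 300     # assumed dimensionality of the document feature vectors
n_categories = 9     # assumed number of target categories
model = Sequential([
    Dense(128, activation='relu', input_shape=(vector_dim,)),
    Dense(n_categories, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()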
class SessionAdmin:

    def __init__(self, server):
        self.sessionLogger = SessionLogger()
        self.server = server
        self.openSessions = {}

    def closeAll(self, error):
        self.sessionLogger.closeAllSessions(error)

    def newPostProcessSession(self, tutorId):
        session = PostProcessSession(self.server, self.sessionLogger, tutorId, POST_PROCESS)
        self.openSessions[tutorId] = session
        session.newSession()

    def newOfflineSession(self, tutorId, pupilId):
        session = OfflineQuestionSession(self.server, self.sessionLogger, tutorId, OFFLINE_QUESTION)
        self.openSessions[tutorId] = session
        session.newSession(OFFLINE_QUESTION, pupilId)

    def newSession(self, roomId, kind, status, pupilId="NULL"):
        if kind == EXTRA_IACLASS:
            session = ExtraSession(self.server, self.sessionLogger, roomId, kind)
        else:
            session = IAPASession(self.server, self.sessionLogger, roomId, kind)
        self.openSessions[roomId] = session
        session.newSession(status, pupilId)

    def closeSession(self, roomId):
        session = self.openSessions[roomId]
        session.closeSession()
        del self.openSessions[roomId]

    def hasOpenSession(self, roomId):
        return roomId in self.openSessions

    def tutorDecide(self, roomId, status=DECIDING, pupilId="NULL"):
        session = self.openSessions[roomId]
        session.tutorDecide(pupilId)

    def tutorEnter(self, roomId, status, pupilId="NULL"):
        session = self.openSessions[roomId]
        session.tutorEnter(status, pupilId)

    def tutorReject(self, roomId, status, pupilId="NULL"):
        session = self.openSessions[roomId]
        session.tutorReject(status, pupilId)

    def tutorEnd(self, roomId, status, pupilId="NULL"):
        session = self.openSessions[roomId]
        if session.tutorEnd(status, pupilId):
            del self.openSessions[roomId]

    def tutorQuit(self, roomId):
        session = self.openSessions[roomId]
        if session.tutorQuit():
            del self.openSessions[roomId]

    def pupilEnter(self, roomId, pupilId):
        session = self.openSessions[roomId]
        session.pupilEnter(pupilId)

    def pupilEnd(self, roomId, status, pupilId="NULL"):
        session = self.openSessions[roomId]
        if session.pupilEnd(status, pupilId):
            del self.openSessions[roomId]

    def changeSessionKind(self, roomId, kind, pupilId=None):
        self.closeSession(roomId)
        if pupilId is not None:
            self.newSession(roomId, kind, IACLASS, pupilId)
        else:
            self.newSession(roomId, kind, WAITING)

    def sessionKey(self, roomId):
        print(self.openSessions)
        try:
            session = self.openSessions[roomId]
            return session.sessionKey()
        except KeyError:
            return None