def create_session_configs(configs_location=None, delete_old_configs=1):
    if configs_location is None:
        configs_location = ConfigReader.get_configs_location()
    if delete_old_configs:
        Storage.delete_location(configs_location)
    configs = [SessionConfigReader.get_config_template()]
    configs = SessionConfigBuilderCustom1.add_all_config_info(configs)
    n_configs = len(configs)
    SessionLogger.log('Constructed ' + str(n_configs) + ' new session configs from template: \'' + ConfigReader.get_config_template_id() + '\'.')
    config_ids = list()
    for idx, conf in enumerate(configs):
        config_id = configs_location + '/' + SessionConfigBuilderCustom1.config_name + str(idx + 1)
        SessionConfigReader.set_config(conf, config_id)
        config_ids.append(config_id)
    SessionLogger.log('Stored ' + str(n_configs) + ' session configs in \'' + configs_location + '\'.')
    return config_ids
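# Hypothetical usage sketch for create_session_configs, assuming an
# initialized session whose config template already exists (all names come
# from the function above):
config_ids = create_session_configs(delete_old_configs=1)
print(str(len(config_ids)) + ' configs created, first id: ' + config_ids[0])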
def remove_stopwords(data_frame, custom_stop_words=None, download_live_stopwords=0, col_name=col_name, storage_level=0, storage_name='', log=1):
    stopwordremover = SessionConfigReader.read_value(StopWordRemover.stopwordremover_key)
    if custom_stop_words is None:
        custom_stop_words = list()
    # config-defined stopwords are appended to whatever the caller passed in
    custom_stop_words.extend(SessionConfigReader.read_value(StopWordRemover.custom_sw_key))
    if stopwordremover == StopWordRemover.stopwordremover_custom:
        return StopWordRemoverCustom.remove_stopwords(data_frame, custom_stop_words=custom_stop_words,
                                                      download_live_stopwords=download_live_stopwords,
                                                      col_name=col_name, storage_level=storage_level,
                                                      storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to remove stopwords from documents. Specified Stopword Remover not supported.', log_type='error')
        return pd.DataFrame()
def run_classification_test():
    corpus_id = SessionConfigReader.read_value(SetupRunner.corpus_id_key)
    vectorized_df_id = corpus_id + SetupRunner.ext_vectorized
    train_df_id = vectorized_df_id + SetupRunner.ext_train
    test_df_id = vectorized_df_id + SetupRunner.ext_test
    Storage.delete_pd_frame(train_df_id)
    Storage.delete_pd_frame(test_df_id)
    Storage.delete_h5_model(SessionConfigReader.read_value(SetupRunner.keras_nn_model_id_key))
    vectorized_df = Storage.load_pd_frame(vectorized_df_id)
    TrainTestSplitter.split_train_test(identifier=vectorized_df_id, data_frame=vectorized_df)
    train = Storage.load_pd_frame(train_df_id)
    test = Storage.load_pd_frame(test_df_id)
    train_classification_outs = ClassificationInterpreter.create_out_vectors(train)
    Classifier.create_model(train_classification_outs)
    test_classified = Classifier.classify(test)
    test_interpreted = ClassificationInterpreter.interpret_output(test_classified)
    score = ClassificationInterpreter.evaluate_output(test_interpreted)
    EvaluationHandler.add_evaluation(score)
    return test_interpreted
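# Hypothetical end-to-end run, assuming a prepared session config; run_setup
# builds the vectorized artifacts that run_classification_test then deletes
# and recreates for a fresh train/test evaluation:
SetupRunner.run_setup(run_import=1, run_preprocessing=1,
                      run_vectorization=1, run_classification=1)
test_interpreted = run_classification_test()
print(test_interpreted.head())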
def split_train_test(identifier=None, data_frame=None):
    if data_frame is None:
        data_frame = Storage.load_pd_frame(identifier)
    split_ratio = SessionConfigReader.read_value(TrainTestSplitterCustom1.split_ratio_key)
    if split_ratio > 1:
        split_ratio = 1
    random_state = SessionConfigReader.read_value(TrainTestSplitterCustom1.random_state_key)
    if isinstance(random_state, int):
        train = data_frame.sample(frac=split_ratio, random_state=random_state)
    else:
        train = data_frame.sample(frac=split_ratio)
    test = data_frame.drop(train.index)
    if identifier is None:
        identifier = SessionConfigReader.read_value(TrainTestSplitterCustom1.corpus_identifier_key)
    train_name = identifier + TrainTestSplitterCustom1.ext_train
    test_name = identifier + TrainTestSplitterCustom1.ext_test
    Storage.store_pd_frame(train, train_name)
    Storage.store_pd_frame(test, test_name)
    SessionLogger.log('Split \'' + identifier + '\' (' + str(len(data_frame.index)) + ' entries) into \'' + train_name + '\' (' + str(len(train.index)) + ' entries) and \'' + test_name + '\' (' + str(len(test.index)) + ' entries).')
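# Standalone sketch of the split mechanics with plain pandas (illustration
# only; the real function reads split_ratio and random_state from the
# session config):
import pandas as pd

frame = pd.DataFrame({'text': ['a', 'b', 'c', 'd', 'e']})
train = frame.sample(frac=0.8, random_state=42)  # deterministic 80% sample
test = frame.drop(train.index)                   # complement is the test set
print(len(train), len(test))                     # -> 4 1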
def import_docs(csv_path=None):
    if csv_path is None:
        session_folder = os.path.join(TenKGnadImporter.sessions_folder, SessionConfigReader.get_session_id())
        corpus_id = SessionConfigReader.read_value(TenKGnadImporter.corpus_id_key)
        corpus_id = DiskStorageMisc.get_identifier_path(corpus_id)
        csv_path = os.path.join(session_folder, corpus_id + TenKGnadImporter.csv_ext)
    df = pd.read_csv(csv_path, sep=';', quotechar='\'', quoting=csv.QUOTE_MINIMAL, header=None,
                     names=[TenKGnadImporter.category_name, TenKGnadImporter.text_name])
    category_list = df[TenKGnadImporter.category_name].tolist()
    df[TenKGnadImporter.category_name] = df.apply(lambda x: [x[TenKGnadImporter.category_name]], axis=1)
    f_name = os.path.basename(csv_path)
    identifier = f_name.split('.')[0]
    Storage.store_pd_frame(df, identifier)
    SessionLogger.log('TenKGnad Corpus (' + str(len(df.index)) + ' entries) has been imported into \'' + identifier + '\' (columns: \'' + TenKGnadImporter.category_name + '\', \'' + TenKGnadImporter.text_name + '\').')
    category_list = list(set(category_list))
    CategoryListHandler.set_categories(category_list)
    return identifier
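# Standalone sketch of the parsing step on invented sample rows; the real
# corpus file uses the same two-column, semicolon-separated layout:
import csv
import io
import pandas as pd

sample = "Sport;Rapid gewinnt das Derby\nWeb;Neues Framework erschienen\n"
frame = pd.read_csv(io.StringIO(sample), sep=';', quotechar='\'',
                    quoting=csv.QUOTE_MINIMAL, header=None,
                    names=['category', 'text'])
# each category is wrapped in a single-element list, as in import_docs
frame['category'] = frame.apply(lambda x: [x['category']], axis=1)
print(frame)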
def set_best_performing(eval_session_id=None):
    evals = EvaluationHandler.load_evaluations(session_id=eval_session_id)
    # sort_values returns a new frame; assign it and reset the index so the
    # best-scoring row sits at label 0
    evals = evals.sort_values(by=[EvaluationHandler.score_col], ascending=False).reset_index(drop=True)
    if evals.size > 0:
        session_id = evals.at[0, EvaluationHandler.session_id_col]
        config_id = evals.at[0, EvaluationHandler.config_id_col]
        SessionConfigReader.set_best_performing_by_ids(session_id=session_id, config_id=config_id)
    else:
        SessionConfigReader.set_best_performing_by_ids()
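# Why the assignment above matters: sort_values returns a new frame rather
# than sorting in place, and .at[0, ...] is a label lookup; a standalone
# illustration:
import pandas as pd

evals = pd.DataFrame({'score': [0.2, 0.9]})
evals.sort_values(by=['score'], ascending=False)  # result discarded
print(evals.at[0, 'score'])                       # -> 0.2 (unchanged)
evals = evals.sort_values(by=['score'], ascending=False).reset_index(drop=True)
print(evals.at[0, 'score'])                       # -> 0.9 (best row at label 0)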
def compare_evaluations(session_ids=None, remove_cols=None, add_cols=None):
    if session_ids is None:
        all_evals = EvaluationHandler.load_evaluations()
    else:
        all_evals = pd.DataFrame()
        for session_id in session_ids:
            # DataFrame has no concat method; pd.concat appends the frames
            all_evals = pd.concat([all_evals, EvaluationHandler.load_evaluations(session_id=session_id)],
                                  sort=False, ignore_index=True)
    # reset the index so the positional .at[i, ...] lookups below hit the sorted rows
    all_evals = all_evals.sort_values(by=[EvaluationHandler.score_col], ascending=False).reset_index(drop=True)
    i = 0
    while i < len(all_evals):
        session_id = all_evals.at[i, EvaluationHandler.session_id_col]
        conf_id = all_evals.at[i, EvaluationHandler.config_id_col]
        conf = SessionConfigReader.get_config(session_id=session_id, config_id=conf_id)
        for key in EvaluationHandler.additional_columns:
            value = conf[key][0] if key in conf else ''
            all_evals.at[i, key] = value
        i = i + 1
    if remove_cols is not None:
        for key in remove_cols:
            if key in all_evals:
                all_evals = all_evals.drop(columns=[key])
    if add_cols is not None:
        i = 0
        while i < len(all_evals):
            session_id = all_evals.at[i, EvaluationHandler.session_id_col]
            conf_id = all_evals.at[i, EvaluationHandler.config_id_col]
            conf = SessionConfigReader.get_config(session_id=session_id, config_id=conf_id)
            for key in add_cols:
                value = conf[key][0] if key in conf else ''
                all_evals.at[i, key] = value
            i = i + 1
    return all_evals
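# pd.concat is the supported way to append frames (DataFrame.concat does
# not exist), as used above; a standalone illustration:
import pandas as pd

a = pd.DataFrame({'score': [0.7]})
b = pd.DataFrame({'score': [0.9]})
merged = pd.concat([a, b], sort=False, ignore_index=True)
print(len(merged))  # -> 2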
def create_out_vectors(data_frame, col_name=col_name_categories, new_col_name=new_col_name_cat_vec, storage_level=0, storage_name=''):
    classification_interpreter = SessionConfigReader.read_value(ClassificationInterpreter.classification_interpreter_key)
    if classification_interpreter == ClassificationInterpreter.classification_interpreter_custom1:
        return ClassificationInterpreterCustom1.create_out_vectors(data_frame, col_name=col_name, new_col_name=new_col_name,
                                                                   storage_level=storage_level, storage_name=storage_name)
    elif classification_interpreter == ClassificationInterpreter.classification_interpreter_custom2:
        return ClassificationInterpreterCustom2.create_out_vectors(data_frame, col_name=col_name, new_col_name=new_col_name,
                                                                   storage_level=storage_level, storage_name=storage_name)
    else:
        SessionLogger.log('Tried to create category vectors. Specified ClassificationInterpreter is not supported.', log_type='error')
        return pd.DataFrame()
def get_stopwords():
    stopwordremover = SessionConfigReader.read_value(StopWordRemover.stopwordremover_key)
    if stopwordremover == StopWordRemover.stopwordremover_custom:
        return StopWordRemoverCustom.get_stopwords()
    else:
        return set()
def interpret_output(data_frame, col_name=col_name_class_out, new_col_name=col_name_result, storage_level=0, storage_name='', log=1):
    classification_interpreter = SessionConfigReader.read_value(ClassificationInterpreter.classification_interpreter_key)
    if classification_interpreter == ClassificationInterpreter.classification_interpreter_custom1:
        return ClassificationInterpreterCustom1.interpret_output(data_frame, col_name=col_name, new_col_name=new_col_name,
                                                                 storage_level=storage_level, storage_name=storage_name, log=log)
    elif classification_interpreter == ClassificationInterpreter.classification_interpreter_custom2:
        return ClassificationInterpreterCustom2.interpret_output(data_frame, col_name=col_name, new_col_name=new_col_name,
                                                                 storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        SessionLogger.log('Tried to interpret output vectors. Specified ClassificationInterpreter is not supported.', log_type='error')
        return pd.DataFrame()
def normalize(data_frame, col_name=col_name, storage_level=0, storage_name='', log=1):
    lemmatizer = SessionConfigReader.read_value(Lemmatizer.lemmatizer_key)
    if lemmatizer == Lemmatizer.lemmatizer_spacy_german:
        return LemmatizerSpacyGerman.normalize(data_frame, col_name=col_name, storage_level=storage_level,
                                               storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to lemmatize documents. Specified Lemmatizer not supported.', log_type='error')
        return pd.DataFrame()
def get_word_vectors(words, model_id=''):
    vectorizer_type = SessionConfigReader.read_value(Vectorizer.vectorizer_key)
    if vectorizer_type == Vectorizer.vectorizer_gensim_w2v:
        return VectorizerGensimWord2Vec.get_word_vectors(words, model_id=model_id)
    else:
        return list()
def preprocess_texts(data_frame, col_name=col_name, new_col_name=new_col_name, storage_level=0, storage_name='', log=1):
    preprocessor_type = SessionConfigReader.read_value(TextPreprocessor.preprocessor_key)
    if preprocessor_type == TextPreprocessor.preprocessor_custom:
        return TextPreprocessorCustom.preprocess_texts(data_frame, col_name=col_name, new_col_name=new_col_name,
                                                       storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to preprocess texts. Specified Preprocessor is not supported.', log_type='error')
        return pd.DataFrame()
def read_categories(session_id):
    data_path = DiskStorageMisc.get_session_data_path(session_id)
    file_name = SessionConfigReader.read_value(DiskStorageCategoryListHandler.cat_id_key) + DiskStorageCategoryListHandler.ext_json
    categories_path = os.path.join(data_path, file_name)
    if not os.path.exists(categories_path):
        return list()
    with open(categories_path, encoding='utf8') as json_file:
        file = json.load(json_file)
    return file[DiskStorageCategoryListHandler.cat_list_key]
def get_model_id():
    classifier_type = SessionConfigReader.read_value(Classifier.classifier_key)
    if classifier_type == Classifier.classifier_keras_nn:
        return ClassifierKerasNN.get_model_id()
    else:
        SessionLogger.log('Tried to get classifier model id. Specified Classifier is not supported.', log_type='error')
        return ''
def import_docs():
    importer_type = SessionConfigReader.read_value(CorpusImporter.corpus_importer_key)
    if importer_type == CorpusImporter.tenkgnad_importer:
        return TenKGnadImporter.import_docs()
    else:
        SessionLogger.log('Tried to import corpus. Specified Corpus Importer is not supported.', log_type='error')
        return ''
def set_categories(session_id, categories):
    data = {DiskStorageCategoryListHandler.cat_list_key: list(categories)}
    data_path = DiskStorageMisc.get_session_data_path(session_id)
    file_name = SessionConfigReader.read_value(DiskStorageCategoryListHandler.cat_id_key) + DiskStorageCategoryListHandler.ext_json
    categories_path = os.path.join(data_path, file_name)
    DiskStorageMisc.create_data_folder(session_id)
    with open(categories_path, 'w+', encoding='utf8') as json_file:
        json.dump(data, json_file, ensure_ascii=False)
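# Standalone sketch of the JSON layout produced by set_categories, assuming
# the list key resolves to 'categories' (the actual key value is configured
# elsewhere and not shown in this section):
import json

data = {'categories': ['Sport', 'Web', 'Panorama']}
with open('categories.json', 'w+', encoding='utf8') as json_file:
    json.dump(data, json_file, ensure_ascii=False)
with open('categories.json', encoding='utf8') as json_file:
    print(json.load(json_file)['categories'])  # -> ['Sport', 'Web', 'Panorama']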
def split_train_test(identifier=None, data_frame=None):
    tt_splitter_type = SessionConfigReader.read_value(TrainTestSplitter.tt_splitter_key)
    if tt_splitter_type == TrainTestSplitter.tt_splitter_custom1:
        TrainTestSplitterCustom1.split_train_test(identifier=identifier, data_frame=data_frame)
    else:
        # str() guards the log message against identifier being None
        SessionLogger.log('Tried to split \'' + str(identifier) + '\' into train and test set. Specified TrainTestSplitter is not supported.', log_type='error')
def create_model(data_frame, new_model_id=None, col_name=col_name):
    vectorizer_type = SessionConfigReader.read_value(Vectorizer.vectorizer_key)
    if vectorizer_type == Vectorizer.vectorizer_gensim_w2v:
        return VectorizerGensimWord2Vec.create_model(data_frame, new_model_id=new_model_id, col_name=col_name)
    else:
        SessionLogger.log('Tried to create vector model. Specified Vectorizer is not supported.', log_type='error')
        return ''
def run_setup(run_import=1, run_preprocessing=1, run_vectorization=1, run_classification=1):
    corpus_id = SessionConfigReader.read_value(SetupRunner.corpus_id_key)
    if run_import:
        Storage.delete_session_data()
        SessionLogger.clear()
        identifier = CorpusImporter.import_docs()
        df = Storage.load_pd_frame(identifier)
        StopwordDownloaderNLTK.get_stopwords()
    else:
        df = Storage.load_pd_frame(corpus_id)
    if run_preprocessing:
        df = TextPreprocessor.preprocess_texts(df, storage_level=1, storage_name=corpus_id)
    else:
        df = Storage.load_pd_frame(corpus_id + SetupRunner.ext_preprocessed)
    if run_vectorization:
        Storage.delete_model(SessionConfigReader.read_value(SetupRunner.vec_model_id_key))
        Vectorizer.create_model(df)
        df = Vectorizer.vectorize(df, storage_level=1, storage_name=corpus_id)
    else:
        df = Storage.load_pd_frame(corpus_id + SetupRunner.ext_vectorized)
    if run_classification:
        Storage.delete_h5_model(SessionConfigReader.read_value(SetupRunner.keras_nn_model_id_key))
        df = ClassificationInterpreter.create_out_vectors(df, storage_level=1, storage_name=corpus_id)
        Classifier.create_model(df)
def get_doc_vec(word_vectors):
    # n_words is read from the dimension key: it is both the vector size
    # and the cap on how many word vectors are summed
    n_words = SessionConfigReader.read_value(WordVecToDocVecCustom.dimension_key)
    fv = np.zeros(n_words)
    idx = 0
    err = 0
    for vec in word_vectors:
        if vec is None:
            err = err + 1
        else:
            fv = fv + vec
        idx = idx + 1
        if idx == n_words:
            break
    # average over the vectors actually summed (the original divisor
    # idx + 1 - err was off by one); guard against all-None input
    n_summed = idx - err
    if n_summed == 0:
        return fv
    return fv / n_summed
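# Standalone illustration of the averaging idea: sum the available word
# vectors and divide by how many were actually summed, skipping None
# entries for out-of-vocabulary words:
import numpy as np

word_vectors = [np.array([1.0, 0.0]), None, np.array([0.0, 1.0])]
valid = [v for v in word_vectors if v is not None]
doc_vec = np.sum(valid, axis=0) / len(valid)
print(doc_vec)  # -> [0.5 0.5]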
def train_model(data_frame, model_id=None, fv_col_name=fv_col_name, cat_v_col_name=cat_v_col_name):
    classifier_type = SessionConfigReader.read_value(Classifier.classifier_key)
    if classifier_type == Classifier.classifier_keras_nn:
        return ClassifierKerasNN.train_model(data_frame, model_id=model_id, fv_col_name=fv_col_name,
                                             cat_v_col_name=cat_v_col_name)
    else:
        SessionLogger.log('Tried to train classifier model. Specified Classifier is not supported.', log_type='error')
        return ''
def remove_noise(data_frame, col_name=col_name, storage_level=0, storage_name='', log=1):
    noiseremover_type = SessionConfigReader.read_value(NoiseRemover.noiseremover_key)
    if noiseremover_type == NoiseRemover.noiseremover_custom:
        return NoiseRemoverCustom.remove_noise(data_frame, col_name=col_name, storage_level=storage_level,
                                               storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to remove noise from documents. Specified Noise Remover not supported.', log_type='error')
        return pd.DataFrame()
def evaluate_output(data_frame, col_name_categories=col_name_categories, col_name_outputs=col_name_result):
    classification_interpreter = SessionConfigReader.read_value(ClassificationInterpreter.classification_interpreter_key)
    if classification_interpreter == ClassificationInterpreter.classification_interpreter_custom1:
        return ClassificationInterpreterCustom1.evaluate_output(data_frame, col_name_categories=col_name_categories,
                                                                col_name_outputs=col_name_outputs)
    elif classification_interpreter == ClassificationInterpreter.classification_interpreter_custom2:
        return ClassificationInterpreterCustom2.evaluate_output(data_frame, col_name_categories=col_name_categories,
                                                                col_name_outputs=col_name_outputs)
    else:
        SessionLogger.log('Tried to evaluate classification. Specified ClassificationInterpreter is not supported.', log_type='error')
        return 0
class DiskStorageStopwordHandler:

    stpw_id_key = 'stopwords_identifier'
    file_name = SessionConfigReader.read_value(stpw_id_key) + '.json'
    sw_list_key = 'stopwords'

    # expects a session id
    # returns a string set of the current stopwords for the specified session
    @staticmethod
    def read_stopwords(session_id):
        data_path = DiskStorageMisc.get_session_data_path(session_id)
        stopwords_path = os.path.join(data_path, DiskStorageStopwordHandler.file_name)
        if not os.path.exists(stopwords_path):
            return set()
        with open(stopwords_path, encoding='utf8') as json_file:
            file = json.load(json_file)
        return set(file[DiskStorageStopwordHandler.sw_list_key])

    # expects a session id and a string set of stopwords
    # sets the session's stopwords
    @staticmethod
    def set_stopwords(session_id, stopwords):
        data = {DiskStorageStopwordHandler.sw_list_key: list(stopwords)}
        data_path = DiskStorageMisc.get_session_data_path(session_id)
        stopwords_path = os.path.join(data_path, DiskStorageStopwordHandler.file_name)
        DiskStorageMisc.create_data_folder(session_id)
        with open(stopwords_path, 'w+', encoding='utf8') as json_file:
            json.dump(data, json_file, ensure_ascii=False)

    # expects a session id and a string set of stopwords
    # adds stopwords to the session's stopwords
    @staticmethod
    def add_stopwords(session_id, stopwords):
        current_sw = DiskStorageStopwordHandler.read_stopwords(session_id)
        new_stopwords = current_sw.union(stopwords)
        DiskStorageStopwordHandler.set_stopwords(session_id, new_stopwords)
def get_highest_similarity(vec, word_list, word_vec_list):
    vec = vec * 2 - 1  # adjust from softmax codomain
    sim_func = SessionConfigReader.read_value(ClassificationInterpreterCustom1.similarity_function_key)
    idx = 0
    sim = 0
    highest_word = ''
    for word in word_list:
        word_vec = word_vec_list[idx]
        new_sim = 0
        if word_vec is not None:
            if sim_func == ClassificationInterpreterCustom1.sim_func_cosine:
                new_sim = cosine_similarity(np.asarray([word_vec]), [vec])[0][0]
            elif sim_func == ClassificationInterpreterCustom1.sim_func_eucl_dist:
                # a distance is lowest for the best match; invert it so that
                # higher values mean more similar, matching the cosine branch
                # (the original code picked the largest distance, i.e. the
                # least similar word)
                dist = euclidean_distances([word_vec], [vec])[0][0]
                new_sim = 1.0 / (1.0 + dist)
            if new_sim < 0:
                new_sim = new_sim * -1
        if new_sim > sim:
            sim = new_sim
            highest_word = word
        idx = idx + 1
    return highest_word
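# Simplified, standalone check of the cosine branch: after rescaling the
# softmax output to [-1, 1], the candidate whose vector is most aligned
# with it wins (the word vectors here are invented):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

out = np.array([0.9, 0.1])  # e.g. a softmax output
vec = out * 2 - 1           # rescale as in get_highest_similarity
words = {'sport': np.array([0.7, -0.6]), 'web': np.array([-0.5, 0.8])}
sims = {w: cosine_similarity([v], [vec])[0][0] for w, v in words.items()}
print(max(sims, key=sims.get))  # -> 'sport'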
def classify(data_frame, model_id=None, col_name=fv_col_name, new_col_name=class_out_col_name, storage_level=0, storage_name='', log=1):
    classifier_type = SessionConfigReader.read_value(Classifier.classifier_key)
    if classifier_type == Classifier.classifier_keras_nn:
        return ClassifierKerasNN.classify(data_frame, model_id=model_id, col_name=col_name, new_col_name=new_col_name,
                                          storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to classify data. Specified Classifier is not supported.', log_type='error')
        return pd.DataFrame()
def train_model(data_frame, model_id=None, fv_col_name=fv_col_name, cat_v_col_name=cat_v_col_name):
    # read config params
    config_keys = [ClassifierKerasNN.model_id_key, ClassifierKerasNN.epochs_key, ClassifierKerasNN.batch_size_key]
    config = SessionConfigReader.read_values(config_keys)
    if model_id is None:
        model_id = config[0]
    epochs = config[1]
    batch_size = config[2]
    # extract vector lists from data frame
    doc_vectors = data_frame[fv_col_name].tolist()
    cat_vectors = data_frame[cat_v_col_name].tolist()
    # load the model
    model = Storage.load_h5_model(model_id)
    # train the model
    model.fit(np.asarray(doc_vectors), np.asarray(cat_vectors), epochs=epochs, batch_size=batch_size)
    # store the model
    Storage.store_h5_model(model, model_id)
    # make log entry
    SessionLogger.log('Trained keras neural network \'' + model_id + '\' with ' + str(len(data_frame.index)) + ' new entries.')
    return model_id
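# Hypothetical call, assuming a vectorized training frame (as produced in
# run_classification_test) and existing config entries for model id, epochs,
# and batch size; train_df is an assumed variable, not defined above:
model_id = ClassifierKerasNN.train_model(train_df)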
def interpret_output(data_frame, col_name=col_name_class_out, new_col_name=col_name_result, storage_level=0, storage_name='', log=1):
    category_list = CategoryListHandler.read_categories()
    threshold = SessionConfigReader.read_value(ClassificationInterpreterCustom2.threshold_key)
    data_frame[new_col_name] = data_frame.apply(
        lambda x: ClassificationInterpreterCustom2.get_categories_from_vec(x[col_name], category_list, threshold),
        axis=1)
    log_text = 'Categories have been determined (' + str(len(data_frame.index)) + ' entries).'
    if storage_level >= 1 and storage_name != '':
        storage_name = storage_name + ClassificationInterpreterCustom2.ext_categorized
        Storage.store_pd_frame(data_frame, storage_name)
        log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + new_col_name + '\').'
    if log:
        SessionLogger.log(log_text)
    return data_frame
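# Standalone sketch of threshold-based selection; get_categories_from_vec
# itself is not shown in this section, so this is an assumed reading of its
# contract (keep each category whose output component reaches the threshold):
category_list = ['Sport', 'Web', 'Panorama']
out_vec = [0.81, 0.12, 0.55]
threshold = 0.5
print([c for c, v in zip(category_list, out_vec) if v >= threshold])
# -> ['Sport', 'Panorama']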
def vectorize(data_frame, model_id=None, col_name=col_name, new_col_name=new_col_name, storage_level=0, storage_name='', log=1):
    vectorizer_type = SessionConfigReader.read_value(Vectorizer.vectorizer_key)
    if vectorizer_type == Vectorizer.vectorizer_gensim_w2v:
        return VectorizerGensimWord2Vec.vectorize(data_frame, model_id, col_name, new_col_name,
                                                  storage_level=storage_level, storage_name=storage_name, log=log)
    else:
        if log:
            SessionLogger.log('Tried to vectorize texts. Specified Vectorizer is not supported.', log_type='error')
        return pd.DataFrame()