def __init__(self):
    # Read all the data we will be needing. The syllable_label_list contains a list
    # of the used texts in [(syl, lbl), (syl, lbl), ...] format.
    crf_df = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_df'))
    # Load training and test set: X contains syllables and their features,
    # y contains only scansion labels per line
    X = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_X'))
    y = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_y'))
    # Load our latest CRF model
    crf_model = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_model'))

    if self.perform_pedecerto_conversion:
        # Convert the pedecerto dataframe to a syllable_label_list as required by the CRF suite
        texts = util.Create_files_list('./pickle', 'syllable_label')
        crf_df = self.convert_pedecerto_to_crf_df(texts)
        util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_df'), crf_df)

    if self.perform_convert_text_to_feature_sets:
        # Take the syllable label list and add features to each syllable that are relevant for scansion
        X, y = self.convert_text_to_feature_sets(crf_df)
        util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_X'), X)
        util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_y'), y)

    if self.perform_fit_model:
        # Fit the model if needed
        crf_model = self.fit_model(X, y)
        self.print_crf_items(crf_model)
        util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_model'), crf_model)

    if self.perform_kfold:
        # Perform k-fold cross validation to check that we are not overfitting
        result = self.kfold_model(crf_df, X, y, 5)
        util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_kfold_result'), result)
        print(result)

    if self.custom_predict:
        # Predict a custom sentence. NB: this has to be syllabified by the user.
        # custom_sentence = "li to ra mul tum il le et ter ris iac ta tus et al to"
        custom_sentence = "ar ma vi rum que ca no troi ae qui pri mus ab or is"
        self.predict_custom_sentence(crf_model, custom_sentence)

    if self.perform_grid_search:
        # Does what it says on the tin
        self.grid_search(X, y)

    if self.perform_prediction_df:
        # Create a simple prediction dataframe used by the frontend to quickly load results
        self.create_prediction_df(X, y)

    if self.perform_experiments:
        self.run_experiments()
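# --- Hedged sketch (not from the pipeline above): sklearn_crfsuite expects X as a
# list of lines, each line a list of per-syllable feature dicts, with y holding the
# matching label lists. The real features live in convert_text_to_feature_sets;
# this hypothetical syllable_to_features only illustrates the expected shape.
def syllable_to_features(line, i):
    syllable = line[i][0]  # line is a [(syllable, label), ...] list
    features = {
        'syllable': syllable,
        'last_char': syllable[-1],
        'position': i,
        'BOL': i == 0,              # beginning of line
        'EOL': i == len(line) - 1,  # end of line
    }
    if i > 0:
        features['prev_syllable'] = line[i - 1][0]
    return features

# X = [[syllable_to_features(line, i) for i in range(len(line))] for line in crf_df]
# y = [[label for _, label in line] for line in crf_df]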
def create_prediction_df(self, X, y):
    # Creates a dataframe with predictions. Used by OSCC (for now)
    df = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'flattened_vectors'))
    crf = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'crf_model'))
    yhat = crf.predict(X)

    # DataFrame.append is deprecated; collect the rows first and build the frame in one go
    rows = []
    for i in Bar('Processing').iter(range(len(y))):
        rows.append({'expected': y[i], 'predicted': yhat[i]})
    new_df = pd.DataFrame(rows, columns=['predicted', 'expected'])

    book_line_df = df[['book', 'line', 'syllable']]
    prediction_df = pd.concat([book_line_df, new_df], axis=1, join='inner')
    print(prediction_df)
    util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'seqlab_prediction_df'), prediction_df)
def grid_search(self, X, y):
    if int(util.cf.get('Util', 'verbose')):
        print('Starting grid search')

    # Simple holdout split (note: X[9000:], so no line is silently dropped)
    X_train = X[:9000]
    y_train = y[:9000]
    X_test = X[9000:]
    y_test = y[9000:]

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
    # Use the same metric for evaluation
    f1_scorer = metrics.make_scorer(metrics.flat_f1_score, average='weighted', labels=self.labels)
    # Search
    rs = RandomizedSearchCV(crf, params_space, cv=3, verbose=1, n_jobs=-1, n_iter=50, scoring=f1_scorer)
    rs.fit(X_train, y_train)

    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)
    print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

    sorted_labels = sorted(self.labels, key=lambda name: (name[1:], name[0]))
    crf = rs.best_estimator_
    y_pred = crf.predict(X_test)
    print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
    util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'seq_lab_rs'), rs)
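# Usage note (hedged): the pickled RandomizedSearchCV above can be reloaded later to
# inspect the search without re-fitting; best_params_, best_score_ and cv_results_
# are standard scikit-learn attributes.
# rs = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'seq_lab_rs'))
# print(rs.best_params_)   # e.g. {'c1': ..., 'c2': ...}
# print(rs.best_score_)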
def Create_prediction_df(self, df, X, y, yhat):
    # Creates a dataframe with predictions. Used by OSCC (for now)
    # DataFrame.append is deprecated; collect the rows first and build the frame in one go
    rows = []
    for i in Bar('Processing').iter(range(len(X))):
        rows.append({'expected': y[i], 'predicted': yhat[i]})
    new_df = pd.DataFrame(rows, columns=['predicted', 'expected'])

    book_line_df = df[['book', 'line', 'syllable']]
    prediction_df = pd.concat([book_line_df, new_df], axis=1, join='inner')
    print(prediction_df)
    util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'prediction_df'), prediction_df)
def create_sentence_list(self) -> list:
    # The entire Aeneid is put into a list, with one inner list per sentence.
    # Each inner list holds the label of every syllable in that line.
    df = util.Pickle_read(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'pedecerto_df'))

    # Convert the labels from int to str
    df['length'] = np.where(df['length'] == 0, 'short', df['length'])
    df['length'] = np.where(df['length'] == 1, 'long', df['length'])
    df['length'] = np.where(df['length'] == 2, 'elision', df['length'])

    all_sentences_list = []
    # Get number of books to process
    num_books = df['book'].max()

    for i in Bar('Processing').iter(range(num_books)):
        # Get only lines from this book
        current_book = i + 1
        book_df = df.loc[df['book'] == current_book]
        num_lines = book_df['line'].max()

        for j in range(num_lines):
            current_line = j + 1
            filtered_df = book_df[book_df['line'] == current_line]
            length_list = filtered_df['length'].tolist()
            all_sentences_list.append(length_list)

    util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'label_list'), all_sentences_list)
    return all_sentences_list
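# Illustration (values are made up): create_sentence_list returns one label list
# per line, e.g.
#   [['long', 'long', 'short', 'short', 'long', ...],
#    ['long', 'short', 'short', 'long', 'elision', ...],
#    ...]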
def create_hidden_transition_matrix_beta(self, observable_states, hidden_states, unique_syllables, pedecerto_df):
    # Build the emission matrix B: one row per hidden state, one column per syllable
    b_df = pd.DataFrame(columns=observable_states, index=hidden_states)
    total_syllable_count = len(pedecerto_df)

    pedecerto_df['length'] = np.where(pedecerto_df['length'] == 0, 'short', pedecerto_df['length'])
    pedecerto_df['length'] = np.where(pedecerto_df['length'] == 1, 'long', pedecerto_df['length'])
    pedecerto_df['length'] = np.where(pedecerto_df['length'] == 2, 'elision', pedecerto_df['length'])

    for syllable in unique_syllables:
        filtered_df = pedecerto_df[pedecerto_df['syllable'] == syllable]
        label_counts = filtered_df['length'].value_counts()
        for label in ('long', 'short', 'elision'):
            try:
                b_df.at[label, syllable] = label_counts[label] / total_syllable_count
            except KeyError:
                pass  # this syllable never carries this label

    b_df = b_df.fillna(0)
    util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_b'), b_df)
    return b_df
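# Hedged note: dividing by total_syllable_count gives joint probabilities
# P(state, syllable) rather than the conditional emissions P(syllable | state) of a
# textbook HMM. If conditionals are wanted, each row can be normalised afterwards:
# row_sums = b_df.sum(axis=1)
# b_df = b_df.div(row_sums.replace(0, 1), axis=0)  # guard against empty rows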
def __init__(self, path):
    # Create pandas dataframe
    column_names = ['title', 'line', 'syllable', 'length']
    df = pd.DataFrame(columns=column_names)
    # Add all entries to process to a list
    entries = util.Create_files_list(path, 'xml')
    # Process all entries added to the list
    for entry in entries:
        with open(path + entry) as fh:
            # For each text, an individual dataframe will be created and saved as pickle
            new_text_df = copy.deepcopy(df)
            pickle_name = 'syllable_label_' + entry + '.pickle'
            # Use BeautifulSoup to process the xml
            soupedEntry = BeautifulSoup(fh, 'xml')
            # Retrieve the title and author from the xml file
            text_title = str(soupedEntry.title.string)
            author = str(soupedEntry.author.string)
            # Clean the lines (done by MQDQ)
            soupedEntry = util.clean(soupedEntry('line'))

            for line in Bar('Processing {0}, {1}'.format(author, text_title)).iter(range(len(soupedEntry))):
                book_title = int(soupedEntry[line].parent.get('title'))
                if not soupedEntry[line]['name'].isdigit():
                    continue  # If the line name is not a digit, the line is uncertain, so we skip it
                # Process the entry and append the parsed line to the dataframe
                line_df = self.Process_line(soupedEntry[line], book_title)
                new_text_df = pd.concat([new_text_df, line_df], ignore_index=True)  # DataFrame.append is deprecated

            # Clean the lines that did not succeed
            new_text_df = self.clean_generated_df(new_text_df)
            util.Pickle_write(util.cf.get('Pickle', 'path'), pickle_name, new_text_df)
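# Hedged sketch of the MQDQ/pedecerto XML shape this parser assumes, inferred only
# from the tags accessed above (title, author, line elements with a name attribute,
# whose parent carries the book number in its title attribute):
# <division title="1">
#   <line name="1" ...> ... syllable markup consumed by Process_line ... </line>
#   <line name="2" ...> ... </line>
# </division>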
def __init__(self, df):
    # Read the config file for later use
    self.cf = configparser.ConfigParser()
    self.cf.read('config.ini')

    # Control flow booleans
    add_padding = False
    flatten_vector = False
    create_model = False
    test_model = True
    load_X_y = False

    if add_padding:
        # This function adds padding to every line
        print('Adding padding')
        df = self.Add_padding(df)
        util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'padded_set'), df)

    df = util.Pickle_read(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'padded_set'))
    if int(self.cf.get('Util', 'verbose')):  # cf.get returns a string, so cast before testing
        print(df)

    if flatten_vector:
        # The network wants a single vector as input, so we flatten it for every line in the text
        print('Flattening the vectors')
        df = self.Flatten_dataframe_column(df, 'vector')
        util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'flattened_vectors'), df)

    df = util.Pickle_read(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'flattened_vectors'))
    if int(self.cf.get('Util', 'verbose')):
        print(df)

    # TODO: Philippe plz continue here

    # Turn df into X and y for the neural network
    print('Loading X and y')
    X, y = self.Create_X_y(df, load_X_y)
    # print("Training data: shape={}".format(X.shape))
    # print("Training target data: shape={}".format(y.shape))

    # Encode: 2 for elision and 3 for padding (0 for short, 1 for long)
    # y[y == 2] = 1  # Don't forget, y is a numpy.ndarray
    # y[y == 3] = 1

    if create_model:
        '''
        README
        The main problem at the moment is as follows. I give a single input (ndarray),
        which is a line of 20 syllables, represented by vectors of dimension 25
        (X.shape = 500). The output I want is 20-dimensional, one for each syllable.
        The output I want is multiclass: each syllable can be short, long, elided or
        simply padding (encoded as 0, 1, 2 or 3). The problem is therefore multiclass:
        each of the 20 outputs can have one and only one label.
        If I use binary classification and encode elision and padding as 1 as well,
        it seems to work quite well (see docs/first_four_lines.txt). If I use
        categorical classification and try to predict classes, it does not work as
        intended.
        Questions: can I just do multiclass prediction, or do I need to binarize my
        labels? Do we need scaling? Can we do multi-output multiclass the way I
        implemented it?
        '''
        labels = ['short', 'long', 'elision', 'padding']
        # TODO: do we need to scale the X data? All values are between -1 and 1.
        # scaler = MinMaxScaler()
        # X_train = scaler.fit_transform(X_train)
        # X_test = scaler.transform(X_test)

        # TODO: I have four labels I want to predict. Do we need to binarize this?
        # mlb = MultiLabelBinarizer()
        # y = mlb.fit_transform(y)

        # One-hot encode output variable (for class prediction)
        # y = to_categorical(y, num_classes=4)

        # Create and evaluate the model. Uses k-fold cross validation
        model = self.Evaluate_model(X, y)
        model.save('pickle/model')

    if test_model:
        # Load if needed. Now I just create the model every time (10 epochs)
        model = models.load_model('pickle/model')
        # TODO: I can predict using binary, but I need to predict classes.
        # predict_classes is deprecated, but model.predict(X).argmax(axis=-1)
        # results in the network predicting a single int64?
        # yhat = model.predict_classes(x_new)
        # yhat = model.predict(X).argmax(axis=-1)
        # print([labels[i] for i in model.predict(X).argmax(axis=-1)])

        # This works fine for binary classification
        yhat = model.predict(X)

        # Uncomment this if you want to create a prediction dataframe
        # self.Create_prediction_df(df, X, y, yhat)

        # Predict and test the first 10 lines. Also print the similarity of predicted and expected
        for i in range(10):
            print('Expected : {0}'.format(y[i]))
            try:
                # Round to the nearest whole number (for readability)
                round_to_whole = [round(num) for num in yhat[i]]
                print('Predicted: {0}'.format(round_to_whole))
                res = self.Calculate_list_similarity(y[i], round_to_whole)
                print('Similarity score for line {0}: {1}'.format(i, res))
            except (TypeError, ValueError):
                print('Predicted: %s' % yhat[i])
            print('\n')
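# --- Hedged sketch (not part of the original pipeline): one way to phrase the
# README question above as per-syllable multiclass prediction. Assumes lines of
# 20 syllables, each a 25-dim vector (X.shape == (n, 500)), labels in {0,1,2,3}.
# All names here (build_multiclass_model) are illustrative, not from the repo.
from tensorflow.keras import layers, models as keras_models

def build_multiclass_model(syllables=20, dim=25, n_classes=4):
    model = keras_models.Sequential([
        layers.Input(shape=(syllables * dim,)),
        layers.Dense(256, activation='relu'),
        layers.Dense(syllables * n_classes),
        layers.Reshape((syllables, n_classes)),
        layers.Softmax(axis=-1),  # one distribution per syllable
    ])
    # Integer labels of shape (n, 20) work with sparse categorical crossentropy,
    # so no one-hot binarization is strictly required.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# yhat = build_multiclass_model().predict(X).argmax(axis=-1)  # (n, 20) class ids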
def __init__(self):
    # Load the pedecerto df and convert its integer labels to strings
    self.pedecerto_df = self.pedecerto_df_labels_to_str(self.pedecerto_df)

    # Create hidden state space (our labels)
    hidden_states = ['long', 'short', 'elision']

    # Create the hidden transition matrix alpha: the probability of moving from
    # one state to another, given the current state. The matrix is size (M x M),
    # where M is the number of states.
    if self.perform_sentence_transition_list:
        self.label_list = self.create_sentence_transition_list(self.pedecerto_df)
        util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'label_list'), self.label_list)

    if self.perform_matrix_alpha_creation:
        a_df = self.create_hidden_transition_matrix_alpha(hidden_states, self.label_list)
        util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_a'), a_df)

    a_df = util.Pickle_read(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_a'))
    if int(self.cf.get('Util', 'verbose')):
        print(a_df)

    # Create the matrix of observation (emission) probabilities beta: the
    # probability of each observation given a state. The matrix is size (M x O),
    # where M is the number of states and O the number of possible observations.
    unique_syllables = sorted(set(self.pedecerto_df['syllable'].tolist()))
    observable_states = unique_syllables  # Our observations are all of our unique syllables

    if self.perform_matrix_beta_creation:
        b_df = self.create_hidden_transition_matrix_beta(observable_states, hidden_states, unique_syllables, self.pedecerto_df)
        util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_b'), b_df)

    b_df = util.Pickle_read(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_b'))
    if int(self.cf.get('Util', 'verbose')):
        print(b_df)

    # custom_sentence = "li to ra mul tum il le et ter ris jac ta tus et al to"
    custom_sentence = "ar ma vi rum que ca no troi ae qui pri mus ab or is"

    # Get parameters ready for the viterbi walks
    pi = self.get_label_probabilities(self.pedecerto_df)
    a = a_df.values
    b = b_df.values

    if self.perform_viterbi_walks:
        y_true, y_pred = self.create_y(self.pedecerto_df, observable_states, pi, a, b)
        util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_y_pred'), y_pred)
        util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_y_true'), y_true)

    y_true = util.Pickle_read(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_y_true'))
    y_pred = util.Pickle_read(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_y_pred'))

    self.create_prediction_df(y_true, y_pred)

    print('##########################################################')
    print(self.get_metrics_report(y_true, y_pred))
    print('##########################################################')
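# --- Hedged sketch: a minimal log-space Viterbi decode over the matrices built
# above. Assumes pi is a length-M array of initial state probabilities, a is
# (M x M), b is (M x O), and obs is a list of column indices into b (one per
# syllable). This mirrors the textbook algorithm, not necessarily the exact
# implementation behind self.create_y.
import numpy as np

def viterbi(obs, pi, a, b):
    M, T = a.shape[0], len(obs)
    eps = 1e-12  # guard against log(0)
    delta = np.zeros((T, M))          # best log-probability ending in each state
    psi = np.zeros((T, M), dtype=int)  # backpointers
    delta[0] = np.log(pi + eps) + np.log(b[:, obs[0]] + eps)
    for t in range(1, T):
        trans = delta[t - 1][:, None] + np.log(a + eps)
        psi[t] = trans.argmax(axis=0)
        delta[t] = trans.max(axis=0) + np.log(b[:, obs[t]] + eps)
    # Backtrack from the most probable final state
    path = [delta[-1].argmax()]
    for t in range(T - 1, 0, -1):
        path.append(psi[t][path[-1]])
    return path[::-1]  # hidden-state indices, e.g. into hidden_states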
def run_experiments(self):
    create_models = True
    crf_exp_bar_dict = {'exp1': {}, 'exp2': {}, 'exp3': {}, 'exp4': {}, 'exp5': {}}

    def result_to_entry(result):
        # Collect the precision and recall scores we plot per experiment
        return {
            'short_precision': result['short']['precision'],
            'short_recall': result['short']['recall'],
            'long_precision': result['long']['precision'],
            'long_recall': result['long']['recall'],
            'elision_precision': result['elision']['precision'],
            'elision_recall': result['elision']['recall'],
        }

    # Experiment 1: train on Virgil, test on Virgil
    if create_models:
        texts = ['syllable_label_VERG-aene.xml.pickle']
        crf_df = self.convert_pedecerto_to_crf_df(texts)
        X, y = self.convert_text_to_feature_sets(crf_df)
        util.Pickle_write(util.cf.get('Pickle', 'path'), 'crf_exp1_X.pickle', X)
        util.Pickle_write(util.cf.get('Pickle', 'path'), 'crf_exp1_y.pickle', y)
    else:
        X = util.Pickle_read(util.cf.get('Pickle', 'path'), 'crf_exp1_X.pickle')
        y = util.Pickle_read(util.cf.get('Pickle', 'path'), 'crf_exp1_y.pickle')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
    crf_model = self.fit_model(X_train, y_train)
    result = self.predict_model(crf_model, X_test, y_test)
    print('exp1')
    crf_exp_bar_dict['exp1'] = result_to_entry(result)

    # Experiment 4: test the Virgil model on Hercules Furens
    herc_df = pd.read_csv('HercFur.csv')
    util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'test'), herc_df)
    herc_df = self.convert_pedecerto_to_crf_df(['test.pickle'])
    X_herc, y_herc = self.convert_text_to_feature_sets(herc_df)
    result = self.predict_model(crf_model, X_herc, y_herc)
    print('exp4')
    crf_exp_bar_dict['exp4'] = result_to_entry(result)

    # Experiment 2: train on Virgil, Ovid, Iuvenal and Lucretius, test on the Aeneid
    # (the Aeneid test set from experiment 1 is reused on purpose)
    texts = util.Create_files_list('./pickle', 'syllable_label')
    crf_df = self.convert_pedecerto_to_crf_df(texts)
    X, y = self.convert_text_to_feature_sets(crf_df)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
    crf_model = self.fit_model(X_train, y_train)
    result = self.predict_model(crf_model, X_test, y_test)
    print('exp2')
    crf_exp_bar_dict['exp2'] = result_to_entry(result)

    # Experiment 3: train on Virgil, Ovid, Iuvenal and Lucretius, test on all
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
    crf_model = self.fit_model(X_train, y_train)
    result = self.predict_model(crf_model, X_test, y_test)
    print('exp3')
    crf_exp_bar_dict['exp3'] = result_to_entry(result)

    util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'test'), crf_exp_bar_dict)

    # Experiment 5: test the combined model on Hercules Furens
    result = self.predict_model(crf_model, X_herc, y_herc)
    print('exp5')
    crf_exp_bar_dict['exp5'] = result_to_entry(result)

    pd.DataFrame(crf_exp_bar_dict).T.plot(kind='bar')
    plt.legend(loc='lower left')
    plt.ylim([0.5, 1])
    plt.savefig('./result.png')
    plt.show()
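# Usage note (hedged): the pickled crf_exp_bar_dict can be re-plotted later without
# re-running the experiments (note that it is stored under the same 'test' pickle
# key that temporarily held the Hercules Furens dataframe):
# d = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'test'))
# pd.DataFrame(d).T.plot(kind='bar')
# plt.ylim([0.5, 1])
# plt.show()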
# Parameters to run each step
run_preprocessor = False
run_pedecerto = True
run_model_generator = False
add_embeddings_to_df = False
run_neural_network = False

'''
Run the preprocessor on the given text if needed. This reads the text, cleans it
and, for now, returns a list of syllables. To achieve this, the pedecerto tool is used.
'''
if run_preprocessor:
    print('Running preprocessor')
    preprocessor = Text_preprocessor(util.cf.get('Text', 'name'))
    util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'char_list'), preprocessor.character_list)

# Load the preprocessed text
character_list = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'char_list'))
if int(util.cf.get('Util', 'verbose')):
    print(character_list)

'''
Now create a dataframe containing: syllable, length, vector.
'''
if run_pedecerto:
    print('Running pedecerto parser')
    # This creates pickle files for all texts that are in the ./texts/ folder
    parse = Pedecerto_parser(util.cf.get('Pedecerto', 'path_texts'))

pedecerto_df = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'pedecerto_df'))
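# Hedged sketch of the config.ini layout this script assumes; the section and key
# names come from the cf.get calls above, the values are placeholders only:
#
# [Util]
# verbose = 1
#
# [Text]
# name = <text name>
#
# [Pedecerto]
# path_texts = ./texts/
#
# [Pickle]
# path = ./pickle/
# char_list = char_list.pickle
# pedecerto_df = pedecerto_df.pickle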
def create_hidden_transition_matrix_alpha(self, hidden_states, label_list):
    # Fill the hidden transition matrix with the relative frequency of every
    # state-to-state transition observed in the label list
    a_df = pd.DataFrame(columns=hidden_states, index=hidden_states)

    # Count every adjacent (state, next state) pair
    pair_counts = {(s1, s2): 0 for s1 in hidden_states for s2 in hidden_states}
    total_count = 0

    for sentence in label_list:
        syllable_count = len(sentence)
        for idx in range(syllable_count - 1):
            pair = (sentence[idx], sentence[idx + 1])
            if pair not in pair_counts:
                raise Exception('unknown transition found: {0}'.format(pair))
            pair_counts[pair] += 1
        total_count += syllable_count - 1

    for state in hidden_states:
        a_df.loc[state] = [pair_counts[(state, s2)] / total_count for s2 in hidden_states]

    util.Pickle_write(self.cf.get('Pickle', 'path'), self.cf.get('Pickle', 'hmm_a'), a_df)
    return a_df
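# Hedged note: as computed above, a_df holds pair frequencies over all transitions,
# so each row sums to the marginal probability of its first state rather than to 1.
# Row-normalising yields the usual conditional transition matrix:
# a_df = a_df.div(a_df.sum(axis=1), axis=0)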