Example no. 1
    def __init__(self):

        # Read all the data we will need. The syllable_label_list holds the texts as [(syl, lbl), (syl, lbl), ...] pairs.
        crf_df = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                  util.cf.get('Pickle', 'crf_df'))
        # Load training and test set: X contains syllables and their features, y contains only scansion labels per line
        X = util.Pickle_read(util.cf.get('Pickle', 'path'),
                             util.cf.get('Pickle', 'crf_X'))
        y = util.Pickle_read(util.cf.get('Pickle', 'path'),
                             util.cf.get('Pickle', 'crf_y'))
        # Load our latest CRF model
        crf_model = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                     util.cf.get('Pickle', 'crf_model'))

        if self.perform_pedecerto_conversion:
            # Converts the pedecerto dataframe to the syllable_label_list format required by the CRF suite
            texts = util.Create_files_list('./pickle', 'syllable_label')
            crf_df = self.convert_pedecerto_to_crf_df(texts)
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              util.cf.get('Pickle', 'crf_df'), crf_df)

        if self.perform_convert_text_to_feature_sets:
            # Takes the syllable label list and adds features to each syllable that are relevant for scansion
            X, y = self.convert_text_to_feature_sets(crf_df)
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              util.cf.get('Pickle', 'crf_X'), X)
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              util.cf.get('Pickle', 'crf_y'), y)

        if self.perform_fit_model:
            # Fit the model if needed
            crf_model = self.fit_model(X, y)
            self.print_crf_items(crf_model)
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              util.cf.get('Pickle', 'crf_model'), crf_model)

        if self.perform_kfold:
            # Perform k-fold cross-validation to check for overfitting
            result = self.kfold_model(crf_df, X, y, 5)
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              util.cf.get('Pickle', 'crf_kfold_result'),
                              result)
            print(result)

        if self.custom_predict:
            # Predict a custom sentence. NB: this has to be syllabified by the user
            # custom_sentence = "li to ra mul tum il le et ter ris iac ta tus et al to"
            custom_sentence = "ar ma vi rum que ca no troi ae qui pri mus ab or is"
            self.predict_custom_sentence(crf_model, custom_sentence)

        if self.perform_grid_search:
            # Does what it says on the tin
            self.grid_search(X, y)

        if self.perform_prediction_df:
            # Creates a simple prediction dataframe used by the frontend to quickly load results
            self.create_prediction_df(X, y)

        if self.perform_experiments:
            self.run_experiments()
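
For reference, a minimal sketch of the data shapes this pipeline passes around, assuming sklearn-crfsuite (the suite whose metrics module appears in the examples below); the feature keys are illustrative, not the ones convert_text_to_feature_sets actually emits:

import sklearn_crfsuite

# X: one list per line, one feature dict per syllable (keys here are hypothetical)
X = [[{'syllable': 'ar', 'position': 0}, {'syllable': 'ma', 'position': 1}]]
# y: the matching scansion label per syllable
y = [['long', 'short']]

crf = sklearn_crfsuite.CRF(algorithm='lbfgs', max_iterations=100)
crf.fit(X, y)
print(crf.predict(X))  # e.g. [['long', 'short']]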
Example no. 2
    def create_prediction_df(self, X, y):
        # Creates a dataframe with predictions. Used by OSCC (for now)
        df = util.Pickle_read(util.cf.get('Pickle', 'path'),
                              util.cf.get('Pickle', 'flattened_vectors'))
        crf = util.Pickle_read(util.cf.get('Pickle', 'path'),
                               util.cf.get('Pickle', 'crf_model'))

        yhat = crf.predict(X)

        column_names = ["predicted", "expected"]
        new_df = pd.DataFrame(columns=column_names)

        for i in Bar('Processing').iter(range(len(y))):
            new_line = {'expected': y[i], 'predicted': yhat[i]}
            new_df = new_df.append(new_line, ignore_index=True)

        book_line_df = df[['book', 'line', 'syllable']]

        prediction_df = pd.concat([book_line_df, new_df], axis=1, join='inner')

        print(prediction_df)

        util.Pickle_write(util.cf.get('Pickle', 'path'),
                          util.cf.get('Pickle', 'seqlab_prediction_df'),
                          prediction_df)
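
A toy illustration of the pd.concat call above: with axis=1 and join='inner' the two frames are aligned on their shared index, and rows present in only one frame are dropped.

import pandas as pd

a = pd.DataFrame({'book': [1, 1, 1], 'line': [1, 2, 3]})
b = pd.DataFrame({'predicted': ['long', 'short']})  # one row fewer
print(pd.concat([a, b], axis=1, join='inner'))
#    book  line predicted
# 0     1     1      long
# 1     1     2     short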
Example no. 3
    def convert_pedecerto_to_crf_df(self, texts) -> list:
        if int(util.cf.get('Util', 'verbose')):
            print('Creating the sentence list')
        # Find all our texts and convert them to a CRF dataframe.
        # This puts the entire text into a list, with one nested list per sentence.
        # Each sentence list holds tuples of a syllable and its length (the label).

        # Create a list to store all texts in
        all_sentences_list = []

        for text in texts:
            df = util.Pickle_read(util.cf.get('Pickle', 'path'), text)
            # Convert the integer labels to string labels
            df = self.convert_syllable_labels(df)

            for title_index in Bar('Converting Pedecerto to CRF').iter(
                    range(df['title'].max())):
                # Get only the lines from this title
                title_df = df.loc[df['title'] == title_index + 1]
                # Per book, process the lines
                for line_index in range(title_df['line'].max()):
                    line_df = title_df[title_df["line"] == line_index + 1]

                    length_list = line_df['length'].to_numpy()
                    syllable_list = line_df['syllable'].to_numpy()
                    # Join them into a 2D array and transpose it to get the correct CRF format:
                    combined_list = np.array((syllable_list, length_list)).T
                    # Append all to the list which we will return later
                    all_sentences_list.append(combined_list)

        return all_sentences_list
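
For illustration, the per-line structure this returns, shown on an invented toy line:

import numpy as np

syllable_list = np.array(['ar', 'ma', 'vi'])
length_list = np.array(['long', 'short', 'short'])
print(np.array((syllable_list, length_list)).T)
# [['ar' 'long']
#  ['ma' 'short']
#  ['vi' 'short']]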
Example no. 4
    def create_sentence_list(self) -> list:

        df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'pedecerto_df'))
        # The entire Aeneid is put into a list, with one nested list per sentence.
        # Each sentence list holds the length label of every syllable (the syllables themselves are commented out below).

        # Convert the labels from int to str
        df['length'] = np.where(df['length'] == 0, 'short', df['length'])
        df['length'] = np.where(df['length'] == 1, 'long', df['length'])
        df['length'] = np.where(df['length'] == 2, 'elision', df['length'])

        all_sentences_list = []

        # Get number of books to process
        num_books = df['book'].max()

        # for i in range(num_books):
        for i in Bar('Processing').iter(range(num_books)):
            # Get only lines from this book
            current_book = i + 1
            book_df = df.loc[df['book'] == current_book]

            num_lines = book_df['line'].max()

            for j in range(num_lines):
                current_line = j + 1

                filtered_df = book_df[book_df["line"] == current_line]

                length_list = filtered_df['length'].tolist()
                # syllable_list = filtered_df['syllable'].tolist()

                # combined_list = [(syllable_list[i], length_list[i]) for i in range(0, len(length_list))]

                all_sentences_list.append(length_list)

        util.Pickle_write(self.cf.get('Pickle', 'path'),
                          self.cf.get('Pickle', 'label_list'),
                          all_sentences_list)

        return all_sentences_list
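
The nested book/line loops above can also be expressed as a single groupby; a minimal sketch of the equivalent list construction, on toy data with the same column names:

import pandas as pd

df = pd.DataFrame({'book': [1, 1, 1], 'line': [1, 1, 2],
                   'length': ['long', 'short', 'long']})
label_lists = df.groupby(['book', 'line'], sort=True)['length'].apply(list).tolist()
print(label_lists)  # [['long', 'short'], ['long']]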
Example no. 5
def convert_pedecerto_to_crf_array() -> list:

    if int(util.cf.get('Util', 'verbose')): print('Creating the sentence list')

    df = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'pedecerto_df'))
    # The entire Aeneid is put into a list, with one nested list per sentence.
    # Each sentence list holds the syllables of that line (the length labels are commented out below).

    # Convert the labels from int to str
    df['length'] = np.where(df['length'] == 0, 'short', df['length'])
    df['length'] = np.where(df['length'] == 1, 'long', df['length'])
    df['length'] = np.where(df['length'] == 2, 'elision', df['length'])

    all_sentences_list = []

    # Get number of books to process
    num_books = df['book'].max()

    # for i in range(num_books):
    for i in Bar('Processing').iter(range(num_books)):
        # Get only lines from this book
        current_book = i + 1
        book_df = df.loc[df['book'] == current_book]

        num_lines = book_df['line'].max()

        for j in range(num_lines):
            current_line = j + 1

            filtered_df = book_df[book_df["line"] == current_line]

            # length_list = filtered_df['length'].to_numpy()
            syllable_list = filtered_df['syllable'].to_numpy()
            # join them into 2d array and transpose it to get the correct crf format:
            # combined_list = np.array((syllable_list,length_list)).T
            # Append all to the list which we will return later
            all_sentences_list.append(syllable_list)

    return all_sentences_list
Example no. 6
class Base_line:

    cf = configparser.ConfigParser()
    cf.read("config.ini")

    y_true = util.Pickle_read(cf.get('Pickle', 'path'),
                              cf.get('Pickle', 'hmm_y_true'))

    def __init__(self):

        y_pred = []  # baseline: predict 'long' for every syllable
        for sentence in self.y_true:
            y_pred.append(['long'] * len(sentence))

        print(self.get_metrics_report(self.y_true, y_pred))

    def get_metrics_report(self, y_true, y_pred):
        sorted_labels = sorted(['long', 'short', 'elision'],
                               key=lambda name: (name[1:], name[0]))
        metrics_report = metrics.flat_classification_report(
            y_true, y_pred, labels=sorted_labels, digits=3)
        return metrics_report
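
A self-contained run of the same majority-class baseline on toy data, using what is presumably sklearn-crfsuite's metrics module (it provides the flat_classification_report used above):

from sklearn_crfsuite import metrics

y_true = [['long', 'short', 'short'], ['long', 'elision']]
y_pred = [['long'] * len(sentence) for sentence in y_true]
print(metrics.flat_classification_report(y_true, y_pred,
                                         labels=['long', 'short', 'elision'],
                                         digits=3))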
Example no. 7
def Get_neural_data():
    cf = configparser.ConfigParser()
    cf.read("config.ini")

    line_number = int(request.args.get('line_number'))
    book_number = int(request.args.get('book_number'))

    # with open('./pickle/X.npy', 'rb') as f:
    # X = np.load(f, allow_pickle=True)
    # with open('./pickle/y.npy', 'rb') as f:
    # y = np.load(f, allow_pickle=True)

    # model = models.load_model('./pickle/model')

    # # This works fine for binary classification
    # yhat = model.predict(X)

    # # Predict and test the first 10 lines. Also, print the similarity of predicted and expected
    # expected = y[line_number-1]

    df_nn = util.Pickle_read(cf.get('Pickle', 'path'),
                             cf.get('Pickle', 'prediction_df'))
    df_hmm = util.Pickle_read(cf.get('Pickle', 'path'),
                              cf.get('Pickle', 'hmm_prediction_df'))
    df_seqlab = util.Pickle_read(cf.get('Pickle', 'path'),
                                 cf.get('Pickle', 'seqlab_prediction_df'))

    df_expected = util.Pickle_read(cf.get('Pickle', 'path'),
                                   cf.get('Pickle', 'flattened_vectors'))

    df_expected_filtered = df_expected[(df_expected['book'] == book_number) & (
        df_expected['line'] == line_number)].reset_index()
    df_nn_filtered = df_nn[(df_nn['book'] == book_number)
                           & (df_nn['line'] == line_number)].reset_index()
    df_hmm_filtered = df_hmm[(df_hmm['book'] == book_number)
                             & (df_hmm['line'] == line_number)].reset_index()
    df_seqlab_filtered = df_seqlab[(df_seqlab['book'] == book_number) & (
        df_seqlab['line'] == line_number)].reset_index()

    # Overall information
    syllables = df_nn_filtered['syllable'][0]
    syllables = [i for i in syllables if i != 0]  # Trim padding

    expected = df_expected_filtered['length'][0]
    expected = expected[:len(syllables)]  # Trim padding

    # Neural network prediction
    predicted_nn = df_nn_filtered['predicted'][0]
    predicted_nn = predicted_nn[:len(syllables)]  # Trim padding
    predicted_nn_int = [round(num) for num in predicted_nn]

    labels_nn_predicted = [
        '—' if i == 1 else '⏑' if i == 0 else i for i in predicted_nn_int
    ]
    labels_expected = [
        '—' if i == 1 else '⏑' if i == 0 else '∅' for i in expected
    ]

    print(expected, '\n', labels_expected)

    predicted_hmm = df_hmm_filtered['predicted'][0]
    labels_hmm_predicted = [
        '—' if i == 'long' else '⏑' if i == 'short' else '∅'
        for i in predicted_hmm
    ]

    predicted_seqlab = df_seqlab_filtered['predicted'][0]
    labels_seqlab_predicted = [
        '—' if i == 'long' else '⏑' if i == 'short' else '∅'
        for i in predicted_seqlab
    ]

    correct_list_nn = calculate_list_similarity(labels_expected,
                                                labels_nn_predicted)
    correct_list_hmm = calculate_list_similarity(labels_expected,
                                                 labels_hmm_predicted)
    correct_list_seqlab = calculate_list_similarity(labels_expected,
                                                    labels_seqlab_predicted)

    # Dirty hack for Angular
    labels_hmm_predicted.append('HMM')
    labels_nn_predicted.append('NN')
    labels_seqlab_predicted.append('SeqLab')
    labels_expected.append('Expected')
    syllables.append('Syllables')

    result = {
        "syllables": syllables,
        "expected": list(labels_expected),
        "nn_predicted": list(labels_nn_predicted),
        "hmm_predicted": list(labels_hmm_predicted),
        "seqlab_predicted": list(labels_seqlab_predicted),
        "correct_list_nn": correct_list_nn,
        "correct_list_hmm": correct_list_hmm,
        "correct_list_seqlab": correct_list_seqlab,
        "length": len(syllables),
    }

    return jsonify(result)
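
calculate_list_similarity is called above but not shown in this snippet; a plausible stand-in (an assumption, not the project's actual implementation) that marks each position by element-wise agreement:

def calculate_list_similarity(expected, predicted):
    # True where the prediction matches the expected label at that position
    return [e == p for e, p in zip(expected, predicted)]

print(calculate_list_similarity(['long', 'short'], ['long', 'long']))  # [True, False]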
Example no. 8
        for j in range(num_lines):
            current_line = j + 1

            filtered_df = book_df[book_df["line"] == current_line]

            # length_list = filtered_df['length'].to_numpy()
            syllable_list = filtered_df['syllable'].to_numpy()
            # join them into 2d array and transpose it to get the correct crf format:
            # combined_list = np.array((syllable_list,length_list)).T
            # Append all to the list which we will return later
            all_sentences_list.append(syllable_list)

    return all_sentences_list

pedecerto_df = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'pedecerto_df'))

# array = convert_pedecerto_to_crf_array()
# util.Pickle_write(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'test'), array)
array = util.Pickle_read(util.cf.get('Pickle', 'path'), util.cf.get('Pickle', 'test'))

text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])

vocab_data = [["luuk", "frank", "kaas", "pizza"],["earth", "wind", "and", "fire"]]

encoder = tf.keras.layers.TextVectorization(vocabulary=vocab_data)
# Now call an adapt for every item in the dataset
# encoder.adapt(text_dataset.batch(64))


exit(0)
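
A minimal sketch of the adapt-based alternative hinted at in the comments above: instead of passing a fixed vocabulary, let the layer learn one from the dataset.

import tensorflow as tf

text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
encoder = tf.keras.layers.TextVectorization()
encoder.adapt(text_dataset.batch(64))
print(encoder.get_vocabulary())  # includes '', '[UNK]' and the learned tokens
print(encoder(["foo baz"]))      # token ids from the learned vocabulary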
Example no. 9
    def __init__(self, df):
        # Read the config file for later use
        self.cf = configparser.ConfigParser()
        self.cf.read("config.ini")

        # Control flow booleans
        add_padding = False  #True
        flatten_vector = False
        create_model = False
        test_model = True

        load_X_y = False

        # This functions add padding to every line
        if add_padding:
            print('Adding padding')
            df = self.Add_padding(df)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'padded_set'), df)

        df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'padded_set'))
        if int(self.cf.get('Util', 'verbose')): print(df)

        if flatten_vector:
            # The network wants a single vector as input, so we flatten it for every line in the text
            print('Flattening the vectors')
            df = self.Flatten_dataframe_column(df, 'vector')
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'flattened_vectors'), df)
        df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'flattened_vectors'))
        if int(self.cf.get('Util', 'verbose')): print(df)

        ####
        # TODO: Philippe plz continue here
        ####

        # Turn df into X and y for neural network
        print('Loading X and y')
        X, y = self.Create_X_y(df, load_X_y)
        # print("Training data: shape={}".format(X.shape))
        # print("Training target data: shape={}".format(y.shape))

        # Encode: 2 for elision and 3 for padding (0 for short, 1 for long)
        # y[y == 2] = 1 # Don't forget: y is a numpy.ndarray
        # y[y == 3] = 1

        if create_model:
            '''
            README
            The main problem at the moment is as follows. I give a single input (ndarray): a line of 20 syllables, represented
            by vectors of dimension 25 (X.shape = 500). The output I want is 20-dimensional, one value per syllable, and it is
            multiclass: each syllable can be short, long, elided or simply padding (encoded as 0, 1, 2 or 3). Each of the 20
            outputs can therefore have one and only one label. If I use binary classification and encode elision and padding as 1
            as well, it seems to work quite well (see docs/first_four_lines.txt). If I use categorical classification and try to
            predict classes, it does not work as intended. Questions: can I just do multiclass prediction, or do I need to
            binarize my labels? Do we need scaling? Can we do multi-output multi-class the way I implemented it?
            '''

            labels = ['short', 'long', 'elision', 'padding']

            # TODO: do we need to scale the X data? All values are between -1 and 1.
            # scaler = MinMaxScaler()
            # X_train = scaler.fit_transform(X_train)
            # X_test = scaler.transform(X_test)

            # TODO: I have four labels I want to predict; do we need to binarize them?
            # mlb = MultiLabelBinarizer()
            # y = mlb.fit_transform(y)

            # one hot encode output variable (for class prediction)
            # y = to_categorical(y, num_classes=4)

            # Create and evaluate the model. Uses k-fold cross validation
            model = self.Evaluate_model(X, y)

            model.save('pickle/model')

        if test_model:
            # Load if needed. Now I just create the model every time (10 epochs)
            model = models.load_model('pickle/model')

            # TODO: I can predict using binary, but I need to predict classes; predict_classes is deprecated.
            # However, the model.predict(X).argmax(axis=-1) results in the network predicting a single int64?
            # yhat = model.predict_classes(x_new)
            # yhat = model.predict(X).argmax(axis=-1) #model.predict_classes(X)
            # print([labels[i] for i in model.predict(X).argmax(axis=-1)])

            # This works fine for binary classification
            yhat = model.predict(X)

            ### Uncomment this if you want to create a prediction dataframe
            # self.Create_prediction_df(df, X, y, yhat)

            # Predict and test the first 10 lines. Also, print the similarity of predicted and expected
            for i in range(10):

                print('Expected : {0}'.format(y[i]))

                try:
                    # Round the number to the next whole number (for readability)
                    round_to_whole = [round(num) for num in yhat[i]]
                    print('Predicted: {0}'.format(round_to_whole))
                    res = self.Calculate_list_similarity(y[i], round_to_whole)
                    print('Similarity score for line {0}: {1}'.format(i, res))
                except Exception:
                    print('Predicted: %s' % yhat[i])

                print('\n')
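
A minimal sketch of the categorical framing the README above asks about (an assumed architecture, not the project's actual model): 20 syllable slots, each emitting a softmax over the 4 labels, trained against one-hot targets.

import numpy as np
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

num_syllables, vec_dim, num_classes = 20, 25, 4
model = models.Sequential([
    layers.Input(shape=(num_syllables * vec_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_syllables * num_classes),
    layers.Reshape((num_syllables, num_classes)),
    layers.Activation('softmax'),  # one distribution per syllable slot
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

X = np.random.uniform(-1, 1, (8, num_syllables * vec_dim))  # dummy data
y = to_categorical(np.random.randint(num_classes, size=(8, num_syllables)),
                   num_classes=num_classes)
model.fit(X, y, epochs=1, verbose=0)
pred = model.predict(X).argmax(axis=-1)  # shape (8, 20): one class per syllable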
Example no. 10
    def __init__(self):

        # Load the pedecerto df and convert its integer labels to strings
        self.pedecerto_df = self.pedecerto_df_labels_to_str(self.pedecerto_df)

        # Create hidden state space (our labels)
        hidden_states = ['long', 'short', 'elision']

        # create hidden transition matrix alpha
        # this is the transition probability matrix of changing states given a state
        # matrix is size (M x M) where M is number of states

        # create state space and initial state probabilities
        if self.perform_sentence_transition_list:
            self.label_list = self.create_sentence_transition_list(
                self.pedecerto_df)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'label_list'),
                              self.label_list)

        if self.perform_matrix_alpha_creation:
            a_df = self.create_hidden_transition_matrix_alpha(
                hidden_states, self.label_list)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_a'), a_df)

        a_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_a'))
        if int(self.cf.get('Util', 'verbose')): print(a_df)

        # create matrix of observation (emission) probabilities beta.
        # this holds the observation probabilities given state.
        # matrix is size (M x O) where M is number of states
        # and O is number of different possible observations.
        unique_syllables = sorted(set(self.pedecerto_df['syllable'].tolist()))
        observable_states = unique_syllables  # Our observations are all of our unique syllables

        if self.perform_matrix_beta_creation:
            b_df = self.create_hidden_transition_matrix_beta(
                observable_states, hidden_states, unique_syllables,
                self.pedecerto_df)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_b'), b_df)

        b_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_b'))
        if int(self.cf.get('Util', 'verbose')): print(b_df)

        custom_sentence = "li to ra mul tum il le et ter ris jac ta tus et al to"
        custom_sentence = "ar ma vi rum que ca no troi ae qui pri mus ab or is"

        # Get parameters ready for the viterbi walks
        pi = self.get_label_probabilities(self.pedecerto_df)
        a = a_df.values
        b = b_df.values

        if self.perform_viterbi_walks:
            y_true, y_pred = self.create_y(self.pedecerto_df,
                                           observable_states, pi, a, b)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_y_pred'), y_pred)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_y_true'), y_true)

        y_true = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                  self.cf.get('Pickle', 'hmm_y_true'))
        y_pred = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                  self.cf.get('Pickle', 'hmm_y_pred'))

        self.create_prediction_df(y_true, y_pred)

        print('##########################################################')
        print(self.get_metrics_report(y_true, y_pred))
        print('##########################################################')
Example no. 11
class Hidden_markov_model:

    cf = configparser.ConfigParser()
    cf.read("config.ini")

    perform_sentence_transition_list = False
    perform_matrix_alpha_creation = False
    perform_matrix_beta_creation = False
    perform_viterbi_walks = False

    pedecerto_df = util.Pickle_read(cf.get('Pickle', 'path'),
                                    cf.get('Pickle', 'pedecerto_df'))
    label_list = util.Pickle_read(cf.get('Pickle', 'path'),
                                  cf.get('Pickle', 'label_list'))

    def __init__(self):

        # Load the pedecerto df and convert its integer labels to strings
        self.pedecerto_df = self.pedecerto_df_labels_to_str(self.pedecerto_df)

        # Create hidden state space (our labels)
        hidden_states = ['long', 'short', 'elision']

        # create hidden transition matrix alpha
        # this is the transition probability matrix of changing states given a state
        # matrix is size (M x M) where M is number of states

        # create state space and initial state probabilities
        if self.perform_sentence_transition_list:
            self.label_list = self.create_sentence_transition_list(
                self.pedecerto_df)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'label_list'),
                              self.label_list)

        if self.perform_matrix_alpha_creation:
            a_df = self.create_hidden_transition_matrix_alpha(
                hidden_states, self.label_list)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_a'), a_df)

        a_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_a'))
        if int(self.cf.get('Util', 'verbose')): print(a_df)

        # create matrix of observation (emission) probabilities beta.
        # this holds the observation probabilities given state.
        # matrix is size (M x O) where M is number of states
        # and O is number of different possible observations.
        unique_syllables = sorted(set(self.pedecerto_df['syllable'].tolist()))
        observable_states = unique_syllables  # Our observations are all of our unique syllables

        if self.perform_matrix_beta_creation:
            b_df = self.create_hidden_transition_matrix_beta(
                observable_states, hidden_states, unique_syllables,
                self.pedecerto_df)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_b'), b_df)

        b_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_b'))
        if int(self.cf.get('Util', 'verbose')): print(b_df)

        custom_sentence = "li to ra mul tum il le et ter ris jac ta tus et al to"
        custom_sentence = "ar ma vi rum que ca no troi ae qui pri mus ab or is"

        # Get parameters ready for the viterbi walks
        pi = self.get_label_probabilities(self.pedecerto_df)
        a = a_df.values
        b = b_df.values

        if self.perform_viterbi_walks:
            y_true, y_pred = self.create_y(self.pedecerto_df,
                                           observable_states, pi, a, b)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_y_pred'), y_pred)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_y_true'), y_true)

        y_true = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                  self.cf.get('Pickle', 'hmm_y_true'))
        y_pred = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                  self.cf.get('Pickle', 'hmm_y_pred'))

        self.create_prediction_df(y_true, y_pred)

        print('##########################################################')
        print(self.get_metrics_report(y_true, y_pred))
        print('##########################################################')

    def create_prediction_df(self, y, yhat):
        # Creates a dataframe with predictions. Used by OSCC (for now)
        df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'flattened_vectors'))

        column_names = ["predicted", "expected"]

        # DataFrame.append is deprecated; collect rows in a list and build the
        # frame in one go instead
        rows = []
        for i in Bar('Processing').iter(range(len(y))):
            rows.append({'expected': y[i], 'predicted': yhat[i]})
        new_df = pd.DataFrame(rows, columns=column_names)

        book_line_df = df[['book', 'line', 'syllable']]

        prediction_df = pd.concat([book_line_df, new_df], axis=1, join='inner')

        print(prediction_df)

        util.Pickle_write(self.cf.get('Pickle', 'path'),
                          self.cf.get('Pickle', 'hmm_prediction_df'),
                          prediction_df)

    def create_y(self, pedecerto_df, observable_states, pi, a, b):
        y_pred = []
        y_true = []
        # Get number of books to process
        num_books = pedecerto_df['book'].max()
        for i in Bar('Processing').iter(range(num_books)):
            # Get only lines from this book
            current_book = i + 1
            book_df = pedecerto_df.loc[pedecerto_df['book'] == current_book]

            num_lines = book_df['line'].max()

            for j in range(num_lines):  # now process all lines using the HMM
                current_line = j + 1
                filtered_df = book_df[book_df["line"] == current_line]

                true_label_list = filtered_df['length'].tolist()
                syllable_list = filtered_df['syllable'].tolist()

                if syllable_list:  #FIXME: this is bad
                    result = self.predict_single_sentence(
                        syllable_list, observable_states, pi, a, b)
                    y_pred.append(result['Best_Path'].tolist())
                    y_true.append(true_label_list)

        return y_true, y_pred

    def get_metrics_report(self, y_true, y_pred):
        sorted_labels = sorted(['long', 'short', 'elision'],
                               key=lambda name: (name[1:], name[0]))
        metrics_report = metrics.flat_classification_report(
            y_true, y_pred, labels=sorted_labels, digits=3)
        return metrics_report

    def predict_single_sentence(self, syllable_list, observable_states, pi, a,
                                b):
        debug_mode = False

        sentence_array = np.array([])

        for syllable in syllable_list:  # Convert the syllable list to observable_state indices
            sentence_array = np.append(sentence_array,
                                       observable_states.index(syllable))

        obs = sentence_array.astype(int)

        obs_map = {}
        for state in observable_states:
            obs_map[state] = observable_states.index(state)

        inv_obs_map = dict((v, k) for k, v in obs_map.items())
        obs_seq = [inv_obs_map[v] for v in list(obs)]

        # Sequence of observations (and their codes)
        if debug_mode:
            print(
                pd.DataFrame(np.column_stack([obs, obs_seq]),
                             columns=['Obs_code', 'Obs_seq']))

        # Do the viterbi walk
        path, delta, phi = self.viterbi(pi, a, b, obs)

        if debug_mode:
            print('\nsingle best state path: \n', path)
            print('delta:\n', delta)
            print('phi:\n', phi)

        state_map = {0: 'long', 1: 'short', 2: 'elision'}
        state_path = [state_map[v] for v in path]

        result = ((pd.DataFrame().assign(Observation=obs_seq).assign(
            Best_Path=state_path)))

        if debug_mode: print(result)

        return result

    def pedecerto_df_labels_to_str(self, df):
        df['length'] = np.where(df['length'] == 0, 'short', df['length'])
        df['length'] = np.where(df['length'] == 1, 'long', df['length'])
        df['length'] = np.where(df['length'] == 2, 'elision', df['length'])
        return df

    # Viterbi algorithm for the most likely hidden state path
    # Code adapted from Stephen Marsland's Machine Learning: An Algorithmic Perspective, 2nd ed.
    # https://github.com/alexsosn/MarslandMLAlgo/blob/master/Ch16/HMM.py
    def viterbi(self, pi, a, b, obs):

        debug_mode = False

        nStates = np.shape(b)[0]
        T = np.shape(obs)[0]

        # init blank path
        path = np.zeros(T, dtype=int)
        # delta --> highest probability of any path that reaches state i
        delta = np.zeros((nStates, T))
        # phi --> argmax by time step for each state
        phi = np.zeros((nStates, T))

        # init delta and phi
        delta[:, 0] = pi * b[:, obs[0]]
        phi[:, 0] = 0

        if debug_mode: print('\nStart Walk Forward\n')
        # the forward algorithm extension
        for t in range(1, T):
            for s in range(nStates):
                delta[s, t] = np.max(delta[:, t - 1] * a[:, s]) * b[s, obs[t]]
                phi[s, t] = np.argmax(delta[:, t - 1] * a[:, s])
                if debug_mode:
                    print('s={s} and t={t}: phi[{s}, {t}] = {phi}'.format(
                        s=s, t=t, phi=phi[s, t]))

        # find optimal path
        if debug_mode: print('-' * 50)
        if debug_mode: print('Start Backtrace\n')
        path[T - 1] = np.argmax(delta[:, T - 1])
        for t in range(T - 2, -1, -1):
            path[t] = int(phi[path[t + 1], t + 1])
            if debug_mode: print('path[{}] = {}'.format(t, path[t]))

        return path, delta, phi

    def create_sentence_transition_list(self, df) -> list:

        all_sentences_list = []
        # Get number of books to process
        num_books = df['book'].max()
        for i in Bar('Processing').iter(range(num_books)):
            # Get only lines from this book
            current_book = i + 1
            book_df = df.loc[df['book'] == current_book]

            num_lines = book_df['line'].max()

            for j in range(num_lines):
                current_line = j + 1
                filtered_df = book_df[book_df["line"] == current_line]
                length_list = filtered_df['length'].tolist()
                all_sentences_list.append(length_list)

        return all_sentences_list

    def create_hidden_transition_matrix_alpha(self, hidden_states, label_list):
        # Now we are going to fill the hidden transition matrix
        a_df = pd.DataFrame(columns=hidden_states, index=hidden_states)

        ll = 0
        ls = 0
        le = 0
        sl = 0
        ss = 0
        se = 0
        el = 0
        es = 0
        ee = 0

        total_count = 0

        for sentence in label_list:

            syllable_count = len(sentence)

            for idx, syllable in enumerate(sentence):

                if idx + 1 < syllable_count:

                    item1 = sentence[idx]
                    item2 = sentence[idx + 1]

                    if item1 == 'long' and item2 == 'long': ll += 1
                    elif item1 == 'long' and item2 == 'short': ls += 1
                    elif item1 == 'long' and item2 == 'elision': le += 1
                    elif item1 == 'short' and item2 == 'long': sl += 1
                    elif item1 == 'short' and item2 == 'short': ss += 1
                    elif item1 == 'short' and item2 == 'elision': se += 1
                    elif item1 == 'elision' and item2 == 'long': el += 1
                    elif item1 == 'elision' and item2 == 'short': es += 1
                    elif item1 == 'elision' and item2 == 'elision': ee += 1
                    else:
                        raise Exception('unknown transition found')

                else:
                    break

            total_count += syllable_count - 1

            # print(syllable_count)
            # exit(0)

        prob_ll = ll / total_count
        prob_ls = ls / total_count
        prob_le = le / total_count
        prob_sl = sl / total_count
        prob_ss = ss / total_count
        prob_se = se / total_count
        prob_el = el / total_count
        prob_es = es / total_count
        prob_ee = ee / total_count

        a_df.loc[hidden_states[0]] = [prob_ll, prob_ls, prob_le]
        a_df.loc[hidden_states[1]] = [prob_sl, prob_ss, prob_se]
        a_df.loc[hidden_states[2]] = [prob_el, prob_es, prob_ee]

        return a_df

    def create_hidden_transition_matrix_beta(self, observable_states,
                                             hidden_states, unique_syllables,
                                             pedecerto_df):

        b_df = pd.DataFrame(columns=observable_states, index=hidden_states)

        total_syllable_count = len(pedecerto_df)

        for syllable in unique_syllables:
            filtered_df = pedecerto_df[pedecerto_df["syllable"] == syllable]

            # Avoid shadowing the built-in filter()
            counts = filtered_df['length'].value_counts()

            # A label absent for this syllable raises KeyError; leave the cell
            # NaN (it is filled with 0 below)
            try:
                b_df.at['long',
                        syllable] = counts['long'] / total_syllable_count
            except KeyError:
                pass
            try:
                b_df.at['short',
                        syllable] = counts['short'] / total_syllable_count
            except KeyError:
                pass
            try:
                b_df.at['elision',
                        syllable] = counts['elision'] / total_syllable_count
            except KeyError:
                pass

        b_df = b_df.fillna(0)

        return b_df

    def get_label_probabilities(self, pedecerto_df):

        # Avoid shadowing the built-in filter()
        counts = pedecerto_df['length'].value_counts()

        long = counts['long'] / len(pedecerto_df)
        short = counts['short'] / len(pedecerto_df)
        elision = counts['elision'] / len(pedecerto_df)

        # Return the initial probability of each hidden state
        return [long, short, elision]
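
A self-contained toy run of the viterbi method above: two hidden states, three observation symbols, all probabilities invented for illustration.

import numpy as np

pi = np.array([0.6, 0.4])          # initial state probabilities
a = np.array([[0.7, 0.3],          # 2 x 2 transition matrix
              [0.4, 0.6]])
b = np.array([[0.5, 0.4, 0.1],     # 2 states x 3 observation symbols
              [0.1, 0.3, 0.6]])
obs = np.array([0, 1, 2])          # an encoded observation sequence

hmm = Hidden_markov_model.__new__(Hidden_markov_model)  # skip __init__'s side effects
path, delta, phi = hmm.viterbi(pi, a, b, obs)
print(path)  # [0 0 1]: the most likely hidden state sequence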
Example no. 12
    def run_experiments(self):
        create_models = True

        crf_exp_bar_dict = {
            'exp1': {},
            'exp2': {},
            'exp3': {},
            'exp4': {},
            'exp5': {}
        }

        # Experiment 1: Create model on Virgil, test on Virgil
        if create_models:
            texts = ['syllable_label_VERG-aene.xml.pickle']
            crf_df = self.convert_pedecerto_to_crf_df(texts)
            X, y = self.convert_text_to_feature_sets(crf_df)
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              'crf_exp1_X.pickle', X)
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              'crf_exp1_y.pickle', y)
        else:
            X = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                 'crf_exp1_X.pickle')
            y = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                 'crf_exp1_y.pickle')

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            shuffle=True)
        crf_model = self.fit_model(X_train, y_train)
        result = self.predict_model(crf_model, X_test, y_test)
        print('exp1')

        crf_exp_bar_dict['exp1'] = {
            'short_precision': result['short']['precision'],
            'short_recall': result['short']['recall'],
            'long_precision': result['long']['precision'],
            'long_recall': result['long']['recall'],
            'elision_precision': result['elision']['precision'],
            'elision_recall': result['elision']['recall'],
        }

        # Experiment 4: Test Virgil on Hercules Furens
        herc_df = pd.read_csv('HercFur.csv')
        util.Pickle_write(util.cf.get('Pickle', 'path'),
                          util.cf.get('Pickle', 'test'), herc_df)
        herc_df = self.convert_pedecerto_to_crf_df(['test.pickle'])
        X_herc, y_herc = self.convert_text_to_feature_sets(herc_df)
        result = self.predict_model(crf_model, X_herc, y_herc)
        print('exp4')

        crf_exp_bar_dict['exp4'] = {
            'short_precision': result['short']['precision'],
            'short_recall': result['short']['recall'],
            'long_precision': result['long']['precision'],
            'long_recall': result['long']['recall'],
            'elision_precision': result['elision']['precision'],
            'elision_recall': result['elision']['recall'],
        }

        # Experiment 2: create model on Virgil, Ovid, Iuvenal and Lucretius, test on the Aeneid test set
        texts = util.Create_files_list('./pickle', 'syllable_label')
        crf_df = self.convert_pedecerto_to_crf_df(texts)
        X, y = self.convert_text_to_feature_sets(crf_df)
        X_train, _, y_train, _ = train_test_split(X,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  shuffle=True)
        crf_model = self.fit_model(X_train, y_train)
        result = self.predict_model(crf_model, X_test, y_test)
        print('exp2')

        crf_exp_bar_dict['exp2'] = {
            'short_precision': result['short']['precision'],
            'short_recall': result['short']['recall'],
            'long_precision': result['long']['precision'],
            'long_recall': result['long']['recall'],
            'elision_precision': result['elision']['precision'],
            'elision_recall': result['elision']['recall'],
        }

        # Experiment 3: create model on Virgil, Ovid, Iuvenal and Lucretius, test on all texts
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            shuffle=True)
        crf_model = self.fit_model(X_train, y_train)
        result = self.predict_model(crf_model, X_test, y_test)
        print('exp3')

        crf_exp_bar_dict['exp3'] = {
            'short_precision': result['short']['precision'],
            'short_recall': result['short']['recall'],
            'long_precision': result['long']['precision'],
            'long_recall': result['long']['recall'],
            'elision_precision': result['elision']['precision'],
            'elision_recall': result['elision']['recall'],
        }

        util.Pickle_write(util.cf.get('Pickle', 'path'),
                          util.cf.get('Pickle', 'test'), crf_exp_bar_dict)

        # Experiment 5: test the model from experiment 3 on Hercules Furens
        # herc_df = pd.read_csv('HercFur.csv')
        # crf_df = self.convert_pedecerto_to_crf_df(['test.pickle'])
        # X, y = self.convert_text_to_feature_sets(herc_df)
        result = self.predict_model(crf_model, X_herc, y_herc)
        print('exp5')
        crf_exp_bar_dict['exp5'] = {
            'short_precision': result['short']['precision'],
            'short_recall': result['short']['recall'],
            'long_precision': result['long']['precision'],
            'long_recall': result['long']['recall'],
            'elision_precision': result['elision']['precision'],
            'elision_recall': result['elision']['recall'],
        }

        pd.DataFrame(crf_exp_bar_dict).T.plot(kind='bar')
        plt.legend(loc='lower left')
        plt.ylim([0.5, 1])
        plt.savefig('./result.png')
        plt.show()
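
The per-experiment result dict above is built five times with identical keys; a small helper (a sketch, not part of the original code) would remove the repetition:

def summarize(result):
    # Flatten the nested report into the {label}_{metric} keys used above
    return {f'{label}_{metric}': result[label][metric]
            for label in ('short', 'long', 'elision')
            for metric in ('precision', 'recall')}

# e.g. crf_exp_bar_dict['exp1'] = summarize(result)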
Example no. 13
run_model_generator = False
add_embeddings_to_df = False
run_neural_network = False
''' Run the preprocessor on the given text if needed.
This reads the text, cleans it and, for now, returns a list of syllables.
To achieve this, the pedecerto tool is used.
'''
if run_preprocessor:
    print('Running preprocessor')
    preprocessor = Text_preprocessor(util.cf.get('Text', 'name'))
    util.Pickle_write(util.cf.get('Pickle', 'path'),
                      util.cf.get('Pickle', 'char_list'),
                      preprocessor.character_list)

# Load the preprocessed text
character_list = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                  util.cf.get('Pickle', 'char_list'))
if int(util.cf.get('Util', 'verbose')): print(character_list)
''' Now create a dataframe containing: syllable, length, vector.
'''
if run_pedecerto:
    print('Running pedecerto parser')
    parse = Pedecerto_parser(util.cf.get('Pedecerto', 'path_texts'))
    # This creates pickle files for all texts in the ./texts/ folder

pedecerto_df = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                util.cf.get('Pickle', 'pedecerto_df'))
if int(util.cf.get('Util', 'verbose')): print(pedecerto_df)

# Run the model generator on the given list if needed
if run_model_generator:
    print('Running Word2Vec model generator')
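
The util.cf lookups throughout these examples assume a config.ini along these lines; the section and key names are inferred from the calls above, and the values are placeholders:

import configparser

cf = configparser.ConfigParser()
cf.read_string("""
[Util]
verbose = 1

[Text]
name = aeneid.txt

[Pedecerto]
path_texts = ./texts/

[Pickle]
path = ./pickle/
char_list = char_list.pickle
pedecerto_df = pedecerto_df.pickle
""")
print(cf.get('Pickle', 'path'))  # ./pickle/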
Example no. 14
    def __init__(self):

        self.pedecerto_df = self.pedecerto_df_labels_to_str(self.pedecerto_df)

        # create state space and initial state probabilities
        # label_list = create_sentence_list()

        # create hidden state space (our labels)
        hidden_states = ['long', 'short', 'elision']

        # create hidden transition matrix
        # a or alpha
        #   = transition probability matrix of changing states given a state
        # matrix is size (M x M) where M is number of states
        # a_df = create_hidden_transition_matrix_alpha(hidden_states)
        a_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_a'))
        print(a_df)

        # create matrix of observation (emission) probabilities
        # b or beta = observation probabilities given state
        # matrix is size (M x O) where M is number of states
        # and O is number of different possible observations
        unique_syllables = sorted(set(self.pedecerto_df['syllable'].tolist()))
        observable_states = unique_syllables

        # b_df = create_hidden_transition_matrix_beta(observable_states, hidden_states)
        b_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_b'))
        print(b_df)

        # Observation sequence: a syllabified line, encoded numerically below
        # (the "dog's behaviors" wording this was adapted from is gone)
        # custom_sentence = "ar ma vi rum que ca no troi ae qui pri mus ab or is"
        custom_sentence = "li to ra mul tum il le et ter ris jac ta tus et al to"

        sentence_array = np.array([])

        for syllable in custom_sentence.split():
            sentence_array = np.append(sentence_array,
                                       observable_states.index(syllable))

        obs = sentence_array.astype(int)

        obs_map = {}
        for state in observable_states:
            # print(state, observable_states.index(state))
            obs_map[state] = observable_states.index(state)

        inv_obs_map = dict((v, k) for k, v in obs_map.items())
        obs_seq = [inv_obs_map[v] for v in list(obs)]

        # Sequence of observations (and their codes)
        print(
            pd.DataFrame(np.column_stack([obs, obs_seq]),
                         columns=['Obs_code', 'Obs_seq']))

        pi = self.get_label_probabilities()

        a = a_df.values
        b = b_df.values

        path, delta, phi = self.viterbi(pi, a, b, obs)
        # print('\nsingle best state path: \n', path)
        # print('delta:\n', delta)
        # print('phi:\n', phi)

        # exit(0)

        state_map = {0: 'long', 1: 'short', 2: 'elision'}
        state_path = [state_map[v] for v in path]

        print((pd.DataFrame().assign(Observation=obs_seq).assign(
            Best_Path=state_path)))
Example no. 15
class Hidden_markov_model:

    cf = configparser.ConfigParser()
    cf.read("config.ini")

    pedecerto_df = util.Pickle_read(cf.get('Pickle', 'path'),
                                    cf.get('Pickle', 'pedecerto_df'))
    label_list = util.Pickle_read(cf.get('Pickle', 'path'),
                                  cf.get('Pickle', 'label_list'))

    def __init__(self):

        self.pedecerto_df = self.pedecerto_df_labels_to_str(self.pedecerto_df)

        # create state space and initial state probabilities
        # label_list = create_sentence_list()

        # create hidden state space (our labels)
        hidden_states = ['long', 'short', 'elision']

        # create hidden transition matrix
        # a or alpha
        #   = transition probability matrix of changing states given a state
        # matrix is size (M x M) where M is number of states
        # a_df = create_hidden_transition_matrix_alpha(hidden_states)
        a_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_a'))
        print(a_df)

        # create matrix of observation (emission) probabilities
        # b or beta = observation probabilities given state
        # matrix is size (M x O) where M is number of states
        # and O is number of different possible observations
        unique_syllables = sorted(set(self.pedecerto_df['syllable'].tolist()))
        observable_states = unique_syllables

        # b_df = create_hidden_transition_matrix_beta(observable_states, hidden_states)
        b_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_b'))
        print(b_df)

        # Observation sequence: a syllabified line, encoded numerically below
        # (the "dog's behaviors" wording this was adapted from is gone)
        # custom_sentence = "ar ma vi rum que ca no troi ae qui pri mus ab or is"
        custom_sentence = "li to ra mul tum il le et ter ris jac ta tus et al to"

        sentence_array = np.array([])

        for syllable in custom_sentence.split():
            sentence_array = np.append(sentence_array,
                                       observable_states.index(syllable))

        obs = sentence_array.astype(int)

        obs_map = {}
        for state in observable_states:
            # print(state, observable_states.index(state))
            obs_map[state] = observable_states.index(state)

        inv_obs_map = dict((v, k) for k, v in obs_map.items())
        obs_seq = [inv_obs_map[v] for v in list(obs)]

        # Sequence of observations (and their codes)
        print(
            pd.DataFrame(np.column_stack([obs, obs_seq]),
                         columns=['Obs_code', 'Obs_seq']))

        pi = self.get_label_probabilities()

        a = a_df.values
        b = b_df.values

        path, delta, phi = self.viterbi(pi, a, b, obs)
        # print('\nsingle best state path: \n', path)
        # print('delta:\n', delta)
        # print('phi:\n', phi)

        # exit(0)

        state_map = {0: 'long', 1: 'short', 2: 'elision'}
        state_path = [state_map[v] for v in path]

        print((pd.DataFrame().assign(Observation=obs_seq).assign(
            Best_Path=state_path)))

    def pedecerto_df_labels_to_str(self, df):
        df['length'] = np.where(df['length'] == 0, 'short', df['length'])
        df['length'] = np.where(df['length'] == 1, 'long', df['length'])
        df['length'] = np.where(df['length'] == 2, 'elision', df['length'])

        return df

    # Viterbi algorithm for the most likely hidden state path
    # Code adapted from Stephen Marsland's Machine Learning: An Algorithmic Perspective, 2nd ed.
    # https://github.com/alexsosn/MarslandMLAlgo/blob/master/Ch16/HMM.py
    def viterbi(self, pi, a, b, obs):

        nStates = np.shape(b)[0]
        T = np.shape(obs)[0]

        # init blank path
        path = np.zeros(T, dtype=int)
        # delta --> highest probability of any path that reaches state i
        delta = np.zeros((nStates, T))
        # phi --> argmax by time step for each state
        phi = np.zeros((nStates, T))

        # init delta and phi
        delta[:, 0] = pi * b[:, obs[0]]
        phi[:, 0] = 0

        print('\nStart Walk Forward\n')
        # the forward algorithm extension
        for t in range(1, T):
            for s in range(nStates):
                delta[s, t] = np.max(delta[:, t - 1] * a[:, s]) * b[s, obs[t]]
                phi[s, t] = np.argmax(delta[:, t - 1] * a[:, s])
                print('s={s} and t={t}: phi[{s}, {t}] = {phi}'.format(
                    s=s, t=t, phi=phi[s, t]))

        # find optimal path
        print('-' * 50)
        print('Start Backtrace\n')
        path[T - 1] = np.argmax(delta[:, T - 1])
        for t in range(T - 2, -1, -1):
            path[t] = int(phi[path[t + 1], t + 1])
            print('path[{}] = {}'.format(t, path[t]))

        return path, delta, phi

    def create_sentence_list(self) -> list:

        df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'pedecerto_df'))
        # The entire Aeneid is put into a list, with one nested list per sentence.
        # Each sentence list holds the length label of every syllable.

        # Convert the labels from int to str
        df['length'] = np.where(df['length'] == 0, 'short', df['length'])
        df['length'] = np.where(df['length'] == 1, 'long', df['length'])
        df['length'] = np.where(df['length'] == 2, 'elision', df['length'])

        all_sentences_list = []

        # Get number of books to process
        num_books = df['book'].max()

        # for i in range(num_books):
        for i in Bar('Processing').iter(range(num_books)):
            # Get only lines from this book
            current_book = i + 1
            book_df = df.loc[df['book'] == current_book]

            num_lines = book_df['line'].max()

            for j in range(num_lines):
                current_line = j + 1

                filtered_df = book_df[book_df["line"] == current_line]

                length_list = filtered_df['length'].tolist()
                # syllable_list = filtered_df['syllable'].tolist()

                # combined_list = [(syllable_list[i], length_list[i]) for i in range(0, len(length_list))]

                all_sentences_list.append(length_list)

        util.Pickle_write(self.cf.get('Pickle', 'path'),
                          self.cf.get('Pickle', 'label_list'),
                          all_sentences_list)

        return all_sentences_list

    def create_hidden_transition_matrix_alpha(self, hidden_states):
        # Now we are going to fill the hidden transition matrix
        a_df = pd.DataFrame(columns=hidden_states, index=hidden_states)

        ll = 0
        ls = 0
        le = 0
        sl = 0
        ss = 0
        se = 0
        el = 0
        es = 0
        ee = 0

        total_count = 0

        for sentence in self.label_list:

            syllable_count = len(sentence)

            for idx, syllable in enumerate(sentence):

                if idx + 1 < syllable_count:

                    item1 = sentence[idx]
                    item2 = sentence[idx + 1]

                    if item1 == 'long' and item2 == 'long': ll += 1
                    elif item1 == 'long' and item2 == 'short': ls += 1
                    elif item1 == 'long' and item2 == 'elision': le += 1
                    elif item1 == 'short' and item2 == 'long': sl += 1
                    elif item1 == 'short' and item2 == 'short': ss += 1
                    elif item1 == 'short' and item2 == 'elision': se += 1
                    elif item1 == 'elision' and item2 == 'long': el += 1
                    elif item1 == 'elision' and item2 == 'short': es += 1
                    elif item1 == 'elision' and item2 == 'elision': ee += 1
                    else:
                        raise Exception('unknown transition found')

                else:
                    break

            total_count += syllable_count - 1

            # print(syllable_count)
            # exit(0)

        prob_ll = ll / total_count
        prob_ls = ls / total_count
        prob_le = le / total_count
        prob_sl = sl / total_count
        prob_ss = ss / total_count
        prob_se = se / total_count
        prob_el = el / total_count
        prob_es = es / total_count
        prob_ee = ee / total_count

        a_df.loc[hidden_states[0]] = [prob_ll, prob_ls, prob_le]
        a_df.loc[hidden_states[1]] = [prob_sl, prob_ss, prob_se]
        a_df.loc[hidden_states[2]] = [prob_el, prob_es, prob_ee]

        util.Pickle_write(self.cf.get('Pickle', 'path'),
                          self.cf.get('Pickle', 'hmm_a'), a_df)

        return a_df

    def create_hidden_transition_matrix_beta(self, observable_states,
                                             hidden_states):

        b_df = pd.DataFrame(columns=observable_states, index=hidden_states)

        total_syllable_count = len(self.pedecerto_df)

        # Make sure the integer labels have been converted to strings
        pedecerto_df = self.pedecerto_df_labels_to_str(self.pedecerto_df)

        for syllable in observable_states:
            filtered_df = pedecerto_df[pedecerto_df["syllable"] == syllable]

            # Avoid shadowing the built-in filter()
            counts = filtered_df['length'].value_counts()

            # A label absent for this syllable raises KeyError; leave the cell
            # NaN (it is filled with 0 below)
            try:
                b_df.at['long',
                        syllable] = counts['long'] / total_syllable_count
            except KeyError:
                pass
            try:
                b_df.at['short',
                        syllable] = counts['short'] / total_syllable_count
            except KeyError:
                pass
            try:
                b_df.at['elision',
                        syllable] = counts['elision'] / total_syllable_count
            except KeyError:
                pass

        b_df = b_df.fillna(0)
        util.Pickle_write(self.cf.get('Pickle', 'path'),
                          self.cf.get('Pickle', 'hmm_b'), b_df)

        return b_df

    def get_label_probabilities(self):
        # Make sure the integer labels have been converted to strings
        pedecerto_df = self.pedecerto_df_labels_to_str(self.pedecerto_df)

        counts = pedecerto_df['length'].value_counts()

        long = counts['long'] / len(pedecerto_df)
        short = counts['short'] / len(pedecerto_df)
        elision = counts['elision'] / len(pedecerto_df)

        # Return the probabilities of each hidden state
        return [long, short, elision]
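
Note that create_hidden_transition_matrix_alpha above divides every transition count by the global total, which yields joint rather than conditional probabilities, so the rows of a_df do not sum to 1. A sketch of a row-normalized variant (an alternative formulation, not the original behavior):

import pandas as pd

def transition_matrix(label_list, states=('long', 'short', 'elision')):
    counts = pd.DataFrame(0.0, index=list(states), columns=list(states))
    for sentence in label_list:
        for first, second in zip(sentence, sentence[1:]):
            counts.at[first, second] += 1
    # Divide each row by its own total so that P(next | current) sums to 1 per state
    return counts.div(counts.sum(axis=1), axis=0).fillna(0)

print(transition_matrix([['long', 'short', 'short', 'long']]))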