Exemple #1
0
    def __init__(self):
        """Run the enabled steps of the CRF scansion pipeline.

        Each step is switched on by a boolean attribute read from ``self``
        (``perform_pedecerto_conversion``, ``perform_convert_text_to_feature_sets``,
        ``perform_fit_model``, ``perform_kfold``, ``custom_predict``,
        ``perform_grid_search``, ``perform_prediction_df`` and
        ``perform_experiments``). They are not assigned here and must already
        exist on the instance or its class.

        Every intermediate artefact (``crf_df``, ``X``/``y``, the fitted model)
        is either regenerated and pickled, or loaded from its pickle -- never
        both. Loading a pickle that is about to be overwritten was redundant
        and made a first run depend on files that did not exist yet.
        """

        def load(key):
            # Read the pickle whose file name is stored under ``key`` in the
            # [Pickle] section of the config.
            return util.Pickle_read(util.cf.get('Pickle', 'path'),
                                    util.cf.get('Pickle', key))

        def save(key, obj):
            # Write ``obj`` to the pickle whose file name is stored under ``key``.
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              util.cf.get('Pickle', key), obj)

        # crf_df holds the used texts in [(syl, lbl), (syl, lbl), ...] format.
        if self.perform_pedecerto_conversion:
            # Convert the pedecerto dataframes to the syllable/label list
            # required by the CRF suite.
            texts = util.Create_files_list('./pickle', 'syllable_label')
            crf_df = self.convert_pedecerto_to_crf_df(texts)
            save('crf_df', crf_df)
        else:
            crf_df = load('crf_df')

        # X contains syllables and their features, y contains only the
        # scansion labels per line.
        if self.perform_convert_text_to_feature_sets:
            # Add the features relevant for scansion to every syllable.
            X, y = self.convert_text_to_feature_sets(crf_df)
            save('crf_X', X)
            save('crf_y', y)
        else:
            X = load('crf_X')
            y = load('crf_y')

        if self.perform_fit_model:
            # Fit a fresh model and persist it.
            crf_model = self.fit_model(X, y)
            self.print_crf_items(crf_model)
            save('crf_model', crf_model)
        else:
            # Fall back to the latest stored CRF model.
            crf_model = load('crf_model')

        if self.perform_kfold:
            # Perform k-fold (5 folds) to check that we are not overfitting.
            result = self.kfold_model(crf_df, X, y, 5)
            save('crf_kfold_result', result)
            print(result)

        if self.custom_predict:
            # Predict a custom sentence. NB: this has to be syllabified by the user.
            # Alternative example:
            #   "li to ra mul tum il le et ter ris iac ta tus et al to"
            custom_sentence = "ar ma vi rum que ca no troi ae qui pri mus ab or is"
            self.predict_custom_sentence(crf_model, custom_sentence)

        if self.perform_grid_search:
            # Randomised hyper-parameter search.
            self.grid_search(X, y)

        if self.perform_prediction_df:
            # Create a simple prediction dataframe used by the frontend to
            # quickly load results.
            self.create_prediction_df(X, y)

        if self.perform_experiments:
            self.run_experiments()
Exemple #2
0
    def create_prediction_df(self, X, y):
        """Build and pickle a dataframe of predicted versus expected labels.

        The stored CRF model predicts labels for ``X``. Each prediction is
        paired with the matching entry of ``y`` and joined column-wise with
        the ``book``, ``line`` and ``syllable`` columns of the stored
        ``flattened_vectors`` dataframe. Used by OSCC (for now).

        Args:
            X: feature sequences passed to ``crf.predict``.
            y: expected label sequences; one output row is made per element.
        """
        df = util.Pickle_read(util.cf.get('Pickle', 'path'),
                              util.cf.get('Pickle', 'flattened_vectors'))
        crf = util.Pickle_read(util.cf.get('Pickle', 'path'),
                               util.cf.get('Pickle', 'crf_model'))

        yhat = crf.predict(X)

        # Collect the rows first and build the frame once. Appending row by
        # row copies the whole frame every iteration, and DataFrame.append
        # no longer exists in pandas >= 2.0.
        rows = [{'expected': y[i], 'predicted': yhat[i]}
                for i in Bar('Processing').iter(range(len(y)))]
        new_df = pd.DataFrame(rows, columns=["predicted", "expected"])

        book_line_df = df[['book', 'line', 'syllable']]

        # The inner join keeps only the index values present in both frames.
        prediction_df = pd.concat([book_line_df, new_df], axis=1, join='inner')

        print(prediction_df)

        util.Pickle_write(util.cf.get('Pickle', 'path'),
                          util.cf.get('Pickle', 'seqlab_prediction_df'),
                          prediction_df)
Exemple #3
0
    def grid_search(self, X, y):
        """Run a randomised hyper-parameter search for the CRF and pickle it.

        The first 9000 samples are used for the search (3-fold CV, 50
        candidates drawn for ``c1`` and ``c2``); all remaining samples are
        used to print a classification report for the best estimator. The
        fitted ``RandomizedSearchCV`` object is pickled under ``seq_lab_rs``.

        Args:
            X: feature sequences.
            y: label sequences aligned with ``X``.
        """
        if int(util.cf.get('Util', 'verbose')): print('Starting Gridsearch')

        # Train and test must share one boundary. The former X[:9000] /
        # X[9001:] split silently dropped sample 9000 from both sets.
        split_at = 9000
        X_train, X_test = X[:split_at], X[split_at:]
        y_train, y_test = y[:split_at], y[split_at:]

        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   max_iterations=100,
                                   all_possible_transitions=True)
        params_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }
        # Use the same metric for evaluation
        f1_scorer = metrics.make_scorer(metrics.flat_f1_score,
                                        average='weighted',
                                        labels=self.labels)

        # Search
        rs = RandomizedSearchCV(crf,
                                params_space,
                                cv=3,
                                verbose=1,
                                n_jobs=-1,
                                n_iter=50,
                                scoring=f1_scorer)
        rs.fit(X_train, y_train)

        print('best params:', rs.best_params_)
        print('best CV score:', rs.best_score_)
        print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ /
                                            1000000))

        # Order the labels by everything after their first character, then
        # by the first character itself.
        sorted_labels = sorted(self.labels,
                               key=lambda name: (name[1:], name[0]))

        crf = rs.best_estimator_
        y_pred = crf.predict(X_test)
        print(
            metrics.flat_classification_report(y_test,
                                               y_pred,
                                               labels=sorted_labels,
                                               digits=3))

        util.Pickle_write(util.cf.get('Pickle', 'path'),
                          util.cf.get('Pickle', 'seq_lab_rs'), rs)
Exemple #4
0
    def Create_prediction_df(self, df, X, y, yhat):
        """Build and pickle a dataframe of predicted versus expected labels.

        Used by OSCC (for now).

        Args:
            df: dataframe providing the ``book``, ``line`` and ``syllable``
                columns that are joined to the predictions.
            X: model input; only its length is used (one row per element).
            y: expected labels, indexed like ``X``.
            yhat: predicted labels, indexed like ``X``.
        """
        # Collect the rows first and build the frame once. Appending row by
        # row copies the whole frame every iteration, and DataFrame.append
        # no longer exists in pandas >= 2.0.
        rows = [{'expected': y[i], 'predicted': yhat[i]}
                for i in Bar('Processing').iter(range(len(X)))]
        new_df = pd.DataFrame(rows, columns=["predicted", "expected"])

        book_line_df = df[['book', 'line', 'syllable']]

        # The inner join keeps only the index values present in both frames.
        prediction_df = pd.concat([book_line_df, new_df], axis=1, join='inner')

        print(prediction_df)

        util.Pickle_write(self.cf.get('Pickle', 'path'),
                          self.cf.get('Pickle', 'prediction_df'),
                          prediction_df)
Exemple #5
0
    def create_sentence_list(self) -> list:
        """Return one list of length labels per verse line and pickle it.

        The stored ``pedecerto_df`` dataframe is loaded and its integer
        ``length`` codes are turned into label strings. For every book number
        from 1 to the highest one, and every line number from 1 to the
        highest one in that book, the labels of that line are appended as one
        list. A line number with no rows yields an empty list.

        Returns:
            The list of per-line label lists, also pickled under ``label_list``.
        """
        df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'pedecerto_df'))

        # Convert the labels from int to str in a single pass. Chaining
        # np.where calls only works on an object column: on an integer column
        # the first call stringifies every code, so the later == 1 and == 2
        # comparisons never match.
        label_names = {0: 'short', 1: 'long', 2: 'elision'}
        df['length'] = [label_names.get(code, code) for code in df['length']]

        all_sentences_list = []

        # Books and lines are numbered from 1.
        num_books = df['book'].max()

        for current_book in Bar('Processing').iter(range(1, num_books + 1)):
            # Get only the lines from this book
            book_df = df.loc[df['book'] == current_book]

            num_lines = book_df['line'].max()

            for current_line in range(1, num_lines + 1):
                line_df = book_df[book_df["line"] == current_line]
                all_sentences_list.append(line_df['length'].tolist())

        util.Pickle_write(self.cf.get('Pickle', 'path'),
                          self.cf.get('Pickle', 'label_list'),
                          all_sentences_list)

        return all_sentences_list
Exemple #6
0
    def create_hidden_transition_matrix_beta(self, observable_states,
                                             hidden_states):
        """Build, pickle and return the observation matrix (beta).

        Each cell ``[label, syllable]`` holds the number of rows in
        ``pedecerto_df`` with that syllable and label, divided by the total
        number of rows. Combinations that never occur are 0.

        NOTE(review): ``pedecerto_df`` and ``unique_syllables`` are not
        parameters; they are resolved from an enclosing scope -- presumably
        module-level globals; confirm against the caller. The ``length``
        column of ``pedecerto_df`` is rewritten in place.

        Args:
            observable_states: column labels of the matrix.
            hidden_states: row labels of the matrix.

        Returns:
            The filled dataframe, also pickled under ``hmm_b``.
        """
        b_df = pd.DataFrame(columns=observable_states, index=hidden_states)

        total_syllable_count = len(pedecerto_df)

        # Convert the labels from int to str in a single pass. Chaining
        # np.where calls only works on an object column: on an integer column
        # the first call stringifies every code, so the later comparisons
        # never match.
        label_names = {0: 'short', 1: 'long', 2: 'elision'}
        pedecerto_df['length'] = [
            label_names.get(code, code) for code in pedecerto_df['length']
        ]

        for syllable in unique_syllables:
            rows_for_syllable = pedecerto_df[pedecerto_df["syllable"] ==
                                             syllable]
            label_counts = rows_for_syllable['length'].value_counts()

            # A label this syllable never carries is simply absent from the
            # counts. Test for it explicitly instead of swallowing every
            # exception, so real errors still surface.
            for label in ('long', 'short', 'elision'):
                if label in label_counts.index:
                    b_df.at[label, syllable] = (label_counts[label] /
                                                total_syllable_count)

        # Cells that were never assigned are still NaN
        b_df = b_df.fillna(0)
        util.Pickle_write(self.cf.get('Pickle', 'path'),
                          self.cf.get('Pickle', 'hmm_b'), b_df)

        return b_df
Exemple #7
0
    def __init__(self, path):
        """Parse every xml text found under ``path`` into a pickled dataframe.

        For each entry returned by ``util.Create_files_list(path, 'xml')`` the
        file is parsed, every line whose ``name`` attribute is numeric is
        processed with ``self.Process_line``, the results are combined,
        cleaned with ``self.clean_generated_df`` and pickled as
        ``syllable_label_<entry>.pickle``.

        Args:
            path: directory prefix of the xml files. It is concatenated
                directly with the entry name, so it must end with a path
                separator.
        """
        # Empty template frame; it fixes the leading column order of every
        # text frame.
        column_names = ["title", "line", "syllable", "length"]
        df = pd.DataFrame(columns=column_names)

        # Add all entries to process to a list
        entries = util.Create_files_list(path, 'xml')
        # Process all entries added to the list
        for entry in entries:
            with open(path + entry) as fh:
                # For each text, an individual dataframe is created and saved as pickle
                pickle_name = 'syllable_label_' + entry + '.pickle'

                # Use beautiful soup to process the xml
                soupedEntry = BeautifulSoup(fh, "xml")
                # Retrieve the title and author from the xml file
                text_title = str(soupedEntry.title.string)
                author = str(soupedEntry.author.string)
                # Clean the lines (done by MQDQ)
                soupedEntry = util.clean(soupedEntry('line'))

                # Collect the per-line frames and concatenate once at the
                # end. Appending line by line copies the whole frame every
                # time, and DataFrame.append no longer exists in pandas >= 2.0.
                line_frames = [df]
                for line in Bar('Processing {0}, {1}'.format(
                        author, text_title)).iter(range(len(soupedEntry))):
                    book_title = int(soupedEntry[line].parent.get('title'))
                    if not soupedEntry[line]['name'].isdigit():
                        continue  # If our line name is not a digit, the line is uncertain. We skip over it
                    line_df = self.Process_line(soupedEntry[line], book_title)
                    # Process_line presumably returns a DataFrame. A single
                    # record (dict or Series) is turned into a one-row frame,
                    # matching what DataFrame.append did with it.
                    if not isinstance(line_df, pd.DataFrame):
                        line_df = pd.DataFrame([line_df])
                    line_frames.append(line_df)

                new_text_df = pd.concat(line_frames, ignore_index=True)

                # Clean the lines that did not succeed
                new_text_df = self.clean_generated_df(new_text_df)

                util.Pickle_write(util.cf.get('Pickle', 'path'), pickle_name,
                                  new_text_df)
Exemple #8
0
    def __init__(self, df):
        """Run the enabled neural-network steps: pad, flatten, train, test.

        The steps are selected by the local control-flow booleans below. The
        padded and the flattened dataframes are always re-read from their
        pickles, so the ``df`` argument only matters when ``add_padding`` is
        enabled.

        Args:
            df: dataframe to pad when ``add_padding`` is enabled.
        """
        # Read the config file for later use
        self.cf = configparser.ConfigParser()
        self.cf.read("config.ini")

        # cf.get returns a string, and any non-empty string (including '0')
        # is truthy. Convert to int so that verbose = 0 really silences the
        # debug prints.
        verbose = int(self.cf.get('Util', 'verbose'))

        # Control flow booleans
        add_padding = False  #True
        flatten_vector = False
        create_model = False
        test_model = True

        load_X_y = False

        # This function adds padding to every line
        if add_padding:
            print('Adding padding')
            df = self.Add_padding(df)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'padded_set'), df)

        df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'padded_set'))
        if verbose: print(df)

        if flatten_vector:
            # The network wants a single vector as input, so we flatten it for every line in the text
            print('Flattening the vectors')
            df = self.Flatten_dataframe_column(df, 'vector')
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'flattened_vectors'), df)
        df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'flattened_vectors'))
        if verbose: print(df)

        ####
        # TODO: Philippe plz continue here
        ####

        # Turn df into X and y for neural network
        print('Loading X and y')
        X, y = self.Create_X_y(df, load_X_y)
        # print("Training data: shape={}".format(X.shape))
        # print("Training target data: shape={}".format(y.shape))

        # Encode: 2 for elision and 3 for padding (0 for short, 1 for long)
        # y[y == 2] = 1 # Dont forget, y is numpy.ndarray
        # y[y == 3] = 1

        if create_model:
            # README
            # The main problem at the moment is as follows. I give a single
            # input (ndarray), which is a line of 20 syllables, represented by
            # vectors of dimension 25 (X.shape = 500). The output I want is
            # 20-dimensional, one for each syllable. The output I want is
            # multiclass: each syllable can be short, long, elided or simply
            # padding (encoded as 0, 1, 2 or 3). The problem is therefore
            # multiclass: each of the 20 outputs can have one and only one
            # label. If I use binary classification and encode elision and
            # padding as 1 as well, it seems to work quite well (see
            # docs/first_four_lines.txt). If I use categorical classification
            # and try to predict classes, it doesn't work as intended.
            # Questions: can I just do multiclass prediction, or do I need to
            # binarize my labels? Do we need scaling? Can we do multi-output
            # multi-class the way I implemented it?

            labels = ['short', 'long', 'elision', 'padding']

            # TODO: do we need to scale the X data? All values are between -1 and 1.
            # scaler = MinMaxScaler()
            # X_train = scaler.fit_transform(X_train)
            # X_test = scaler.transform(X_test)

            #TODO: i have four labels i want to predict. do we need to binarize this?
            # mlb = MultiLabelBinarizer()
            # y = mlb.fit_transform(y)

            # one hot encode output variable (for class prediction)
            # y = to_categorical(y, num_classes=4)

            # Create and evaluate the model. Uses k-fold cross validation
            model = self.Evaluate_model(X, y)

            model.save('pickle/model')

        if test_model:
            # Always load the stored model, even when it was created just above
            model = models.load_model('pickle/model')

            # TODO: i can predict using binary, but i need to predict classes. predict_classes is deprecated
            # However, the model.predict(X).argmax(axis=-1) results in the network predicting a single int64?
            # yhat = model.predict_classes(x_new)
            # yhat = model.predict(X).argmax(axis=-1) #model.predict_classes(X)
            # print([labels[i] for i in model.predict(X).argmax(axis=-1)])

            # This works fine for binary classification
            yhat = model.predict(X)

            ### Uncomment this if you want to create a prediction dataframe
            # self.Create_prediction_df(df, X, y, yhat)

            # Predict and test the first 10 lines (fewer if the set is
            # smaller). Also, print the similarity of predicted and expected
            for i in range(min(10, len(y))):

                print('Expected : {0}'.format(y[i]))

                try:
                    # Round the number to the next whole number (for readability)
                    round_to_whole = [round(num) for num in yhat[i]]
                    print('Predicted: {0}'.format(round_to_whole))
                    res = self.Calculate_list_similarity(y[i], round_to_whole)
                    print('Similarity score for line {0}: {1}'.format(i, res))
                except Exception:
                    # Best effort: fall back to the raw prediction. Catching
                    # Exception rather than everything keeps Ctrl-C and
                    # SystemExit working.
                    print('Predicted: %s' % yhat[i])

                print('\n')
Exemple #9
0
    def __init__(self):
        """Build the HMM matrices, run the Viterbi walks and print the metrics.

        The steps are switched on by boolean attributes read from ``self``
        (``perform_sentence_transition_list``,
        ``perform_matrix_alpha_creation``, ``perform_matrix_beta_creation``,
        ``perform_viterbi_walks``). The matrices and the label sequences are
        always re-read from their pickles after the optional regeneration.

        ``self.pedecerto_df``, ``self.cf`` and the flags are not assigned
        here and must already exist on the instance or its class. The same
        holds for ``self.label_list`` when the alpha matrix is created
        without regenerating the sentence transition list.
        """
        # Convert the integer labels of the pedecerto df to strings
        self.pedecerto_df = self.pedecerto_df_labels_to_str(self.pedecerto_df)

        # cf.get returns a string, and any non-empty string (including '0')
        # is truthy. Convert to int so that verbose = 0 really silences the
        # debug prints.
        verbose = int(self.cf.get('Util', 'verbose'))

        # Create hidden state space (our labels)
        hidden_states = ['long', 'short', 'elision']

        # Create hidden transition matrix alpha.
        # This is the transition probability matrix of changing states given a state.
        # Matrix is size (M x M) where M is number of states.

        # Create state space and initial state probabilities
        if self.perform_sentence_transition_list:
            self.label_list = self.create_sentence_transition_list(
                self.pedecerto_df)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'label_list'),
                              self.label_list)

        if self.perform_matrix_alpha_creation:
            a_df = self.create_hidden_transition_matrix_alpha(
                hidden_states, self.label_list)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_a'), a_df)

        a_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_a'))
        if verbose: print(a_df)

        # Create matrix of observation (emission) probabilities beta.
        # This holds the observation probabilities given state.
        # Matrix is size (M x O) where M is number of states
        # and O is number of different possible observations.
        unique_syllables = sorted(set(self.pedecerto_df['syllable'].tolist()))
        observable_states = unique_syllables  # Our observations are all of our unique syllables

        if self.perform_matrix_beta_creation:
            b_df = self.create_hidden_transition_matrix_beta(
                observable_states, hidden_states, unique_syllables,
                self.pedecerto_df)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_b'), b_df)

        b_df = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                self.cf.get('Pickle', 'hmm_b'))
        if verbose: print(b_df)

        # Example sentences for manual experiments (currently unused):
        #   "li to ra mul tum il le et ter ris jac ta tus et al to"
        #   "ar ma vi rum que ca no troi ae qui pri mus ab or is"

        # Get parameters ready for the viterbi walks
        pi = self.get_label_probabilities(self.pedecerto_df)
        a = a_df.values
        b = b_df.values

        if self.perform_viterbi_walks:
            y_true, y_pred = self.create_y(self.pedecerto_df,
                                           observable_states, pi, a, b)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_y_pred'), y_pred)
            util.Pickle_write(self.cf.get('Pickle', 'path'),
                              self.cf.get('Pickle', 'hmm_y_true'), y_true)

        y_true = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                  self.cf.get('Pickle', 'hmm_y_true'))
        y_pred = util.Pickle_read(self.cf.get('Pickle', 'path'),
                                  self.cf.get('Pickle', 'hmm_y_pred'))

        self.create_prediction_df(y_true, y_pred)

        print('##########################################################')
        print(self.get_metrics_report(y_true, y_pred))
        print('##########################################################')
Exemple #10
0
    def run_experiments(self):
        """Run the five CRF experiments, pickle their scores and plot them.

        For every experiment the precision and recall of the ``short``,
        ``long`` and ``elision`` labels are collected. The combined results
        are pickled under the ``test`` key, plotted as a bar chart, saved to
        ``./result.png`` and shown.

        Experiments (executed in the order 1, 4, 2, 3, 5):
            exp1: train on Virgil, test on Virgil (80/20 split).
            exp4: model of exp1, tested on Hercules Furens.
            exp2: train on all texts, test on the Virgil test split of exp1.
            exp3: train on all texts, test on all texts (80/20 split).
            exp5: model of exp3, tested on Hercules Furens.
        """
        create_models = True

        def summarise(result):
            # Flatten a per-label report into the six plotted values, in the
            # order short/long/elision x precision/recall.
            return {
                label + '_' + metric: result[label][metric]
                for label in ('short', 'long', 'elision')
                for metric in ('precision', 'recall')
            }

        crf_exp_bar_dict = {
            'exp1': {},
            'exp2': {},
            'exp3': {},
            'exp4': {},
            'exp5': {}
        }

        # Experiment 1: Create model on Virgil, test on Virgil
        if create_models:
            texts = ['syllable_label_VERG-aene.xml.pickle']
            crf_df = self.convert_pedecerto_to_crf_df(texts)
            X, y = self.convert_text_to_feature_sets(crf_df)
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              'crf_exp1_X.pickle', X)
            util.Pickle_write(util.cf.get('Pickle', 'path'),
                              'crf_exp1_y.pickle', y)
        else:
            X = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                 'crf_exp1_X.pickle')
            y = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                 'crf_exp1_y.pickle')

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            shuffle=True)
        crf_model = self.fit_model(X_train, y_train)
        result = self.predict_model(crf_model, X_test, y_test)
        print('exp1')
        crf_exp_bar_dict['exp1'] = summarise(result)

        # Experiment 4: Test Virgil on Hercules Furens
        herc_df = pd.read_csv('HercFur.csv')
        util.Pickle_write(util.cf.get('Pickle', 'path'),
                          util.cf.get('Pickle', 'test'), herc_df)
        herc_df = self.convert_pedecerto_to_crf_df(['test.pickle'])
        X_herc, y_herc = self.convert_text_to_feature_sets(herc_df)
        result = self.predict_model(crf_model, X_herc, y_herc)
        print('exp4')
        crf_exp_bar_dict['exp4'] = summarise(result)

        # Experiment 2: Create model on Virgil, Ovid, Iuvenal and Lucretius,
        # test on Aeneid. X_test / y_test are still the Virgil split of
        # experiment 1; only the training part of the new split is used.
        texts = util.Create_files_list('./pickle', 'syllable_label')
        crf_df = self.convert_pedecerto_to_crf_df(texts)
        X, y = self.convert_text_to_feature_sets(crf_df)
        X_train, _, y_train, _ = train_test_split(X,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  shuffle=True)
        crf_model = self.fit_model(X_train, y_train)
        result = self.predict_model(crf_model, X_test, y_test)
        print('exp2')
        crf_exp_bar_dict['exp2'] = summarise(result)

        # Experiment 3: Create model on Virgil, Ovid, Iuvenal and Lucretius,
        # test on all
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            shuffle=True)
        crf_model = self.fit_model(X_train, y_train)
        result = self.predict_model(crf_model, X_test, y_test)
        print('exp3')
        crf_exp_bar_dict['exp3'] = summarise(result)

        # Experiment 5: model of experiment 3 on Hercules Furens (features
        # reused from experiment 4)
        result = self.predict_model(crf_model, X_herc, y_herc)
        print('exp5')
        crf_exp_bar_dict['exp5'] = summarise(result)

        # Persist only once every experiment is filled in. Writing before
        # experiment 5 stored an empty 'exp5' entry. Note that this reuses
        # the 'test' pickle that held the Hercules Furens dataframe above.
        util.Pickle_write(util.cf.get('Pickle', 'path'),
                          util.cf.get('Pickle', 'test'), crf_exp_bar_dict)

        pd.DataFrame(crf_exp_bar_dict).T.plot(kind='bar')
        plt.legend(loc='lower left')
        plt.ylim([0.5, 1])
        plt.savefig('./result.png')
        plt.show()
Exemple #11
0
# Top-level driver script: each boolean below switches one pipeline step on.
# Only run_preprocessor and run_pedecerto are consulted in this part of the
# script; the remaining flags are presumably used further down -- TODO confirm.
# Parameters to run each step
run_preprocessor = False
run_pedecerto = True
run_model_generator = False
add_embeddings_to_df = False
run_neural_network = False
''' Run the preprocessor on the given text if needed.
This reads the text, cleans it and returns a list of syllables for now
To achieve this, the pedecerto tool is used
'''
if run_preprocessor:
    print('Running preprocessor')
    # Preprocess the text named in the [Text] config section and store its
    # character list under the 'char_list' pickle.
    preprocessor = Text_preprocessor(util.cf.get('Text', 'name'))
    util.Pickle_write(util.cf.get('Pickle', 'path'),
                      util.cf.get('Pickle', 'char_list'),
                      preprocessor.character_list)

# Load the preprocessed text. This read is unconditional, so the 'char_list'
# pickle is read even when the preprocessor step above was skipped.
character_list = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                  util.cf.get('Pickle', 'char_list'))
if int(util.cf.get('Util', 'verbose')): print(character_list)
''' Now create a dataframe. Containing: syllable, length, vector.
'''
if run_pedecerto:
    print('Running pedecerto parser')
    # The parser is constructed for what its constructor does; the instance
    # bound to 'parse' is not used again in this part of the script.
    parse = Pedecerto_parser(util.cf.get('Pedecerto', 'path_texts'))
    # Per the original note, this creates pickle files for all texts that are in the ./texts/ folder

# Read the 'pedecerto_df' pickle; this read is unconditional as well.
pedecerto_df = util.Pickle_read(util.cf.get('Pickle', 'path'),
                                util.cf.get('Pickle', 'pedecerto_df'))
Exemple #12
0
    def create_hidden_transition_matrix_alpha(self, hidden_states):
        """Build, pickle and return the hidden transition matrix (alpha).

        Every pair of consecutive labels inside a sentence of ``label_list``
        counts as one transition. Cell ``[src, dst]`` holds the number of
        ``src -> dst`` transitions divided by the total number of transitions
        over all sentences.

        NOTE(review): ``label_list`` is not a parameter; it is resolved from
        an enclosing scope -- presumably a module-level global; confirm
        against the caller.
        NOTE(review): the counts are normalised by the grand total, so the
        whole matrix sums to 1 rather than each row. Confirm this is what the
        Viterbi step expects.

        Args:
            hidden_states: state labels, used as both row and column labels.

        Returns:
            The filled dataframe, also pickled under ``hmm_a``.

        Raises:
            Exception: if a label outside ``hidden_states`` is encountered.
            ValueError: if ``label_list`` contains no transition at all.
        """
        # Now we are going to fill the hidden transition matrix
        a_df = pd.DataFrame(columns=hidden_states, index=hidden_states)

        # One counter per (source, destination) pair. Keying on the labels
        # keeps the counts tied to the right row and column whatever the
        # order of hidden_states.
        transition_counts = {(src, dst): 0
                             for src in hidden_states
                             for dst in hidden_states}
        total_count = 0

        for sentence in label_list:
            # Pair every label with its successor. Empty and one-label
            # sentences produce no pairs, so they no longer subtract from
            # the total.
            for src, dst in zip(sentence, sentence[1:]):
                if (src, dst) not in transition_counts:
                    raise Exception('unknown transition found')
                transition_counts[(src, dst)] += 1
                total_count += 1

        if total_count == 0:
            raise ValueError(
                'no label transitions found; cannot build transition matrix')

        for src in hidden_states:
            a_df.loc[src] = [
                transition_counts[(src, dst)] / total_count
                for dst in hidden_states
            ]

        util.Pickle_write(self.cf.get('Pickle', 'path'),
                          self.cf.get('Pickle', 'hmm_a'), a_df)

        return a_df