def encode_corpus(corpus_name, celex_dict, tokens2identifiers, pos_dict, separator='~',
                  uniphones=False, diphones=True, triphones=False, syllables=False,
                  stress_marker=True, outcomes='tokens', boundaries=False):
    """
    Recode a .json corpus into phonetic cues and lexical outcomes using Celex.

    :param corpus_name:         the path indicating the .json file to be used as input corpus
    :param celex_dict:          the Celex dictionary to be used to recode the utterances into phonetic cues
    :param tokens2identifiers:  a dictionary mapping a token surface form from Celex to all token ids
                                linked to it
    :param pos_dict:            a dictionary mapping CHILDES PoS tags to corresponding Celex PoS tags
    :param separator:           a string indicating the character separating lemmas from PoS tags in the
                                input corpus
    :param uniphones:           a boolean indicating whether uni-phones are relevant phonetic cues
    :param diphones:            a boolean indicating whether di-phones are relevant phonetic cues
    :param triphones:           a boolean indicating whether tri-phones are relevant phonetic cues
    :param syllables:           a boolean indicating whether syllables are relevant phonetic cues
    :param stress_marker:       a boolean indicating whether to discard or not the stress marker from the
                                Celex phonetic transcriptions
    :param outcomes:            a string indicating which outcomes to use, whether 'tokens' (default)
                                or 'lemmas'
    :param boundaries:          a boolean indicating whether to preserve or discard word boundaries
    :return encoded_corpus:     the input corpus recoded as a list of lists, where each inner list is a
                                learning event and consists of two sub-lists, the first containing phonetic
                                cues and the second containing lexical outcomes
    :return perc_missed:        the percentage of learning events from the input corpus that could not be
                                recoded because the Celex dictionary didn't contain the necessary information
    :raise ValueError:          if outcomes is neither 'tokens' nor 'lemmas'
    """
    # BUG FIX: the original opened the corpus with mode 'r+' and never closed the file handle;
    # read-only mode inside a context manager is sufficient and leak-free
    with open(corpus_name, 'r') as corpus_in:
        corpus = json.load(corpus_in)

    # map utterance indices to the percentage of the corpus processed up to the utterance itself
    total = len(corpus[0])
    check_points = {np.floor(total / float(100) * n): n for n in np.linspace(5, 100, 20)}
    encoded_corpus = [[], []]
    missed = 0

    # for every utterance in the input corpus, remove words with a PoS tag that doesn't belong to the
    # dictionary of PoS mappings; then map valid words to the right PoS tag as indicated by the PoS dictionary
    for i in range(len(corpus[0])):
        words = []
        for j in range(len(corpus[0][i])):
            lemma, pos_tag = corpus[1][i][j].split(separator)
            if pos_tag in pos_dict:
                token = corpus[0][i][j]
                new_tag = pos_dict[pos_tag]
                words.append((token, new_tag, lemma))

        # if there are valid words in the utterance (and not too many), encode it
        if 0 < len(words) <= 20:

            # get the phonetic encoding of the words in the current learning trial:
            # if they can all be encoded using Celex, a list is returned, otherwise a tuple is
            phonological_representations = get_phonetic_encoding(words, celex_dict, tokens2identifiers)

            # if a phonological representation could be found for all words in the utterance, proceed
            if isinstance(phonological_representations, list):
                utterance = concatenate_phonological_representations(phonological_representations)
                # strip double quotes so they don't end up inside the phonetic cues
                table = str.maketrans(dict.fromkeys('"'))
                utterance = utterance.translate(table)
                n_phones = encode_item(utterance, stress_marker=stress_marker, boundaries=boundaries,
                                       uniphones=uniphones, diphones=diphones,
                                       triphones=triphones, syllables=syllables)
                outcomes_set = set()
                for word in words:
                    token, pos, lemma = word
                    if outcomes == 'tokens':
                        outcomes_set.add('|'.join([token, pos]))
                    elif outcomes == 'lemmas':
                        outcomes_set.add('|'.join([lemma, pos]))
                    else:
                        raise ValueError("Unrecognized specification concerning lexical outcomes. "
                                         "Please, choose either 'tokens' or 'lemmas'.")

                # append the phonetic representation of the current learning event to the list of phonetic
                # representations for the whole corpus, and the lexical meanings of the current learning
                # event to the list of lexical meanings for the whole corpus
                encoded_corpus[0].append(n_phones)
                encoded_corpus[1].append(list(outcomes_set))

            # if the phonological representation of a word from the utterance could not be retrieved from
            # CELEX, count the utterance as missed
            else:
                missed += 1

        if i in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the input corpus has been processed and encoded in the desired way."
                  % check_points[i])

    perc_missed = missed / float(total) * 100
    return encoded_corpus, perc_missed
def categorize(test_items, weights_matrix, cues2ids, outcomes2ids, method='freq', evaluation='count',
               stats=False, k=50, flush=0, threshold=0, uniphones=True, diphones=False,
               triphones=False, syllables=False, stress_marker=False, boundaries=True):
    """
    Tag each test item with the PoS tag best supported by its most active outcomes.

    :param test_items:      an iterable containing strings. Each string is the phonological form of a word
                            together with its PoS tag, separated by a vertical bar ('|')
    :param weights_matrix:  a NumPy array containing the matrix of cue-outcome associations estimated via
                            the ndl module; rows represent cues, columns represent outcomes
    :param cues2ids:        a Python dictionary mapping cues to row indices in the weight matrix
    :param outcomes2ids:    a Python dictionary mapping outcomes to column indices in the weight matrix
    :param method:          how to look at top active outcomes: 'freq' ranks PoS tags according to their
                            frequency among the k top active outcomes; 'sum' ranks PoS tags according to
                            their total activation among the k top active outcomes
    :param evaluation:      how to compare baseline activations to item-triggered ones: 'count' tags the
                            test item with the PoS tag that was more frequent or had highest summed
                            activation within the top active outcomes (frequency or activation is returned
                            and can be correlated to reaction times); 'distr' compares the item-triggered
                            frequency counts or summed activations to those at baseline and tags the item
                            with the PoS tag receiving highest support from the change in distribution
                            (a statistic is returned: Chi-squared for frequency distributions, t-test for
                            summed activations)
    :param stats:           if True, the PoS tag is assigned based on the result of a statistical test
                            (Chi-squared for frequencies: the tag with the highest Pearson standardised
                            residual for the item-triggered distribution; t-test for activations); if
                            False, the PoS tag with the highest positive difference between item-triggered
                            and baseline frequency/activation is chosen.
                            CAVEAT: only meaningful when evaluation is 'distr'
    :param k:               how many elements to consider from the baseline activations and from the
                            activations triggered by each test item (default: top 50)
    :param flush:           how many top active outcomes at baseline to flush away from subsequent
                            computations; high-frequency outcomes may come out as most active whatever the
                            input cues, so it can make sense to discard them
    :param threshold:       the minimum activation of an outcome to be considered in the list of top
                            activated neighbors; default is 0 and shouldn't be lowered, but can be increased
    :param uniphones:       a boolean indicating whether single phonemes encode test items
    :param diphones:        a boolean indicating whether sequences of two phonemes encode test items
    :param triphones:       a boolean indicating whether sequences of three phonemes encode test items
    :param syllables:       a boolean indicating whether syllables encode test items
    :param stress_marker:   a boolean indicating whether stress markers are preserved in the encoding
    :param boundaries:      a boolean specifying whether word boundaries are preserved
    :return log_dict:       a dictionary mapping each test item to the predicted PoS tag, the statistic used
                            to choose it, noun/verb frequencies and activations among the top outcomes, and
                            the k top active outcomes given the item's phonetic cues
    :raise ValueError:      if any element of test_items is not a string
    """
    to_filter = set()
    baseline_activations = compute_outcomes_activations(cues2ids.keys(), weights_matrix,
                                                        cues2ids, outcomes2ids, to_filter)
    sorted_baseline_activations = sorted(baseline_activations.items(),
                                         key=operator.itemgetter(1), reverse=True)

    # if top active outcomes at baseline need to be flushed away, store flushed outcomes in a set and keep
    # the remaining outcomes in a list of tuples
    if flush:
        to_filter = {outcome[0] for outcome in sorted_baseline_activations[:flush]}
        sorted_baseline_activations = sorted_baseline_activations[flush:]

    # compute baseline frequency distribution over PoS tags and PoS summed activation over the k most
    # active outcomes given all input cues at once
    pos_freq_baseline, pos_act_baseline = get_frequency_and_activation_for_each_pos(
        sorted_baseline_activations[:k])

    hits = 0
    total = 0
    total_items = len(test_items)
    check_points = {int(np.floor(total_items / 100 * n)): n for n in np.linspace(5, 100, 20)}
    log_dict = defaultdict(dict)
    tags = set()

    for item in test_items:

        # BUG FIX: the original constructed the ValueError without raising it, and only after calling
        # item.split() — so the guard could never fire; validate the type first and actually raise
        if not isinstance(item, str):
            raise ValueError("The input items must consist of strings: check your input file!")
        tags.add(item.split("|")[1])

        # split the test token from its Part-of-Speech and encode it in nphones
        word, target_pos = item.split('|')
        if boundaries:
            word = '+' + word + '+'
        nphones = encode_item(word, uniphones=uniphones, diphones=diphones, triphones=triphones,
                              syllables=syllables, stress_marker=stress_marker)

        # compute outcome activations given the phonetic cues in the test item, sort the outcomes by
        # outcome activation value and pick the top k
        outcome_activations = compute_outcomes_activations(nphones, weights_matrix, cues2ids,
                                                           outcomes2ids, to_filter)
        sorted_outcome_activations = sorted(outcome_activations.items(),
                                            key=operator.itemgetter(1), reverse=True)[:k]

        if sorted_outcome_activations[0][1] == 0:
            # if the activation of the first item is 0 it means that no phonetic cue from the test item was
            # ever encountered in the corpus, and it is thus impossible to estimate which are the most
            # active outcomes given the phonetic cues of which the test item consists
            top_pos, value, n_freq, v_freq, n_act, v_act = ('-', 0, 0, 0, 0, 0)
        else:
            # make sure that most active outcomes all have activation values higher than the threshold
            if not sorted_outcome_activations[-1][1] > threshold:
                sorted_outcome_activations = [(outcome, act) for outcome, act
                                              in sorted_outcome_activations if act > threshold]

            # compute frequency distribution over PoS tags and PoS summed activation over the k most active
            # outcomes given the test item, then get how many verbs and nouns there were among the k top
            # active outcomes
            pos_freq_item, pos_act_item = get_frequency_and_activation_for_each_pos(
                sorted_outcome_activations)
            n_freq = pos_freq_item['N']
            v_freq = pos_freq_item['V']
            n_act = pos_act_item['N']
            v_act = pos_act_item['V']

            # get the most likely PoS tag for the test item given the k top active outcomes
            top_pos, value = pick_pos(pos_freq_item, pos_act_item, pos_freq_baseline, pos_act_baseline,
                                      evaluation=evaluation, method=method, stats=stats)

        # log the test item (with the correct PoS tag), the PoS tag assigned by the model, the statistic
        # computed to assign the chosen PoS (frequency/activation count or difference) and the k top active
        # nodes given the test item (dict() replaces a comprehension that shadowed parameter k)
        log_dict[item] = {'predicted': top_pos,
                          'value': value,
                          'n_freq': n_freq,
                          'v_freq': v_freq,
                          'n_act': n_act,
                          'v_act': v_act,
                          'items': dict(sorted_outcome_activations)}

        # compare the predicted and true PoS tag and increment the count of hits if they match
        if top_pos == target_pos:
            hits += 1
        total += 1
        if total in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the test items have been processed." % check_points[total])

    return log_dict
def map_phonology(corpus_file, mapping_file, output_file, celex_dir, compounds=True,
                  reduced=False, minimalist=True):
    """
    Map each token|lemma|pos1|pos2 tuple in the input corpus to its Celex triphone encoding.

    :param corpus_file:  the path to a .txt file containing one utterance per line, with all words in the
                         utterance separated by a comma and each word being a tuple consisting of four
                         pipe-separated elements, token|lemma|PoS1|PoS2, where PoS1 is the coarse Celex
                         tag and PoS2 is the tag provided by the TreeTagger
    :param mapping_file: the path to a .txt file where the output of the process will be written to
    :param output_file:  the path to a .txt file where the lines from the input will be rewritten as
                         comma-separated sequences of pipe-separated 5-tuples consisting of token, lemma,
                         pos1, pos2, 3phones
    :param celex_dir:    the directory where the Celex dictionary is to be found
    :param compounds:    a boolean. If True, all entries in Celex are considered; if False, entries which
                         contain spaces are discarded
    :param reduced:      a boolean forwarded to get_celex_dictionary; presumably selects reduced
                         phonological forms from Celex — TODO confirm against that helper's docs
                         (was undocumented in the original)
    :param minimalist:   a boolean. It specifies whether lemmas in the output file should be differentiated
                         when their phonetic realization changes depending on the part of speech: if
                         minimalist is True, lemmas are not differentiated (default); if it is False,
                         lemmas are differentiated by appending pos1 to the lemma, separated by a colon
    :return mapping:     a dictionary mapping 4-tuples token|lemma|pos1|pos2 to the matching triphones
    """
    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced, compounds=compounds)
    tokens2identifiers = tokens2ids(celex_dict)
    mapping = {}
    lemma2phon = defaultdict(dict)
    new_corpus = []

    with open(corpus_file, 'r') as fr:
        for line in fr:
            words = line.strip().split(',')
            new_line = []
            for word in words:
                if word:
                    # some input lines lack the TreeTagger tag: fall back to a default derived from pos1
                    try:
                        token, lemma, pos1, pos2 = word.split('|')
                    except ValueError:
                        token, lemma, pos1 = word.split('|')
                        pos2 = 'NN' if pos1 == 'N' else pos1

                    new_token, new_lemma = adjust_apostrophes(token, lemma)
                    # '=' and '_' are swapped here and restored below for the output forms
                    new_token = new_token.replace('=', '_')
                    new_lemma = new_lemma.replace('=', '_')
                    token_phonological_form = get_phonetic_encoding(
                        [(new_token, pos1, new_lemma)], celex_dict, tokens2identifiers)
                    lemma_phonology = get_phonetic_encoding(
                        [(new_lemma, pos1, new_lemma)], celex_dict, tokens2identifiers)
                    # fall back to the token's phonology when the lemma could not be encoded as a list
                    lemma_phonological_form = ''.join(lemma_phonology) \
                        if isinstance(lemma_phonology, list) else ''.join(token_phonological_form)

                    # only proceed when the token itself could be phonetically encoded
                    if isinstance(token_phonological_form, list):
                        triphones = encode_item(token_phonological_form[0], triphones=True,
                                                stress_marker=True, uniphones=False,
                                                diphones=False, syllables=False)
                        deriv = code_derivational_morphology(pos2)
                        output_token = token.replace('_', '=')
                        output_lemma = lemma.replace('_', '=')
                        morpho = 'COMPOUND' if '=' in output_token else 'MONO'
                        key = '|'.join([output_token, output_lemma, pos1, pos2, deriv, morpho,
                                        ':'.join([output_token, pos1])])
                        output_triphones = ';'.join(triphones)
                        mapping[key] = output_triphones
                        # record every PoS tag observed for each phonological realization of the lemma
                        if lemma_phonological_form in lemma2phon[output_lemma]:
                            lemma2phon[output_lemma][lemma_phonological_form].add(pos1)
                        else:
                            lemma2phon[output_lemma][lemma_phonological_form] = {pos1}
                        new_line.append((output_token, ':'.join([output_lemma, pos1]),
                                         pos1, pos2, output_triphones))
            new_corpus.append(new_line)

    write_mapping_file(mapping, mapping_file)
    write_output_corpus(new_corpus, output_file, lemma2phon, minimalist=minimalist)

    return mapping
def jaccard(weight_matrix, cues2ids, outcomes2ids, celex_dict, plots_folder='', stress_marker=True,
            uniphone=False, diphone=False, triphone=True, syllable=False, boundaries=True):
    """
    Compute, for each outcome, the Jaccard coefficient between gold-standard and most active cues.

    :param weight_matrix:   the matrix of cue-outcome association estimated using the ndl model
    :param cues2ids:        a dictionary mapping strings to row indices in the weight_matrix
    :param outcomes2ids:    a dictionary mapping strings to column indices in the weight_matrix
    :param celex_dict:      the dictionary extracted from the celex database
    :param plots_folder:    a string indicating the path to a folder where all the plots and files
                            generated by the function will be stored; it is created if it doesn't exist
    :param stress_marker:   a boolean indicating whether stress markers from the phonological
                            representations of Celex need to be preserved or can be discarded
    :param uniphone:        a boolean indicating whether single phonemes are considered while encoding
                            column identifiers
    :param diphone:         a boolean indicating whether sequences of two phonemes are considered
    :param triphone:        a boolean indicating whether sequences of three phonemes are considered
    :param syllable:        a boolean indicating whether syllables are considered
    :param boundaries:      a boolean specifying whether to consider or not word boundaries
    :return jaccard_coefficients: a dictionary mapping outcome surface forms (strings) to the Jaccard
                            coefficient computed between the gold-standard and most active cues as
                            estimated from the input matrix. Gold-standard cues are extracted from the
                            outcome phonological form according to the specified encoding; a vector of
                            length k (where k is the number of gold-standard cues) is filled with the top
                            k cues for the outcome, looking at raw activation values. The Jaccard
                            coefficient is the proportion between the intersection of the two vectors and
                            their union: the higher the number, the higher the overlap and the better the
                            network was able to discriminate the good cues for an outcome.
    """
    # string identifying files/plots generated by this function
    f_name = 'jaccard'
    token_indices = tokens2ids(celex_dict)
    jaccard_coefficients = {}
    true_cues = {}
    active_cues = {}
    ids2cues = dict(zip(cues2ids.values(), cues2ids.keys()))
    total_items = len(outcomes2ids)
    check_points = {int(np.floor(total_items / 100 * n)): n for n in np.linspace(5, 100, 20)}

    # consider each outcome separately
    for idx, outcome in enumerate(outcomes2ids):
        column_id = outcomes2ids[outcome]
        wordform, pos = outcome.split('|')
        celex_entry = (wordform, pos, wordform)
        word_phon = get_phonological_form(celex_entry, celex_dict, token_indices)

        if isinstance(word_phon, str):
            # BUG FIX: only add boundary markers after confirming a string was retrieved; the original
            # concatenated '+' before the isinstance check, raising TypeError on lookup failure
            if boundaries:
                word_phon = '+' + word_phon + '+'

            # get the relevant phonological cues
            nphones = encode_item(word_phon, stress_marker=stress_marker, uniphones=uniphone,
                                  diphones=diphone, triphones=triphone, syllables=syllable)

            # get the top active phonological cues from the input association matrix given the outcome
            top_active_cues = get_top_active_cues(weight_matrix, column_id, len(nphones), ids2cues)

            # compute the Jaccard coefficient and store correct and predicted cues for every outcome
            set_inters = len(set.intersection(top_active_cues, set(nphones)))
            set_union = len(set.union(top_active_cues, set(nphones)))
            jaccard_coefficients[outcome] = set_inters / set_union
            true_cues[outcome] = nphones
            active_cues[outcome] = top_active_cues

        if idx + 1 in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the outcomes have been processed to estimate the Jaccard coefficient."
                  % check_points[idx + 1])

    if plots_folder:
        # check whether the provided folder path points to an existing folder, create it if it doesn't
        if not os.path.isdir(plots_folder):
            os.makedirs(plots_folder)
        ranked_path = os.path.join(plots_folder, '.'.join(["_".join([f_name, 'list']), 'txt']))
        sorted_coeffs = sorted(jaccard_coefficients.items(), key=operator.itemgetter(1), reverse=True)
        scatter_path = os.path.join(plots_folder, ".".join(["_".join([f_name, 'scatter']), 'pdf']))
        plot_ranks(sorted_coeffs, output_path=scatter_path, yname='Jaccard coeff', xname='Rank',
                   figname='Jaccard coefficient for each outcome')

        # write to file each outcome together with the correct and predicted phonological cues.
        # BUG FIX: the original rebound the loop variable ('outcome = outcome[0]') and then read
        # 'outcome[1]' as the coefficient, which actually yielded the second character of the outcome
        # string; unpack the (outcome, coefficient) pair instead. The file handle no longer shadows f_name.
        with open(ranked_path, 'a+') as out_file:
            for outcome, jaccard_coeff in sorted_coeffs:
                true = true_cues[outcome]
                top_active = active_cues[outcome]
                out_file.write("\t".join([outcome, str(jaccard_coeff), str(true), str(top_active)]))
                out_file.write("\n")

    return jaccard_coefficients
def encode_corpus(corpus, celex_dict, tokens2identifiers, pos_dict, separator='~', uni_phones=False,
                  di_phones=True, tri_phones=False, syllable=False, stress_marker=True, boundaries=False):
    """
    Recode an already-loaded corpus object into lists of phonetic cues and lexical outcomes.

    NOTE(review): this function shares its name with another encode_corpus that takes a file path —
    if both live in the same module the later definition shadows the earlier one; confirm intent.

    :param corpus:              a .json object to be used as input corpus, consisting of two aligned lists
                                of lists, meaning that a second-order list in each first order list refers
                                to a same utterance; the first list contains utterances encoded as lists of
                                tokens, the second list contains utterances encoded as lists of lemmas and
                                PoS tags
    :param celex_dict:          the Celex dictionary to be used to recode the utterances into phonetic cues
                                and lexical outcomes
    :param tokens2identifiers:  a dictionary mapping a token surface form from Celex to all token ids
                                linked to it
    :param pos_dict:            a dictionary mapping CHILDES PoS tags to corresponding Celex PoS tags
    :param separator:           a string indicating the character separating lemmas from PoS tags in the
                                input corpus
    :param uni_phones:          a boolean indicating whether uni-phones are relevant phonetic cues
    :param di_phones:           a boolean indicating whether di-phones are relevant phonetic cues
    :param tri_phones:          a boolean indicating whether tri-phones are relevant phonetic cues
    :param syllable:            a boolean indicating whether syllables are relevant phonetic cues
    :param stress_marker:       a boolean indicating whether to discard or not the stress marker from the
                                Celex phonetic transcriptions
    :param boundaries:          a boolean indicating whether to preserve or discard word boundaries
                                (was undocumented in the original)
    :return all_cues, all_outcomes: two lists of lists, where each inner list contains the cues and the
                                outcomes for each learning event respectively
    """
    # map utterance indices to the percentage of the corpus processed up to the utterance itself
    total = len(corpus[0])
    check_points = {np.floor(total / float(100) * n): n for n in np.linspace(5, 100, 20)}
    all_cues, all_outcomes = [], []

    # for every utterance in the input corpus, remove words with a PoS tag that doesn't belong to the
    # dictionary of PoS mappings; then map valid words to the right PoS tag as indicated by the PoS
    # dictionary
    for i in range(len(corpus[0])):
        words = []
        for j in range(len(corpus[0][i])):
            lemma, pos_tag = corpus[1][i][j].split(separator)
            if pos_tag in pos_dict:
                token = corpus[0][i][j]
                new_tag = pos_dict[pos_tag]
                words.append((token, new_tag, lemma))

        # if there are valid words in the utterance (and not too many), encode it
        if 0 < len(words) <= 12:

            # get the phonetic encoding of the words in the current learning trial:
            # if they can all be encoded using Celex, a list is returned, otherwise a tuple is
            phonological_representations = get_phonetic_encoding(words, celex_dict, tokens2identifiers)

            # if a phonological representation could be found for all words in the utterance, proceed
            if isinstance(phonological_representations, list):
                utterance = concatenate_phonological_representations(phonological_representations,
                                                                     boundaries=boundaries)
                # strip double quotes so they don't end up inside the phonetic cues
                table = str.maketrans(dict.fromkeys('"'))
                utterance = utterance.translate(table)
                n_phones = encode_item(utterance, uniphones=uni_phones, diphones=di_phones,
                                       triphones=tri_phones, syllables=syllable,
                                       stress_marker=stress_marker)
                outcomes = []
                for word in words:
                    token, pos, lemma = word
                    outcomes.append('|'.join([token, lemma, pos]))

                # append the phonetic representation of the current learning event to the list of phonetic
                # representations for the whole corpus, and the lexical meanings of the current learning
                # event to the list of lexical meanings for the whole corpus
                all_cues.append(n_phones)
                all_outcomes.append(outcomes)

        if i in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the input corpus has been processed and encoded in the desired way."
                  % check_points[i])

    return all_cues, all_outcomes