def encode_corpus(corpus_name, celex_dict, tokens2identifiers, pos_dict, separator='~',
                  uniphones=False, diphones=True, triphones=False, syllables=False,
                  stress_marker=True, outcomes='tokens', boundaries=False):
    """
    Recode a .json corpus into phonetic cues and lexical outcomes using Celex.

    :param corpus_name:         the path indicating the .json file to be used as input corpus
    :param celex_dict:          the Celex dictionary to be used to recode the utterances into phonetic cues
    :param tokens2identifiers:  a dictionary mapping a token surface form from Celex to all token ids
                                linked to it
    :param pos_dict:            a dictionary mapping CHILDES PoS tags to corresponding Celex PoS tags
    :param separator:           a string indicating the character separating lemmas from PoS tags in the
                                input corpus
    :param uniphones:           a boolean indicating whether uni-phones are relevant phonetic cues
    :param diphones:            a boolean indicating whether di-phones are relevant phonetic cues
    :param triphones:           a boolean indicating whether tri-phones are relevant phonetic cues
    :param syllables:           a boolean indicating whether syllables are relevant phonetic cues
    :param stress_marker:       a boolean indicating whether to discard or not the stress marker from the
                                Celex phonetic transcriptions
    :param outcomes:            a string indicating which outcomes to use, whether 'tokens' (default)
                                or 'lemmas'
    :param boundaries:          a boolean indicating whether to preserve or discard word boundaries
    :return encoded_corpus:     the input corpus recoded as a list of lists, where each inner list is a
                                learning event and consists of two sub-lists, the first containing phonetic
                                cues and the second containing lexical outcomes
    :return perc_missed:        the percentage of learning events from the input corpus that could not be
                                recoded because the Celex dictionary didn't contain the necessary information
    :raise ValueError:          if outcomes is neither 'tokens' nor 'lemmas'
    """
    # BUG FIX: the original opened the corpus with mode 'r+' and never closed the file handle;
    # read-only mode inside a context manager is sufficient and leak-free
    with open(corpus_name, 'r') as corpus_in:
        corpus = json.load(corpus_in)

    # map utterance indices to the percentage of the corpus processed up to the utterance itself
    total = len(corpus[0])
    check_points = {np.floor(total / float(100) * n): n for n in np.linspace(5, 100, 20)}
    encoded_corpus = [[], []]
    missed = 0

    # for every utterance in the input corpus, remove words with a PoS tag that doesn't belong to the
    # dictionary of PoS mappings; then map valid words to the right PoS tag as indicated by the PoS dictionary
    for i in range(len(corpus[0])):
        words = []
        for j in range(len(corpus[0][i])):
            lemma, pos_tag = corpus[1][i][j].split(separator)
            if pos_tag in pos_dict:
                token = corpus[0][i][j]
                new_tag = pos_dict[pos_tag]
                words.append((token, new_tag, lemma))

        # if there are valid words in the utterance (and not too many), encode it
        if 0 < len(words) <= 20:

            # get the phonetic encoding of the words in the current learning trial:
            # if they can all be encoded using Celex, a list is returned, otherwise a tuple is
            phonological_representations = get_phonetic_encoding(words, celex_dict, tokens2identifiers)

            # if a phonological representation could be found for all words in the utterance, proceed
            if isinstance(phonological_representations, list):
                utterance = concatenate_phonological_representations(phonological_representations)
                # strip double quotes so they don't end up inside the phonetic cues
                table = str.maketrans(dict.fromkeys('"'))
                utterance = utterance.translate(table)
                n_phones = encode_item(utterance, stress_marker=stress_marker, boundaries=boundaries,
                                       uniphones=uniphones, diphones=diphones,
                                       triphones=triphones, syllables=syllables)
                outcomes_set = set()
                for word in words:
                    token, pos, lemma = word
                    if outcomes == 'tokens':
                        outcomes_set.add('|'.join([token, pos]))
                    elif outcomes == 'lemmas':
                        outcomes_set.add('|'.join([lemma, pos]))
                    else:
                        raise ValueError("Unrecognized specification concerning lexical outcomes. "
                                         "Please, choose either 'tokens' or 'lemmas'.")

                # append the phonetic representation of the current learning event to the list of phonetic
                # representations for the whole corpus, and the lexical meanings of the current learning
                # event to the list of lexical meanings for the whole corpus
                encoded_corpus[0].append(n_phones)
                encoded_corpus[1].append(list(outcomes_set))

            # if the phonological representation of a word from the utterance could not be retrieved from
            # CELEX, count the utterance as missed
            else:
                missed += 1

        if i in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the input corpus has been processed and encoded in the desired way."
                  % check_points[i])

    perc_missed = missed / float(total) * 100
    return encoded_corpus, perc_missed
def categorize(test_items, weights_matrix, cues2ids, outcomes2ids, method='freq', evaluation='count',
               stats=False, k=50, flush=0, threshold=0, uniphones=True, diphones=False,
               triphones=False, syllables=False, stress_marker=False, boundaries=True):
    """
    Tag each test item with the PoS tag best supported by its most active outcomes.

    :param test_items:      an iterable containing strings. Each string is the phonological form of a word
                            together with its PoS tag, separated by a vertical bar ('|')
    :param weights_matrix:  a NumPy array containing the matrix of cue-outcome associations estimated via
                            the ndl module; rows represent cues, columns represent outcomes
    :param cues2ids:        a Python dictionary mapping cues to row indices in the weight matrix
    :param outcomes2ids:    a Python dictionary mapping outcomes to column indices in the weight matrix
    :param method:          how to look at top active outcomes: 'freq' ranks PoS tags according to their
                            frequency among the k top active outcomes; 'sum' ranks PoS tags according to
                            their total activation among the k top active outcomes
    :param evaluation:      how to compare baseline activations to item-triggered ones: 'count' tags the
                            test item with the PoS tag that was more frequent or had highest summed
                            activation within the top active outcomes (frequency or activation is returned
                            and can be correlated to reaction times); 'distr' compares the item-triggered
                            frequency counts or summed activations to those at baseline and tags the item
                            with the PoS tag receiving highest support from the change in distribution
                            (a statistic is returned: Chi-squared for frequency distributions, t-test for
                            summed activations)
    :param stats:           if True, the PoS tag is assigned based on the result of a statistical test
                            (Chi-squared for frequencies: the tag with the highest Pearson standardised
                            residual for the item-triggered distribution; t-test for activations); if
                            False, the PoS tag with the highest positive difference between item-triggered
                            and baseline frequency/activation is chosen.
                            CAVEAT: only meaningful when evaluation is 'distr'
    :param k:               how many elements to consider from the baseline activations and from the
                            activations triggered by each test item (default: top 50)
    :param flush:           how many top active outcomes at baseline to flush away from subsequent
                            computations; high-frequency outcomes may come out as most active whatever the
                            input cues, so it can make sense to discard them
    :param threshold:       the minimum activation of an outcome to be considered in the list of top
                            activated neighbors; default is 0 and shouldn't be lowered, but can be increased
    :param uniphones:       a boolean indicating whether single phonemes encode test items
    :param diphones:        a boolean indicating whether sequences of two phonemes encode test items
    :param triphones:       a boolean indicating whether sequences of three phonemes encode test items
    :param syllables:       a boolean indicating whether syllables encode test items
    :param stress_marker:   a boolean indicating whether stress markers are preserved in the encoding
    :param boundaries:      a boolean specifying whether word boundaries are preserved
    :return log_dict:       a dictionary mapping each test item to the predicted PoS tag, the statistic used
                            to choose it, noun/verb frequencies and activations among the top outcomes, and
                            the k top active outcomes given the item's phonetic cues
    :raise ValueError:      if any element of test_items is not a string
    """
    to_filter = set()
    baseline_activations = compute_outcomes_activations(cues2ids.keys(), weights_matrix,
                                                        cues2ids, outcomes2ids, to_filter)
    sorted_baseline_activations = sorted(baseline_activations.items(),
                                         key=operator.itemgetter(1), reverse=True)

    # if top active outcomes at baseline need to be flushed away, store flushed outcomes in a set and keep
    # the remaining outcomes in a list of tuples
    if flush:
        to_filter = {outcome[0] for outcome in sorted_baseline_activations[:flush]}
        sorted_baseline_activations = sorted_baseline_activations[flush:]

    # compute baseline frequency distribution over PoS tags and PoS summed activation over the k most
    # active outcomes given all input cues at once
    pos_freq_baseline, pos_act_baseline = get_frequency_and_activation_for_each_pos(
        sorted_baseline_activations[:k])

    hits = 0
    total = 0
    total_items = len(test_items)
    check_points = {int(np.floor(total_items / 100 * n)): n for n in np.linspace(5, 100, 20)}
    log_dict = defaultdict(dict)
    tags = set()

    for item in test_items:

        # BUG FIX: the original constructed the ValueError without raising it, and only after calling
        # item.split() — so the guard could never fire; validate the type first and actually raise
        if not isinstance(item, str):
            raise ValueError("The input items must consist of strings: check your input file!")
        tags.add(item.split("|")[1])

        # split the test token from its Part-of-Speech and encode it in nphones
        word, target_pos = item.split('|')
        if boundaries:
            word = '+' + word + '+'
        nphones = encode_item(word, uniphones=uniphones, diphones=diphones, triphones=triphones,
                              syllables=syllables, stress_marker=stress_marker)

        # compute outcome activations given the phonetic cues in the test item, sort the outcomes by
        # outcome activation value and pick the top k
        outcome_activations = compute_outcomes_activations(nphones, weights_matrix, cues2ids,
                                                           outcomes2ids, to_filter)
        sorted_outcome_activations = sorted(outcome_activations.items(),
                                            key=operator.itemgetter(1), reverse=True)[:k]

        if sorted_outcome_activations[0][1] == 0:
            # if the activation of the first item is 0 it means that no phonetic cue from the test item was
            # ever encountered in the corpus, and it is thus impossible to estimate which are the most
            # active outcomes given the phonetic cues of which the test item consists
            top_pos, value, n_freq, v_freq, n_act, v_act = ('-', 0, 0, 0, 0, 0)
        else:
            # make sure that most active outcomes all have activation values higher than the threshold
            if not sorted_outcome_activations[-1][1] > threshold:
                sorted_outcome_activations = [(outcome, act) for outcome, act
                                              in sorted_outcome_activations if act > threshold]

            # compute frequency distribution over PoS tags and PoS summed activation over the k most active
            # outcomes given the test item, then get how many verbs and nouns there were among the k top
            # active outcomes
            pos_freq_item, pos_act_item = get_frequency_and_activation_for_each_pos(
                sorted_outcome_activations)
            n_freq = pos_freq_item['N']
            v_freq = pos_freq_item['V']
            n_act = pos_act_item['N']
            v_act = pos_act_item['V']

            # get the most likely PoS tag for the test item given the k top active outcomes
            top_pos, value = pick_pos(pos_freq_item, pos_act_item, pos_freq_baseline, pos_act_baseline,
                                      evaluation=evaluation, method=method, stats=stats)

        # log the test item (with the correct PoS tag), the PoS tag assigned by the model, the statistic
        # computed to assign the chosen PoS (frequency/activation count or difference) and the k top active
        # nodes given the test item (dict() replaces a comprehension that shadowed parameter k)
        log_dict[item] = {'predicted': top_pos,
                          'value': value,
                          'n_freq': n_freq,
                          'v_freq': v_freq,
                          'n_act': n_act,
                          'v_act': v_act,
                          'items': dict(sorted_outcome_activations)}

        # compare the predicted and true PoS tag and increment the count of hits if they match
        if top_pos == target_pos:
            hits += 1
        total += 1
        if total in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the test items have been processed." % check_points[total])

    return log_dict
def map_phonology(corpus_file, mapping_file, output_file, celex_dir, compounds=True,
                  reduced=False, minimalist=True):
    """
    Map each token|lemma|pos1|pos2 tuple in the input corpus to its Celex triphone encoding.

    :param corpus_file:  the path to a .txt file containing one utterance per line, with all words in the
                         utterance separated by a comma and each word being a tuple consisting of four
                         pipe-separated elements, token|lemma|PoS1|PoS2, where PoS1 is the coarse Celex
                         tag and PoS2 is the tag provided by the TreeTagger
    :param mapping_file: the path to a .txt file where the output of the process will be written to
    :param output_file:  the path to a .txt file where the lines from the input will be rewritten as
                         comma-separated sequences of pipe-separated 5-tuples consisting of token, lemma,
                         pos1, pos2, 3phones
    :param celex_dir:    the directory where the Celex dictionary is to be found
    :param compounds:    a boolean. If True, all entries in Celex are considered; if False, entries which
                         contain spaces are discarded
    :param reduced:      a boolean forwarded to get_celex_dictionary; presumably selects reduced
                         phonological forms from Celex — TODO confirm against that helper's docs
                         (was undocumented in the original)
    :param minimalist:   a boolean. It specifies whether lemmas in the output file should be differentiated
                         when their phonetic realization changes depending on the part of speech: if
                         minimalist is True, lemmas are not differentiated (default); if it is False,
                         lemmas are differentiated by appending pos1 to the lemma, separated by a colon
    :return mapping:     a dictionary mapping 4-tuples token|lemma|pos1|pos2 to the matching triphones
    """
    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced, compounds=compounds)
    tokens2identifiers = tokens2ids(celex_dict)
    mapping = {}
    lemma2phon = defaultdict(dict)
    new_corpus = []

    with open(corpus_file, 'r') as fr:
        for line in fr:
            words = line.strip().split(',')
            new_line = []
            for word in words:
                if word:
                    # some input lines lack the TreeTagger tag: fall back to a default derived from pos1
                    try:
                        token, lemma, pos1, pos2 = word.split('|')
                    except ValueError:
                        token, lemma, pos1 = word.split('|')
                        pos2 = 'NN' if pos1 == 'N' else pos1

                    new_token, new_lemma = adjust_apostrophes(token, lemma)
                    # '=' and '_' are swapped here and restored below for the output forms
                    new_token = new_token.replace('=', '_')
                    new_lemma = new_lemma.replace('=', '_')
                    token_phonological_form = get_phonetic_encoding(
                        [(new_token, pos1, new_lemma)], celex_dict, tokens2identifiers)
                    lemma_phonology = get_phonetic_encoding(
                        [(new_lemma, pos1, new_lemma)], celex_dict, tokens2identifiers)
                    # fall back to the token's phonology when the lemma could not be encoded as a list
                    lemma_phonological_form = ''.join(lemma_phonology) \
                        if isinstance(lemma_phonology, list) else ''.join(token_phonological_form)

                    # only proceed when the token itself could be phonetically encoded
                    if isinstance(token_phonological_form, list):
                        triphones = encode_item(token_phonological_form[0], triphones=True,
                                                stress_marker=True, uniphones=False,
                                                diphones=False, syllables=False)
                        deriv = code_derivational_morphology(pos2)
                        output_token = token.replace('_', '=')
                        output_lemma = lemma.replace('_', '=')
                        morpho = 'COMPOUND' if '=' in output_token else 'MONO'
                        key = '|'.join([output_token, output_lemma, pos1, pos2, deriv, morpho,
                                        ':'.join([output_token, pos1])])
                        output_triphones = ';'.join(triphones)
                        mapping[key] = output_triphones
                        # record every PoS tag observed for each phonological realization of the lemma
                        if lemma_phonological_form in lemma2phon[output_lemma]:
                            lemma2phon[output_lemma][lemma_phonological_form].add(pos1)
                        else:
                            lemma2phon[output_lemma][lemma_phonological_form] = {pos1}
                        new_line.append((output_token, ':'.join([output_lemma, pos1]),
                                         pos1, pos2, output_triphones))
            new_corpus.append(new_line)

    write_mapping_file(mapping, mapping_file)
    write_output_corpus(new_corpus, output_file, lemma2phon, minimalist=minimalist)

    return mapping
def jaccard(weight_matrix, cues2ids, outcomes2ids, celex_dict, plots_folder='', stress_marker=True,
            uniphone=False, diphone=False, triphone=True, syllable=False, boundaries=True):
    """
    Compute, for each outcome, the Jaccard coefficient between gold-standard and most active cues.

    :param weight_matrix:   the matrix of cue-outcome association estimated using the ndl model
    :param cues2ids:        a dictionary mapping strings to row indices in the weight_matrix
    :param outcomes2ids:    a dictionary mapping strings to column indices in the weight_matrix
    :param celex_dict:      the dictionary extracted from the celex database
    :param plots_folder:    a string indicating the path to a folder where all the plots and files
                            generated by the function will be stored; it is created if it doesn't exist
    :param stress_marker:   a boolean indicating whether stress markers from the phonological
                            representations of Celex need to be preserved or can be discarded
    :param uniphone:        a boolean indicating whether single phonemes are considered while encoding
                            column identifiers
    :param diphone:         a boolean indicating whether sequences of two phonemes are considered
    :param triphone:        a boolean indicating whether sequences of three phonemes are considered
    :param syllable:        a boolean indicating whether syllables are considered
    :param boundaries:      a boolean specifying whether to consider or not word boundaries
    :return jaccard_coefficients: a dictionary mapping outcome surface forms (strings) to the Jaccard
                            coefficient computed between the gold-standard and most active cues as
                            estimated from the input matrix. Gold-standard cues are extracted from the
                            outcome phonological form according to the specified encoding; a vector of
                            length k (where k is the number of gold-standard cues) is filled with the top
                            k cues for the outcome, looking at raw activation values. The Jaccard
                            coefficient is the proportion between the intersection of the two vectors and
                            their union: the higher the number, the higher the overlap and the better the
                            network was able to discriminate the good cues for an outcome.
    """
    # string identifying files/plots generated by this function
    f_name = 'jaccard'
    token_indices = tokens2ids(celex_dict)
    jaccard_coefficients = {}
    true_cues = {}
    active_cues = {}
    ids2cues = dict(zip(cues2ids.values(), cues2ids.keys()))
    total_items = len(outcomes2ids)
    check_points = {int(np.floor(total_items / 100 * n)): n for n in np.linspace(5, 100, 20)}

    # consider each outcome separately
    for idx, outcome in enumerate(outcomes2ids):
        column_id = outcomes2ids[outcome]
        wordform, pos = outcome.split('|')
        celex_entry = (wordform, pos, wordform)
        word_phon = get_phonological_form(celex_entry, celex_dict, token_indices)

        if isinstance(word_phon, str):
            # BUG FIX: only add boundary markers after confirming a string was retrieved; the original
            # concatenated '+' before the isinstance check, raising TypeError on lookup failure
            if boundaries:
                word_phon = '+' + word_phon + '+'

            # get the relevant phonological cues
            nphones = encode_item(word_phon, stress_marker=stress_marker, uniphones=uniphone,
                                  diphones=diphone, triphones=triphone, syllables=syllable)

            # get the top active phonological cues from the input association matrix given the outcome
            top_active_cues = get_top_active_cues(weight_matrix, column_id, len(nphones), ids2cues)

            # compute the Jaccard coefficient and store correct and predicted cues for every outcome
            set_inters = len(set.intersection(top_active_cues, set(nphones)))
            set_union = len(set.union(top_active_cues, set(nphones)))
            jaccard_coefficients[outcome] = set_inters / set_union
            true_cues[outcome] = nphones
            active_cues[outcome] = top_active_cues

        if idx + 1 in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the outcomes have been processed to estimate the Jaccard coefficient."
                  % check_points[idx + 1])

    if plots_folder:
        # check whether the provided folder path points to an existing folder, create it if it doesn't
        if not os.path.isdir(plots_folder):
            os.makedirs(plots_folder)
        ranked_path = os.path.join(plots_folder, '.'.join(["_".join([f_name, 'list']), 'txt']))
        sorted_coeffs = sorted(jaccard_coefficients.items(), key=operator.itemgetter(1), reverse=True)
        scatter_path = os.path.join(plots_folder, ".".join(["_".join([f_name, 'scatter']), 'pdf']))
        plot_ranks(sorted_coeffs, output_path=scatter_path, yname='Jaccard coeff', xname='Rank',
                   figname='Jaccard coefficient for each outcome')

        # write to file each outcome together with the correct and predicted phonological cues.
        # BUG FIX: the original rebound the loop variable ('outcome = outcome[0]') and then read
        # 'outcome[1]' as the coefficient, which actually yielded the second character of the outcome
        # string; unpack the (outcome, coefficient) pair instead. The file handle no longer shadows f_name.
        with open(ranked_path, 'a+') as out_file:
            for outcome, jaccard_coeff in sorted_coeffs:
                true = true_cues[outcome]
                top_active = active_cues[outcome]
                out_file.write("\t".join([outcome, str(jaccard_coeff), str(true), str(top_active)]))
                out_file.write("\n")

    return jaccard_coefficients
def encode_corpus(corpus, celex_dict, tokens2identifiers, pos_dict, separator='~', uni_phones=False,
                  di_phones=True, tri_phones=False, syllable=False, stress_marker=True, boundaries=False):
    """
    Recode an already-loaded corpus object into lists of phonetic cues and lexical outcomes.

    NOTE(review): this function shares its name with another encode_corpus that takes a file path —
    if both live in the same module the later definition shadows the earlier one; confirm intent.

    :param corpus:              a .json object to be used as input corpus, consisting of two aligned lists
                                of lists, meaning that a second-order list in each first order list refers
                                to a same utterance; the first list contains utterances encoded as lists of
                                tokens, the second list contains utterances encoded as lists of lemmas and
                                PoS tags
    :param celex_dict:          the Celex dictionary to be used to recode the utterances into phonetic cues
                                and lexical outcomes
    :param tokens2identifiers:  a dictionary mapping a token surface form from Celex to all token ids
                                linked to it
    :param pos_dict:            a dictionary mapping CHILDES PoS tags to corresponding Celex PoS tags
    :param separator:           a string indicating the character separating lemmas from PoS tags in the
                                input corpus
    :param uni_phones:          a boolean indicating whether uni-phones are relevant phonetic cues
    :param di_phones:           a boolean indicating whether di-phones are relevant phonetic cues
    :param tri_phones:          a boolean indicating whether tri-phones are relevant phonetic cues
    :param syllable:            a boolean indicating whether syllables are relevant phonetic cues
    :param stress_marker:       a boolean indicating whether to discard or not the stress marker from the
                                Celex phonetic transcriptions
    :param boundaries:          a boolean indicating whether to preserve or discard word boundaries
                                (was undocumented in the original)
    :return all_cues, all_outcomes: two lists of lists, where each inner list contains the cues and the
                                outcomes for each learning event respectively
    """
    # map utterance indices to the percentage of the corpus processed up to the utterance itself
    total = len(corpus[0])
    check_points = {np.floor(total / float(100) * n): n for n in np.linspace(5, 100, 20)}
    all_cues, all_outcomes = [], []

    # for every utterance in the input corpus, remove words with a PoS tag that doesn't belong to the
    # dictionary of PoS mappings; then map valid words to the right PoS tag as indicated by the PoS
    # dictionary
    for i in range(len(corpus[0])):
        words = []
        for j in range(len(corpus[0][i])):
            lemma, pos_tag = corpus[1][i][j].split(separator)
            if pos_tag in pos_dict:
                token = corpus[0][i][j]
                new_tag = pos_dict[pos_tag]
                words.append((token, new_tag, lemma))

        # if there are valid words in the utterance (and not too many), encode it
        if 0 < len(words) <= 12:

            # get the phonetic encoding of the words in the current learning trial:
            # if they can all be encoded using Celex, a list is returned, otherwise a tuple is
            phonological_representations = get_phonetic_encoding(words, celex_dict, tokens2identifiers)

            # if a phonological representation could be found for all words in the utterance, proceed
            if isinstance(phonological_representations, list):
                utterance = concatenate_phonological_representations(phonological_representations,
                                                                     boundaries=boundaries)
                # strip double quotes so they don't end up inside the phonetic cues
                table = str.maketrans(dict.fromkeys('"'))
                utterance = utterance.translate(table)
                n_phones = encode_item(utterance, uniphones=uni_phones, diphones=di_phones,
                                       triphones=tri_phones, syllables=syllable,
                                       stress_marker=stress_marker)
                outcomes = []
                for word in words:
                    token, pos, lemma = word
                    outcomes.append('|'.join([token, lemma, pos]))

                # append the phonetic representation of the current learning event to the list of phonetic
                # representations for the whole corpus, and the lexical meanings of the current learning
                # event to the list of lexical meanings for the whole corpus
                all_cues.append(n_phones)
                all_outcomes.append(outcomes)

        if i in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the input corpus has been processed and encoded in the desired way."
                  % check_points[i])

    return all_cues, all_outcomes