import csv
import math
import os
import string

import index    # project-local module: provides get_probs()
import entropy  # project-local module: provides get_ngram_entropy() and p_to_ent()
import shuffle  # project-local module: provides shuffle()


def gather_branchpoints(filepath, nrange, backwards=False, ExpectedProbability=False):
    '''
    Compiles the transition probabilities from index.get_probs into
    probability distributions for branch points (prefixes). For each n in the
    output dict, the set of probability distributions is found at
    ['beginnings_dict'], while the other entry ['total_ngram_count'] stores
    the total number of ngram tokens for that n.
    '''
    # ExpectedProbability is accepted for signature parity but unused here.
    idct = index.get_probs(filepath, nrange, backwards=backwards)
    out_dict = {}
    for n in idct.keys():
        ngram_dict = idct[n]
        # Total number of ngram tokens at this order (used for marginals downstream).
        total_ngram_count = sum(value[1] for value in ngram_dict.values())
        # Group ngrams by their (n-1)-gram prefix: [ngrams, conditional probs, counts].
        beginnings_dict = {}
        for ngram, prob_count in ngram_dict.items():
            entry = beginnings_dict.setdefault(ngram[:-1], [[], [], []])
            entry[0].append(ngram)
            entry[1].append(prob_count[0])
            entry[2].append(prob_count[1])
        out_dict[n] = {
            'beginnings_dict': beginnings_dict,
            'total_ngram_count': total_ngram_count}
    return out_dict
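

# Usage sketch (the CSV path below is hypothetical; any file readable by
# index.get_probs works). Shows the returned shape:
# out[n]['beginnings_dict'][prefix] == [ngrams, conditional probs, counts].
def _demo_gather_branchpoints(fp='./data/example.csv'):
    branchpoints = gather_branchpoints(fp, [2, 3])
    for prefix, (ngrams, probs, counts) in branchpoints[2]['beginnings_dict'].items():
        print(prefix, list(zip(ngrams, probs, counts)))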


def f(filepath1, filepath2, nrange, min_dif=0.1, min_count=10):
    '''
    Compares transition probabilities between two files. For each n in
    nrange, returns the ngrams found in both files whose probabilities differ
    by at least min_dif and whose count, in whichever file has more tokens of
    that ngram, is at least min_count.
    '''
    dict1 = index.get_probs(filepath1, nrange)
    dict2 = index.get_probs(filepath2, nrange)
    result = {}
    for n in range(nrange[0], nrange[1]):
        difdict = {}
        ndict1 = dict1[n]
        ndict2 = dict2[n]
        for ngram in ndict1.keys():
            if ngram in ndict2:
                prob_dif = ndict1[ngram][0] - ndict2[ngram][0]
                count = max(ndict1[ngram][1], ndict2[ngram][1])
                if abs(prob_dif) >= min_dif and count >= min_count:
                    difdict[ngram] = prob_dif
        result[n] = difdict
    return result
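

# Usage sketch (both paths are hypothetical). Positive values mean the ngram
# is more probable in the first file.
def _demo_compare_probs(fp1='./data/bird_a.csv', fp2='./data/bird_b.csv'):
    diffs = f(fp1, fp2, [2, 3], min_dif=0.2)
    for ngram, prob_dif in diffs[2].items():
        print(ngram, round(prob_dif, 3))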


def get_ngram_counts(filepath, ngram):
    '''
    Looks up the token count of a single ngram in filepath. String ngrams are
    converted to tuples of single-character syllables to match the keys
    returned by index.get_probs.
    '''
    if isinstance(ngram, str):
        ngram = tuple(ngram)
    nrange = [len(ngram), len(ngram) + 1]  # only the order we need
    relevant_dict = index.get_probs(filepath, nrange)[len(ngram)]
    if ngram in relevant_dict:
        return relevant_dict[ngram][1]  # value is (probability, count)
    return 'ngram_not_found'
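

# Usage sketch (hypothetical path). Prints the count of the syllable
# sequence ('a', 'b'), or 'ngram_not_found' if it never occurs.
def _demo_get_ngram_counts(fp='./data/example.csv'):
    print(get_ngram_counts(fp, 'ab'))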


def p_to_ent(filepath, nrange):
    '''
    Calculates probability distributions for the songs in filepath for the
    nth-order MMs included in nrange, then, using each nth-order prob distr,
    calculates the entropy at each (n-1)-gram:
        H(prefix) = -sum_x p(x | prefix) * log2(p(x | prefix))
    For hapax legomena, returns an H of 0.
    '''
    idct = index.get_probs(filepath, nrange)
    out_dict = {}
    for n in idct.keys():
        # The (n-1)-gram order must also be present; avg_ent pairs these
        # entropies with the (n-1)-gram marginals.
        if n - 1 not in idct:
            continue
        ngram_dict = idct[n]
        # Group ngrams by their (n-1)-gram prefix: [ngrams, conditional probs, counts].
        beginnings_dict = {}
        for ngram, prob_count in ngram_dict.items():
            entry = beginnings_dict.setdefault(ngram[:-1], [[], [], []])
            entry[0].append(ngram)
            entry[1].append(prob_count[0])
            entry[2].append(prob_count[1])
        entropy_dict = {}
        for beginning, (ngrams, probs, counts) in beginnings_dict.items():
            # Shannon entropy of the conditional distribution at this prefix.
            entropy_terms = [p * math.log(p, 2) for p in probs]
            entropy_dict[beginning] = (-1 * sum(entropy_terms), sum(counts))
        out_dict[n] = entropy_dict
    return out_dict
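

# Worked sketch (hypothetical path): for a prefix ('a',) followed by 'b' with
# p = 0.75 and by 'c' with p = 0.25,
# H = -(0.75*log2(0.75) + 0.25*log2(0.25)) ~= 0.811 bits.
def _demo_p_to_ent(fp='./data/example.csv'):
    ents = p_to_ent(fp, [1, 4])  # include n=1 so n=2 entropies are computed
    for prefix, (h, count) in ents[2].items():
        print(prefix, round(h, 3), count)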


def ngram_size(nrange=[1, 12], directory='./data/BFs_logan/data', threshold=0):
    '''
    For every file in directory, writes one CSV row of ngram token counts,
    one column per n in nrange, counting only ngram types whose token count
    meets threshold.
    '''
    with open('./output/ngrams.csv', 'w') as output_file:
        writer = csv.writer(output_file)
        for filename in os.listdir(directory):
            prob_distrib = index.get_probs(directory + '/' + filename, nrange)
            row = [filename]
            for n in prob_distrib.keys():
                n_count = sum(value[1] for value in prob_distrib[n].values()
                              if value[1] >= threshold)
                row.append(n_count)
            writer.writerow(row)
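

# Usage sketch (default paths come from the signature; ./output must exist).
def _demo_ngram_size():
    ngram_size(nrange=[1, 6], threshold=2)  # count only types seen >= 2 times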


def ngram_info(fp, n, target_syllable='all', min_count=5, probs=False,
               backwards=False, ExpectedProbability=False):
    '''
    For every ngram of order n ending in target_syllable (or every ngram when
    target_syllable == 'all'), collects the prefix, the entropy data from
    entropy.get_ngram_entropy, and optionally the ngram's probability, then
    writes the rows meeting min_count to ./output/target_ngrams.csv.
    '''
    out_list = []
    ngrams = index.get_probs(fp, [n, n + 1], backwards=backwards)[n]
    # Deduplicate via strings; assumes single-character syllables, as the
    # tuple(ngram) lookup below does.
    target_ngrams = []
    for ngram in ngrams.keys():
        if ngram[-1] == target_syllable or target_syllable == 'all':
            ngram_string = ''.join(ngram)
            if ngram_string not in target_ngrams:
                target_ngrams.append(ngram_string)
    for ngram in target_ngrams:
        ent = entropy.get_ngram_entropy(
            fp, ngram[:-1], backwards=backwards,
            ExpectedProbability=ExpectedProbability)
        minilist = [ngram[:-1], ent[0], ent[1]]
        prob = ngrams[tuple(ngram)][0]
        count = ngrams[tuple(ngram)][1]
        if probs:
            minilist.append(prob)
        if count >= min_count:
            out_list.append(minilist)
    with open('./output/target_ngrams.csv', 'w') as output_file:
        writer = csv.writer(output_file)
        for minilist in out_list:
            writer.writerow(minilist)
    return out_list
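

# Usage sketch (hypothetical path): each row is [prefix, entropy, count(, prob)]
# for a trigram ending in 'c' that occurs at least 5 times.
def _demo_ngram_info(fp='./data/example.csv'):
    for row in ngram_info(fp, 3, target_syllable='c', probs=True):
        print(row)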


def avg_ent(filepath, nrange, shuffle_mode=False):
    '''
    For each n (Markov order) in the parameter nrange, averages entropy
    across all ngrams, estimating the entropy rate of the songs in filepath.
    See equation 2.11 in Elements of Information Theory (Cover & Thomas,
    2nd ed.): sum the conditional entropies at each prefix, each weighted by
    the marginal probability of that prefix:
        H(X_n | X_1 ... X_{n-1}) = sum_prefix p(prefix) * H(X_n | prefix)
    '''
    if shuffle_mode:
        shuffle.shuffle(filepath)
        filepath = './output/shuffle.csv'
    idct = index.get_probs(filepath, nrange)
    ndct = p_to_ent(filepath, nrange)
    result = {}
    for n, prefix_entropies in ndct.items():
        nminus1gram_dict = idct[n - 1]
        # Total (n-1)-gram tokens, the denominator of the prefix marginals.
        total_ngram_count = sum(value[1] for value in nminus1gram_dict.values())
        terms = []
        for prefix, ent_count in prefix_entropies.items():
            prefix_marginal = nminus1gram_dict[prefix][1] / total_ngram_count
            terms.append(prefix_marginal * ent_count[0])
        result[n] = sum(terms)
    with open('./output/entropy.csv', 'w') as output_file:
        writer = csv.writer(output_file)
        for key, value in result.items():
            writer.writerow([key, value])
    return result
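

# Worked sketch (hypothetical path): with unigram prefixes ('a',) [p = 0.5,
# H = 1.0 bits] and ('b',) [p = 0.5, H = 0.0], the order-2 estimate is
# 0.5 * 1.0 + 0.5 * 0.0 = 0.5 bits per syllable.
def _demo_avg_ent(fp='./data/example.csv'):
    for n, h in avg_ent(fp, [1, 5]).items():
        print(n, round(h, 3))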


def batch_pe(directory='./data/BFs_logan/data', n=2,
             prefix='fathers_and_sons_from_logan - '):
    '''
    For every file in directory, strips the filename prefix and the .csv
    extension to get a bird ID, then records the entropy preceding each
    lowercase syllable via get_previous_ent (defined elsewhere in this
    project).
    '''
    result = {}
    for filename in os.listdir(directory):
        fp = directory + '/' + filename
        ent_dict = entropy.p_to_ent(fp, [2, n + 1])
        probs_dict = index.get_probs(fp, [2, n + 1])
        # Filename layout: '<prefix><bird_ID>.csv'.
        bird_ID = filename[:-4].replace(prefix, '')
        result[bird_ID] = {}
        for syllable in string.ascii_lowercase:
            try:
                result[bird_ID][syllable] = get_previous_ent(
                    syllable, n, ent_dict, probs_dict)
            except Exception:
                pass  # skip syllables absent from this bird's song
    return result
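

# Usage sketch (defaults come from the signature). The result maps
# bird ID -> syllable -> entropy preceding that syllable.
def _demo_batch_pe():
    for bird, syllables in batch_pe(n=2).items():
        print(bird, syllables.get('a'))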