Example #1
import index  # project-local module providing get_probs


def gather_branchpoints(filepath, nrange, backwards=False, ExpectedProbability=False):
    '''
    Compiles the transition probabilities from index.get_probs into probability distributions
    for branch points (prefixes). For each n in the output dict, these probability distributions
    are found under ['beginnings_dict'], while the other entry ['total_ngram_count'] stores the
    total number of n-gram tokens for that n.
    '''
    idct = index.get_probs(filepath, nrange, backwards=backwards)
    out_dict = {}
    for n, ngram_dict in idct.items():
        total_ngram_count = sum(value[1] for value in ngram_dict.values())
        # Group n-grams by their (n-1)-gram prefix: [ngrams, probabilities, counts].
        beginnings_dict = {}
        for ngram, prob_count in ngram_dict.items():
            entry = beginnings_dict.setdefault(ngram[:-1], [[], [], []])
            entry[0].append(ngram)
            entry[1].append(prob_count[0])
            entry[2].append(prob_count[1])
        out_dict[n] = {
            'beginnings_dict': beginnings_dict,
            'total_ngram_count': total_ngram_count}
    return out_dict
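For reference, a minimal sketch of the prefix grouping above, using a hypothetical toy stand-in for one n=2 entry of index.get_probs output (each n-gram tuple mapped to a (probability, count) pair):

toy_ngram_dict = {
    ('a', 'b'): (0.75, 3),
    ('a', 'c'): (0.25, 1),
    ('b', 'a'): (1.0, 2),
}
beginnings = {}
for ngram, (prob, count) in toy_ngram_dict.items():
    # Same grouping as gather_branchpoints: key on the (n-1)-gram prefix.
    entry = beginnings.setdefault(ngram[:-1], [[], [], []])
    entry[0].append(ngram)
    entry[1].append(prob)
    entry[2].append(count)
# beginnings == {('a',): [[('a', 'b'), ('a', 'c')], [0.75, 0.25], [3, 1]],
#                ('b',): [[('b', 'a')], [1.0], [2]]}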
Example #2
import index  # project-local module providing get_probs


def f(filepath1, filepath2, nrange, min_dif=0.1, min_count=10):
    # Collects n-grams whose transition probabilities differ between the two
    # files by at least min_dif, provided the larger of the two counts
    # reaches min_count.
    dict1 = index.get_probs(filepath1, nrange)
    dict2 = index.get_probs(filepath2, nrange)
    result = {}
    for n in range(nrange[0], nrange[1]):
        difdict = {}
        ndict1 = dict1[n]
        ndict2 = dict2[n]
        for ngram in ndict1.keys():
            if ngram in ndict2.keys():
                prob_dif = ndict1[ngram][0] - ndict2[ngram][0]
                count = max([ndict1[ngram][1], ndict2[ngram][1]])
                if abs(prob_dif) >= min_dif and count >= min_count:
                    difdict[ngram] = prob_dif
        result[n] = difdict
    return result
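Judging from how the (probability, count) pairs are used above, a hypothetical toy run of the same filter (min_dif=0.1, min_count=10) behaves like this:

ndict1 = {('a', 'b'): (0.9, 20), ('a', 'c'): (0.1, 20)}
ndict2 = {('a', 'b'): (0.5, 15), ('a', 'c'): (0.5, 15)}
difdict = {}
for ngram in ndict1:
    if ngram in ndict2:
        prob_dif = ndict1[ngram][0] - ndict2[ngram][0]
        count = max(ndict1[ngram][1], ndict2[ngram][1])
        if abs(prob_dif) >= 0.1 and count >= 10:
            difdict[ngram] = prob_dif
# difdict maps the two bigrams to roughly 0.4 and -0.4 (up to float rounding).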
Example #3
import index  # project-local module providing get_probs


def get_ngram_counts(filepath, ngram):
    if isinstance(ngram, str):
        ngram = tuple(ngram)
    nrange = [2, len(ngram) + 2]
    probs_counts_dict = index.get_probs(filepath, nrange)
    relevant_dict = probs_counts_dict[len(ngram)]
    # Each value is a (probability, count) pair; return the count,
    # or a sentinel string if the n-gram never occurs.
    if ngram in relevant_dict:
        return relevant_dict[ngram][1]
    return 'ngram_not_found'
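A hypothetical call, assuming a file './data/example.csv' in the project's format with single-character syllables:

count = get_ngram_counts('./data/example.csv', 'ab')
# Returns the corpus count of the bigram ('a', 'b'), or the sentinel
# string 'ngram_not_found' if the bigram never occurs.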
Example #4
import math

import index  # project-local module providing get_probs


def p_to_ent(filepath, nrange):
    '''
    Calculates probability distributions for the songs in filepath for the nth-order MMs
    included in nrange, then, using each nth-order probability distribution, calculates
    the entropy at each (n-1)-gram. For hapax legomena, returns an H of 0.
    '''
    idct = index.get_probs(filepath, nrange)
    out_dict = {}
    for n, ngram_dict in idct.items():
        if n - 1 not in idct:
            # Skip the lowest order: there is no (n-1)-gram distribution to condition on.
            continue
        # Group n-grams by their (n-1)-gram prefix: [ngrams, probabilities, counts].
        beginnings_dict = {}
        for ngram, prob_count in ngram_dict.items():
            entry = beginnings_dict.setdefault(ngram[:-1], [[], [], []])
            entry[0].append(ngram)
            entry[1].append(prob_count[0])
            entry[2].append(prob_count[1])
        entropy_dict = {}
        for beginning, (ngrams, probabilities, counts) in beginnings_dict.items():
            # Shannon entropy of the transition distribution: H = -sum(p * log2(p)).
            entropy_terms = [p * math.log(p, 2) for p in probabilities]
            entropy_dict[beginning] = (-1 * sum(entropy_terms), sum(counts))
        out_dict[n] = entropy_dict
    return out_dict
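The per-prefix value computed above is the Shannon entropy of the transition distribution, H = -sum(p * log2(p)). A quick sanity check with toy distributions:

import math

print(-sum(p * math.log(p, 2) for p in [0.5, 0.5]))
# 1.0 bit for a fair two-way branch point
print(-sum(p * math.log(p, 2) for p in [1.0]))
# -0.0, i.e. zero bits for a sole continuation, as the docstring notes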
Example #5
import csv
import os

import index  # project-local module providing get_probs


def ngram_size(nrange=[1, 12], directory='./data/BFs_logan/data', threshold=0):
    # Open the output file once and write one row per song file.
    with open("./output/ngrams.csv", 'w') as output_file:
        writer = csv.writer(output_file)
        for filename in os.listdir(directory):
            prob_distrib = index.get_probs(directory + '/' + filename, nrange)
            counts_vec = []
            for n in prob_distrib.keys():
                # Total n-gram tokens at this order, ignoring types below threshold.
                n_count = sum(value[1] for value in prob_distrib[n].values()
                              if value[1] >= threshold)
                counts_vec.append(n_count)
            writer.writerow([filename] + counts_vec)
Example #6
import csv

import entropy  # project-local modules
import index


def ngram_info(fp,
               n,
               target_syllable='all',
               min_count=5,
               probs=False,
               backwards=False,
               ExpectedProbability=False):
    out_list = []
    ngrams = index.get_probs(fp, [n, n + 1], backwards=backwards)[n]
    target_ngrams = []
    for ngram in ngrams.keys():
        if ngram[-1] == target_syllable or target_syllable == 'all':
            # Syllables are single characters, so an n-gram tuple can be
            # flattened to a string and recovered later with tuple().
            ngram_string = ''.join(ngram)
            if ngram_string not in target_ngrams:
                target_ngrams.append(ngram_string)
    for ngram in target_ngrams:
        # Compute the prefix entropy once and reuse both of its components.
        prefix_entropy = entropy.get_ngram_entropy(
            fp,
            ngram[:-1],
            backwards=backwards,
            ExpectedProbability=ExpectedProbability)
        minilist = [ngram[:-1], prefix_entropy[0], prefix_entropy[1]]
        count = ngrams[tuple(ngram)][1]
        prob = ngrams[tuple(ngram)][0]
        if probs:
            minilist.append(prob)
        if count >= min_count:
            out_list.append(minilist)
    with open("./output/target_ngrams.csv", 'w') as output_file:
        writer = csv.writer(output_file)
        writer.writerows(out_list)
    return out_list
Example #7
import csv

import index  # project-local modules
import shuffle


def avg_ent(filepath, nrange, shuffle_mode=False):
    '''
    For each n (Markov order) in the parameter nrange, averages entropy
    across all n-grams, estimating the entropy rate of the songs in filepath.
    '''
    if shuffle_mode:
        shuffle.shuffle(filepath)
        filepath = './output/shuffle.csv'
    idct = index.get_probs(filepath, nrange)
    ndct = p_to_ent(filepath, nrange)  # defined in Example #4
    result = {}
    for n, entropy_dict in ndct.items():
        nminus1gram_dict = idct[n - 1]
        total_ngram_count = sum(value[1] for value in nminus1gram_dict.values())
        # See equation 2.11 in Elements of Information Theory (Cover & Thomas,
        # 2nd ed.): sum the conditional entropy at each prefix, weighted by the
        # marginal probability of that prefix.
        ls = []
        for prefix, (cond_entropy, _count) in entropy_dict.items():
            prefix_marginal = nminus1gram_dict[prefix][1] / total_ngram_count
            ls.append(prefix_marginal * cond_entropy)
        result[n] = sum(ls)
    with open("./output/entropy.csv", 'w') as output_file:
        writer = csv.writer(output_file)
        for key, value in result.items():
            writer.writerow([key, value])
    return result
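Equation 2.11 referenced above is the chain rule for conditional entropy, H(X | Y) = sum over y of p(y) * H(X | Y = y). A toy check of the weighting avg_ent applies:

# One prefix seen 3 times with 1 bit of conditional entropy, another seen
# once with 0 bits: the weighted average is 0.75 * 1.0 + 0.25 * 0.0 = 0.75.
prefix_counts = {('a',): 3, ('b',): 1}
cond_entropy = {('a',): 1.0, ('b',): 0.0}
total = sum(prefix_counts.values())
print(sum(prefix_counts[p] / total * cond_entropy[p] for p in prefix_counts))  # 0.75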
Example #8
import os
import string

import entropy  # project-local modules
import index


def batch_pe(directory='./data/BFs_logan/data', n=2,
             prefix='fathers_and_sons_from_logan - '):
    result = {}
    for filename in os.listdir(directory):
        fp = directory + '/' + filename
        ent_dict = entropy.p_to_ent(fp, [2, n + 1])
        probs_dict = index.get_probs(fp, [2, n + 1])
        # Strip the extension and the shared filename prefix to get the bird ID.
        stem = filename[:-4]
        bird_ID = stem[len(prefix):] if stem.startswith(prefix) else stem
        result[bird_ID] = {}
        for syllable in string.ascii_lowercase:
            try:
                # get_previous_ent is defined elsewhere in the project.
                previous_ent = get_previous_ent(syllable, n, ent_dict, probs_dict)
                print(previous_ent)
                result[bird_ID][syllable] = previous_ent
            except Exception:
                pass
    return result