def heap(corp, rng):
    """Heaps'-law style curve: vocabulary size per subsample size.

    For every token count in *rng*, draws a subsample of that many
    tokens from *corp* and records the vocabulary size of the result.

    Returns a list of vocabulary sizes, one per entry of *rng*, in order.
    """
    return [
        compute_vocab_size(Sentences(sent_subsample(corp, ntoks)))
        for ntoks in rng
    ]
def get_filters(filter_dir, k, names, param_name, param_ls):
    """Load pickled filtered corpora, grouped by one varying parameter.

    Parameters
    ----------
    filter_dir : str
        Directory holding the pickled corpora.
    k : sample-size value that the "k" name field must match.
    names : list of name fields encoded in the pickle file names.
    param_name : str
        Which name field distinguishes the groups (e.g. "h" or "f").
    param_ls : iterable of parameter values to collect.

    Returns
    -------
    dict mapping each value in *param_ls* to a list of Sentences built
    from the corpora whose name fields match ``k`` and that value.
    """
    # Scan the pickle directory ONCE. The original re-read every pickle
    # for each parameter value, which is pure redundant I/O; materialize
    # to a list so the samples can be iterated once per parameter.
    all_samples = list(corpora_from_pickles(filter_dir, names=names))
    filters_dict = {}
    for param in param_ls:
        filters_dict[param] = [
            Sentences(c)
            for name_d, c in all_samples
            if name_d["k"] == k and name_d[param_name] == param
        ]
    return filters_dict
def filter_worker(i):
    """Worker *i*: draw one typicality-filtered corpus and pickle it.

    NOTE(review): reads the module globals mp_array, zipf_model,
    rank_dict, auto_typ, n, factor, epsilon_f_minus, lt and lang —
    presumably set up before workers are spawned; confirm at call site.
    """
    print("started ", i)
    # Seed from OS entropy so concurrently spawned workers do not
    # share (fork-inherited) RNG state.
    seed = int.from_bytes(os.urandom(4), byteorder='little')
    rand.seed(seed)
    threshold = factor*epsilon_f_minus
    filtered = list(filter_typicality_incremental(
        mp_array, zipf_model, rank_dict, auto_typ, n, threshold, lt))
    joint = merge_to_joint(rank_dict, compute_freqs(Sentences(filtered)))
    print("filtered ", i, " typicality: ", typicality(zipf_model, joint))
    run_name = "_".join((str(n), str(factor), str(i)))
    corpus_to_pickle(filtered, "results/" + lang + "/TF", run_name)
# --- Typicality-filter (TF) sampling driver ---------------------------------
# Assumes lang, n, factor and lt are already in scope (presumably parsed
# from the command line earlier in this file — TODO confirm).
setup_m = 100  # number of samples used to estimate the typicality model
m = 10         # number of filtered corpora to generate
wiki = list(wiki_from_pickles("data/"+lang+"_pkl"))
sents = [s for a in wiki for s in a]  # flatten articles into one sentence list
# Fit the Zipf model and typicality statistics on the full corpus.
zipf_model, rank_dict, mean_typ, std_typ, auto_typ = setup_filtering(wiki, big_n(wiki), n, setup_m)
# Epsilon band around the corpus's own typicality, scaled by `factor`.
mean_corrected = abs(mean_typ - auto_typ)
epsilon_f_plus = mean_corrected + std_typ*factor
epsilon_f_minus = - epsilon_f_plus
print("\nModel and Epsilon established")
print(auto_typ, mean_typ, std_typ)
print(epsilon_f_minus, epsilon_f_plus)
# Draw m independent filtered corpora and pickle each under results/<lang>/TF.
for m_i in range(m):
    print("started ", m_i)
    filtered = list(filter_typicality_incremental(sents, zipf_model, rank_dict, auto_typ, n, epsilon_f_minus, lt))
    filtered_freqs = compute_freqs(Sentences(filtered))
    print("filtered ", m_i, " typicality: ", typicality(zipf_model, merge_to_joint(rank_dict, filtered_freqs)))
    name = "_".join((str(n), str(factor), str(m_i)))
    corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
# --- Evaluation driver: compare SRF / TF / UNI samples ----------------------
# Assumes lang, factors and hist_lens are already defined (presumably CLI
# arguments parsed earlier — TODO confirm).
print("ARGS: ", lang, factors, hist_lens, "\n")
d = "results/" + lang + "/"
results_d = d + "evaluation/"
k = 1000000  # sample size (in tokens) the pickled corpora were drawn at

# Load filtered corpora grouped by history length (SRF) / factor (TF).
srfs = get_filters(d + "SRF/", k, ["k", "h", "i"], "h", hist_lens)
tfs = get_filters(d + "TF/", k, ["k", "f", "i"], "f", factors)

# Keep only the three largest parameter values for the detailed plots.
highest_three_factors = factors[-3:]
three_tfs = {k: tfs[k] for k in highest_three_factors}
highest_three_hist_lens = hist_lens[-3:]
three_srfs = {k: srfs[k] for k in highest_three_hist_lens}

# Uniform random samples serve as the baseline.
unis = [ Sentences(c) for _, c in corpora_from_pickles(d + "UNI", names=["k", "i"]) ]
uni_mean_ranks, uni_mean_freqs = mean_rank_freq_from_samples(unis)
uni_joints = merge_to_joint(uni_mean_ranks, uni_mean_freqs)
uni_xs, uni_ys = list(zip(*sorted(uni_joints.values())))

print("filters loaded", flush=True)

# MLEs
tf_mles, srf_mles, uni_mandel = do_mles(tfs, srfs, unis)
with open(results_d + "mle_mandelbrot.txt", "w") as handle:
    # NOTE(review): this loop appears truncated in the visible chunk —
    # presumably further writes (the mandel parameters, srf/uni results)
    # follow; verify against the full file.
    for param, mandel in tf_mles.items():
        handle.write("TF " + str(param))
if __name__ == "__main__": n = 100000 d = "results/ALS/" # GET UNIVERSE wiki = list(wiki_from_pickles("data/ALS_pkl")) sent_d, label_f = number_sents((s for a in wiki for s in a)) word_d, word_label_f = number_words((w for a in wiki for s in a for w in s)) ## LOAD CORPORA # SRFs srf_samples = list(corpora_from_pickles(d + "SRF", names=["n", "h", "i"])) srf_10 = [ Sentences(c) for name_d, c in srf_samples if name_d["n"] == n and name_d["h"] == 10 ] srf_20 = [ Sentences(c) for name_d, c in srf_samples if name_d["n"] == n and name_d["h"] == 20 ] srf_30 = [ Sentences(c) for name_d, c in srf_samples if name_d["n"] == n and name_d["h"] == 30 ] #TFs tf_samples = list(corpora_from_pickles(d + "TF", names=["n", "f", "i"])) tf_50 = [ Sentences(c) for name_d, c in tf_samples if name_d["n"] == n and name_d["f"] == 50
c_vec1, c_vec2 = [cs1[x] for x in sorted(universe)], [cs2[x] for x in sorted(universe)] return sum(min(one, two) for one, two in zip(c_vec1, c_vec2))/sum(max(one, two) for one, two in zip(c_vec1, c_vec2)) if __name__ == "__main__": n = 100000 d = "results/ALS/" wiki = list(wiki_from_pickles("data/ALS_pkl")) print("Total num sents", len([s for a in wiki for s in a])) srf_samples = corpora_from_pickles(d + "SRF", names=["n", "h", "i"]) srf_30 = [Sentences(c) for name_d, c in srf_samples if name_d["n"] == n and name_d["h"] == 30] tf_samples = corpora_from_pickles(d + "TF", names=["n", "f", "i"]) tf_100 = [Sentences(c) for name_d, c in tf_samples if name_d["n"] == n and name_d["f"] == 100] uni_samples = corpora_from_pickles(d + "UNI", names=["n", "i"]) uni = [Sentences(c) for name_d, c in uni_samples if name_d["n"] == n] for subcorp_set, name in zip([srf_30, tf_100, uni], ["SRF", "TF", "UNI"]): print("\n", name) shuffled_sents = rand.permutation([s for subcorp in subcorp_set