コード例 #1
0
def test():
    """Print shared words that are near in usage and also perfect matches.

    Loads the word data for 'pg11.txt' and 'pg42.txt', collects the words
    both data sets share, prints the result of ``perfect_matches`` on their
    absolute ratings, and then prints every word returned by
    ``all_near_usage_value`` (threshold ``near_value``) that also appears
    in those perfect matches.
    """
    alice_data = map_words(read_file('pg11.txt', open_, encoding='utf-8'))
    jekyll_data = map_words(read_file('pg42.txt', open_, encoding='utf-8'))

    words = shared_words([alice_data, jekyll_data])

    near_value = 0.001
    usage = usage_ratings(words, alice_data, jekyll_data)
    all_near_usage = all_near_usage_value(usage, near_value)

    absolute = absolute_ratings(words, alice_data, jekyll_data)

    # `absolute` does not change below, so the matches are computed once and
    # reused for both the summary print and the membership checks instead of
    # being recomputed for every word in the loop.
    # NOTE(review): assumes perfect_matches() is side-effect free and returns
    # a re-iterable container (list/set/dict) -- confirm against its
    # definition.
    matches = perfect_matches(absolute)

    # Single-argument print(...) produces identical output with the
    # Python 2 print statement and the Python 3 print function.
    print(matches)

    for word in all_near_usage:
        if word in matches:
            print(word)
コード例 #2
0
def export_all():
    """Merge the word data of the local '.txt' files into 'all2.csv'.

    Every entry of the current directory whose name ends with '.txt' and
    does not contain 'all' is read as UTF-8 and handed to ``map_words``
    together with one shared ``data`` dict (presumably filled in place --
    the return value of ``map_words`` is not used here). The collected
    ``data`` is then written with ``export_data``.

    Files that raise ``UnicodeDecodeError`` are skipped, and the skip is
    reported on stdout so an incomplete export is noticeable.
    """
    data = {}

    for f in listdir('.'):
        # Guard clause: only '.txt' files whose name does not contain 'all'.
        if not f.endswith('.txt') or 'all' in f:
            continue

        print(f)
        try:
            # Both calls stay inside the try: depending on how read_file
            # hands back its content, the decode error may only surface
            # while map_words consumes it.
            read_data = read_file(f, codecs.open, encoding='utf-8')
            map_words(read_data, data)
        except UnicodeDecodeError as err:
            # Best effort: one undecodable file must not abort the whole
            # export, but it should not vanish silently either.
            print('skipped %s: %s' % (f, err))

    export_data(data, 'all2.csv', codecs.open, encoding='utf-8')
コード例 #3
0
        if f.endswith('.txt') and 'all' not in f:
            print f
            try:
                read_data = read_file(f, codecs.open, encoding='utf-8')
                map_words(read_data, data)
            except UnicodeDecodeError:
                pass
            

    export_data(data, 'all2.csv', codecs.open, encoding='utf-8')


if __name__ == '__main__':

    # Disabled alternative run (single-file export), kept for reference:
    #   data = read_file('svenska.txt', codecs.open, encoding='utf-8')
    #   data = map_words(data)
    #   export_data(data,'svenska.csv', codecs.open, encoding='utf-8')

    # Load and map both texts, first 'pg11.txt', then 'pg42.txt'.
    alice_text = read_file('pg11.txt', codecs.open, encoding='utf-8')
    alice_data = map_words(alice_text)
    jekyll_text = read_file('pg42.txt', codecs.open, encoding='utf-8')
    jekyll_data = map_words(jekyll_text)

    # Rate only the words that occur in both data sets.
    words = shared_words([alice_data, jekyll_data])
    absolute = absolute_ratings(words, alice_data, jekyll_data)
    usage = usage_ratings(words, alice_data, jekyll_data)

    # Write the shared words with their usage and absolute ratings.
    export_shared_words(
        words,
        usage,
        absolute,
        'output_alice_jekyll_mix.csv',
        codecs.open,
        encoding='utf-8'
    )