def test_unicode_in_memory():
    """Verify corpus statistics survive replacing words with random Unicode.

    The corpus is processed twice: once as-is and once with the first three
    whitespace-separated fields of every line swapped for random
    five-character Unicode strings (a given word always maps to the same
    replacement).  The line count, the sum of the "quux" column and the sum
    of the third element of every item yielded by ``average_surprisal`` must
    agree between the two runs.
    """
    # NOTE(review): a test with this same name appears twice in this file;
    # the later definition shadows the earlier one -- confirm that is intended.
    replacements = {}

    def random_unicode_word():
        # Five code points, each drawn from 0x300..0x3ff or 0x9999..0x9a98.
        return u"".join(
            unichr(random.choice((0x300, 0x9999)) + random.randint(0, 0xff))
            for _ in xrange(5))

    def scramble(line):
        # Replace up to the first three fields; the last field (the count)
        # is carried over untouched.
        fields = line.split()
        scrambled = []
        for word in fields[:3]:
            if word not in replacements:
                replacements[word] = random_unicode_word()
            scrambled.append(replacements[word])
        return "\t".join(scrambled + [fields[-1]])

    # Baseline run on the unmodified corpus.
    corpus = LineFileInMemory(
        "tests/smallcorpus.txt.bz2",
        header="foo bar baz qux".split(),
        path="tests/tmp/testcorpus")
    corpus.clean(lower=True, alphanumeric=False, count_columns=False,
                 echo_toss=True, lazy=True)
    corpus.make_marginal_column("quux", "foo bar".split(), "qux", lazy=True)
    corpus.sort("baz")
    expected_length = len(corpus)
    expected_counts = corpus.sum_column("quux")
    expected_surprisal = math.fsum(
        row[2] for row in corpus.average_surprisal(
            "baz", "qux", "quux", assert_sorted=True))

    # Second run: same pipeline, but every line goes through scramble().
    corpus = LineFileInMemory(
        "tests/smallcorpus.txt.bz2",
        header="foo bar baz qux".split(),
        path="tests/tmp/testcorpus")
    corpus.clean(lower=True, alphanumeric=False, count_columns=False,
                 echo_toss=True, modifier_fn=scramble, lazy=True)
    corpus.make_marginal_column("quux", "foo bar".split(), "qux", lazy=True)
    corpus.sort("baz")

    scrambled_counts = corpus.sum_column("quux")
    assert_equal(expected_counts, scrambled_counts)
    assert_equal(expected_length, len(corpus))
    scrambled_surprisal = math.fsum(
        row[2] for row in corpus.average_surprisal(
            "baz", "qux", "quux", assert_sorted=True))
    assert_equal(expected_surprisal, scrambled_surprisal)
def test_unicode_in_memory():
    """Compare in-memory corpus statistics before and after Unicode scrambling.

    Runs the clean / marginal-column / sort pipeline on the small test corpus
    twice.  On the second pass a ``modifier_fn`` rewrites each line so that
    its first three whitespace-separated fields become random non-ASCII
    strings, consistently per distinct word.  The test asserts that the
    number of lines, the "quux" column total and the summed third element of
    the ``average_surprisal`` output are the same in both passes.
    """
    # NOTE(review): this test name appears twice in this file, so only the
    # later definition is visible at module level -- confirm that is intended.
    word_to_garbage = {}

    # Keyword arguments common to both clean() calls; only the second call
    # additionally supplies modifier_fn.
    clean_options = dict(lower=True, alphanumeric=False,
                         count_columns=False, echo_toss=True, lazy=True)

    def random_code_points():
        # Yields five characters, each offset 0..0xff from 0x300 or 0x9999.
        for _ in xrange(5):
            base = random.choice((0x300, 0x9999))
            yield unichr(base + random.randint(0, 0xff))

    def scramble(line):
        parts = line.split()
        words, count = parts[:3], parts[-1]
        for position, word in enumerate(words):
            try:
                garbage = word_to_garbage[word]
            except KeyError:
                # First sighting of this word: invent and remember a stand-in.
                garbage = u"".join(random_code_points())
                word_to_garbage[word] = garbage
            words[position] = garbage
        return "\t".join(words + [count])

    # --- Pass 1: corpus as stored on disk ---
    G = LineFileInMemory("tests/smallcorpus.txt.bz2",
                         header="foo bar baz qux".split(),
                         path="tests/tmp/testcorpus")
    G.clean(**clean_options)
    G.make_marginal_column("quux", "foo bar".split(), "qux", lazy=True)
    G.sort("baz")
    plain_length = len(G)
    plain_counts = G.sum_column("quux")
    plain_rows = G.average_surprisal("baz", "qux", "quux", assert_sorted=True)
    plain_surprisal = math.fsum(item[2] for item in plain_rows)

    # --- Pass 2: corpus with scrambled words ---
    G = LineFileInMemory("tests/smallcorpus.txt.bz2",
                         header="foo bar baz qux".split(),
                         path="tests/tmp/testcorpus")
    G.clean(modifier_fn=scramble, **clean_options)
    G.make_marginal_column("quux", "foo bar".split(), "qux", lazy=True)
    G.sort("baz")

    assert_equal(plain_counts, G.sum_column("quux"))
    assert_equal(plain_length, len(G))
    scrambled_rows = G.average_surprisal(
        "baz", "qux", "quux", assert_sorted=True)
    assert_equal(plain_surprisal,
                 math.fsum(item[2] for item in scrambled_rows))