def test_counts_dump_2(germaparl):
    """Check ``counts.dump`` with strategy 2 on the "Helmut Kohl" query.

    Exercises one and two p-attributes, once with ``split=True`` (results
    looked up by single token) and once with ``split=False`` (results
    looked up by the whole match).
    """
    strategy = 2
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    matches = germaparl_corpus.dump_from_query(
        '[lemma="Helmut"%cd] [lemma="Kohl"%cd]'
    )

    def count(p_atts, split):
        # All calls below share the same dump and counting strategy.
        return germaparl_corpus.counts.dump(
            matches, p_atts=p_atts, split=split, strategy=strategy
        )

    # split=True, single attribute: index entries are single tokens
    freq = count(['word'], True)["freq"]
    assert freq["Helmut"] == 6

    # split=True, two attributes: index entries are (word, pos) tuples
    freq = count(['word', 'pos'], True)["freq"]
    assert freq[("Helmut", "NE")] == 6

    # split=False, single attribute: the whole match is one index entry
    result = count(['word'], False)
    assert "Helmut Kohl" in result.index
    assert result["freq"].iloc[0] == 6

    # split=False, two attributes
    result = count(['word', 'pos'], False)
    assert ("Helmut Kohl", "NE NE") in result.index
    assert result["freq"].iloc[0] == 6
def test_counts_dump_1_split(germaparl):
    """Check ``counts.dump`` with strategy 1 and ``split=True``.

    Uses the matches of an article lemma followed by a noun tag and
    verifies the frequency reported for "der", both for the ``word``
    attribute alone and for the ``word``/``lemma`` combination.
    """
    strategy = 1
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    matches = germaparl_corpus.dump_from_query('[lemma="die" %cd] [pos="N.*"]')

    # single attribute: looked up by the token itself
    word_freq = germaparl_corpus.counts.dump(
        matches, p_atts=['word'], split=True, strategy=strategy
    )["freq"]
    assert int(word_freq["der"]) == 3775

    # two attributes: looked up by the (word, lemma) tuple
    word_lemma_freq = germaparl_corpus.counts.dump(
        matches, p_atts=['word', 'lemma'], split=True, strategy=strategy
    )["freq"]
    assert int(word_lemma_freq[("der", "die")]) == 3775
def test_counts_dump_1_no_split(germaparl):
    """Check ``counts.dump`` with strategy 1 and ``split=False``.

    Only verifies that the whole "Helmut Kohl" match appears in the
    result index, for one and for two p-attributes; frequencies are not
    checked here.
    """
    strategy = 1
    germaparl_corpus = Corpus(
        germaparl['corpus_name'],
        registry_path=germaparl['registry_path'],
    )
    matches = germaparl_corpus.dump_from_query(
        '[lemma="Helmut"%cd] [lemma="Kohl"%cd]'
    )

    # Matches are kept whole (split=False) for both attribute sets.
    word_counts = germaparl_corpus.counts.dump(
        matches, p_atts=['word'], split=False, strategy=strategy
    )
    assert "Helmut Kohl" in word_counts.index

    word_pos_counts = germaparl_corpus.counts.dump(
        matches, p_atts=['word', 'pos'], split=False, strategy=strategy
    )
    assert ("Helmut Kohl", "NE NE") in word_pos_counts.index