def test_multiple_ngrams():
    """Test n-gram generation where n is a list of integers.

    Each motto is split on single spaces and passed through
    ``nlp.Ngram([1, 2, 3], ("_", 2), ("_", 2), " ")``.  The expected rows
    list the 1-grams first, then the 2-grams, then the 3-grams, with "_"
    pad tokens appearing at both ends of the 2- and 3-grams.
    """
    plates_mottos = [
        "WildRose Country",
        "Canada's Ocean Playground",
        "Land of Living Skies",
    ]
    # One expected row per motto, in the same order as ``plates_mottos``.
    n_gram_mottos = [
        [
            'WildRose', 'Country',
            '_ WildRose', 'WildRose Country', 'Country _',
            '_ _ WildRose', '_ WildRose Country', 'WildRose Country _', 'Country _ _',
        ],
        [
            "Canada's", 'Ocean', 'Playground',
            "_ Canada's", "Canada's Ocean", 'Ocean Playground', 'Playground _',
            "_ _ Canada's", "_ Canada's Ocean", "Canada's Ocean Playground",
            'Ocean Playground _', 'Playground _ _',
        ],
        [
            'Land', 'of', 'Living', 'Skies',
            '_ Land', 'Land of', 'of Living', 'Living Skies', 'Skies _',
            '_ _ Land', '_ Land of', 'Land of Living', 'of Living Skies',
            'Living Skies _', 'Skies _ _',
        ],
    ]

    def gen(texts):
        # One single-column row per motto; dtype 'S' stores the tokens as bytes.
        for line in texts:
            yield (np.array(line.split(" "), dtype='S'),)

    dataset = ds.GeneratorDataset(gen(plates_mottos), column_names=["text"])
    dataset = dataset.map(input_columns=["text"],
                          operations=nlp.Ngram([1, 2, 3], ("_", 2), ("_", 2), " "))

    rows_checked = 0
    for i, data in enumerate(dataset.create_dict_iterator()):
        assert [d.decode("utf8") for d in data["text"]] == n_gram_mottos[i]
        rows_checked = i + 1
    # Without this check the test would pass vacuously if the pipeline
    # produced fewer rows than there are mottos (including none at all).
    assert rows_checked == len(plates_mottos), rows_checked
def test_config(input_line, n, l_pad=("", 0), r_pad=("", 0), sep=" "):
    """Run ``text.Ngram`` over one line and report the outcome.

    Args:
        input_line: String that is split on single spaces into tokens.
        n: Forwarded as the first argument of ``text.Ngram``.
        l_pad: Forwarded as the second argument of ``text.Ngram``.
        r_pad: Forwarded as the third argument of ``text.Ngram``.
        sep: Forwarded as the ``separator`` keyword of ``text.Ngram``.

    Returns:
        The utf8-decoded contents of the "text" column of the first row, or
        ``str(e)`` when a ``ValueError`` or ``TypeError`` is raised anywhere
        in building or running the pipeline.  If the iterator yields no row
        at all, the function falls through and returns ``None``.
    """
    def single_row(line):
        # dtype 'S' stores the tokens as bytes, hence the decode below.
        tokens = np.array(line.split(" "), dtype='S')
        yield (tokens,)

    try:
        pipeline = ds.GeneratorDataset(single_row(input_line), column_names=["text"])
        ngram_op = text.Ngram(n, l_pad, r_pad, separator=sep)
        pipeline = pipeline.map(input_columns=["text"], operations=ngram_op)
        for row in pipeline.create_dict_iterator():
            # Only the first row is of interest; return as soon as it arrives.
            return [token.decode("utf8") for token in row["text"]]
    except (ValueError, TypeError) as err:
        return str(err)
def test_ngram_callable():
    """Check that an Ngram op can be called directly on its input.

    The same 2-gram op (separator "-") is applied first to a numpy array of
    byte tokens and then to a plain Python list of strings.
    """
    bigram = text.Ngram(2, separator="-")

    # The leading space makes split(" ") produce an empty first token, which
    # is why the first expected bigram starts with the separator.
    tokens = np.array(" WildRose Country".split(" "), dtype='S')
    assert np.array_equal(bigram(tokens), ['-WildRose', 'WildRose-Country'])

    # With a list input, each whole motto is treated as one token.
    mottos = ["WildRose Country", "Canada's Ocean Playground", "Land of Living Skies"]
    joined_pairs = [
        "WildRose Country-Canada's Ocean Playground",
        "Canada's Ocean Playground-Land of Living Skies",
    ]
    assert np.array_equal(bigram(mottos), joined_pairs)
def test_config(input_line, output_line, n, l_pad=None, r_pad=None, sep=None):
    """Assert that ``nlp.Ngram`` turns ``input_line`` into ``output_line``.

    Args:
        input_line: String that is split on single spaces into tokens.
        output_line: Expected list of decoded n-gram strings.
        n: Forwarded as the first argument of ``nlp.Ngram``.
        l_pad: Forwarded as the second argument of ``nlp.Ngram``.
        r_pad: Forwarded as the third argument of ``nlp.Ngram``.
        sep: Forwarded as the ``separator`` keyword of ``nlp.Ngram``.

    Raises:
        AssertionError: If the produced row differs from ``output_line`` or
            the pipeline does not yield exactly one row.
    """
    def gen(line):
        # A single one-column row; dtype 'S' stores the tokens as bytes.
        yield (np.array(line.split(" "), dtype='S'),)

    dataset = ds.GeneratorDataset(gen(input_line), column_names=["text"])
    dataset = dataset.map(input_columns=["text"],
                          operations=nlp.Ngram(n, l_pad, r_pad, separator=sep))

    rows_checked = 0
    for data in dataset.create_dict_iterator():
        assert [d.decode("utf8") for d in data["text"]] == output_line, output_line
        rows_checked += 1
    # The generator yields exactly one row; an empty iteration would
    # otherwise let this helper pass without comparing anything.
    assert rows_checked == 1, rows_checked
def test_simple_ngram():
    """Test n-gram generation with a single n value.

    Each motto is split on single spaces and passed through
    ``text.Ngram(3, separator=" ")``.  The first motto has only two words
    and is expected to produce a single empty string.
    """
    plates_mottos = [
        "Friendly Manitoba",
        "Yours to Discover",
        "Land of Living Skies",
        "Birthplace of the Confederation",
    ]
    # One expected row per motto, in the same order as ``plates_mottos``.
    n_gram_mottos = [
        [""],
        ["Yours to Discover"],
        ['Land of Living', 'of Living Skies'],
        ['Birthplace of the', 'of the Confederation'],
    ]

    def gen(texts):
        # One single-column row per motto; dtype 'S' stores the tokens as bytes.
        for line in texts:
            yield (np.array(line.split(" "), dtype='S'),)

    dataset = ds.GeneratorDataset(gen(plates_mottos), column_names=["text"])
    dataset = dataset.map(input_columns=["text"],
                          operations=text.Ngram(3, separator=" "))

    rows_checked = 0
    for i, data in enumerate(dataset.create_dict_iterator()):
        assert [d.decode("utf8") for d in data["text"]] == n_gram_mottos[i], i
        rows_checked = i + 1
    # Without this check the test would pass vacuously if the pipeline
    # produced fewer rows than there are mottos (including none at all).
    assert rows_checked == len(plates_mottos), rows_checked