Example #1
0
def test_multiple_ngrams():
    """Test n-gram where n is a list of integers.

    Every motto is split on single spaces and passed through ``nlp.Ngram``
    configured with ``n=[1, 2, 3]``, ``("_", 2)`` for both the left and the
    right pad, and ``" "`` as separator. The decoded output of each row must
    equal the hand-written expectation, and the pipeline must produce exactly
    one row per motto.
    """
    plates_mottos = [
        "WildRose Country", "Canada's Ocean Playground", "Land of Living Skies"
    ]
    # Expected output per motto, in the order the rows are compared below:
    # 1-grams first, then 2-grams, then 3-grams (with "_" padding entries).
    n_gram_mottos = [
        [
            'WildRose', 'Country', '_ WildRose', 'WildRose Country',
            'Country _', '_ _ WildRose', '_ WildRose Country',
            'WildRose Country _', 'Country _ _'
        ],
        [
            "Canada's", 'Ocean', 'Playground', "_ Canada's", "Canada's Ocean",
            'Ocean Playground', 'Playground _', "_ _ Canada's",
            "_ Canada's Ocean", "Canada's Ocean Playground",
            'Ocean Playground _', 'Playground _ _'
        ],
        [
            'Land', 'of', 'Living', 'Skies', '_ Land', 'Land of', 'of Living',
            'Living Skies', 'Skies _', '_ _ Land', '_ Land of',
            'Land of Living', 'of Living Skies', 'Living Skies _', 'Skies _ _'
        ],
    ]

    def gen(texts):
        # One single-column row per motto: a bytes ('S') array of its words.
        for line in texts:
            yield (np.array(line.split(" "), dtype='S'), )

    dataset = ds.GeneratorDataset(gen(plates_mottos), column_names=["text"])
    dataset = dataset.map(input_columns=["text"],
                          operations=nlp.Ngram([1, 2, 3], ("_", 2), ("_", 2),
                                               " "))

    produced = [[d.decode("utf8") for d in data["text"]]
                for data in dataset.create_dict_iterator()]

    # Without this check the test would pass vacuously if the pipeline
    # yielded fewer rows than expected (including none at all).
    assert len(produced) == len(n_gram_mottos), len(produced)
    for i, (actual, expected) in enumerate(zip(produced, n_gram_mottos)):
        assert actual == expected, i
Example #2
0
    def test_config(input_line, n, l_pad=("", 0), r_pad=("", 0), sep=" "):
        """Run ``input_line`` through ``text.Ngram`` and report the outcome.

        Returns the decoded "text" column of the first row produced by the
        pipeline, or the message of a ``ValueError``/``TypeError`` raised
        while building or running it. Returns ``None`` if no row is produced.
        """
        def single_row(sentence):
            # The whole sentence becomes one row of space-separated tokens.
            words = sentence.split(" ")
            yield (np.array(words, dtype='S'),)

        try:
            source = ds.GeneratorDataset(single_row(input_line), column_names=["text"])
            ngram_op = text.Ngram(n, l_pad, r_pad, separator=sep)
            pipeline = source.map(input_columns=["text"], operations=ngram_op)
            for row in pipeline.create_dict_iterator():
                # Only the first row matters; return as soon as it is decoded.
                return [token.decode("utf8") for token in row["text"]]
            return None
        except (ValueError, TypeError) as err:
            return str(err)
Example #3
0
def test_ngram_callable():
    """
    Check that a text.Ngram op can be invoked directly, outside a pipeline
    """
    bigram_op = text.Ngram(2, separator="-")

    # Splitting on " " keeps the leading empty token of the padded string,
    # so the token array is ['', 'WildRose', 'Country'] (as bytes).
    tokens = np.array(" WildRose Country".split(" "), dtype='S')
    assert np.array_equal(bigram_op(tokens), ['-WildRose', 'WildRose-Country'])

    # A plain list of strings is accepted too; each whole motto is one token.
    mottos = ["WildRose Country", "Canada's Ocean Playground", "Land of Living Skies"]
    joined_pairs = [
        "WildRose Country-Canada's Ocean Playground",
        "Canada's Ocean Playground-Land of Living Skies",
    ]
    assert np.array_equal(bigram_op(mottos), joined_pairs)
Example #4
0
    def test_config(input_line,
                    output_line,
                    n,
                    l_pad=None,
                    r_pad=None,
                    sep=None):
        """Assert that ``nlp.Ngram`` turns ``input_line`` into ``output_line``.

        ``input_line`` is split on single spaces into one row of tokens; the
        remaining arguments are forwarded to ``nlp.Ngram`` unchanged. Every
        row produced by the pipeline is decoded and compared with
        ``output_line``, which is also used as the assertion message.
        """
        def single_row(sentence):
            words = sentence.split(" ")
            yield (np.array(words, dtype='S'), )

        source = ds.GeneratorDataset(single_row(input_line),
                                     column_names=["text"])
        ngram_op = nlp.Ngram(n, l_pad, r_pad, separator=sep)
        pipeline = source.map(input_columns=["text"], operations=ngram_op)

        for row in pipeline.create_dict_iterator():
            decoded = [token.decode("utf8") for token in row["text"]]
            assert decoded == output_line, output_line
Example #5
0
def test_simple_ngram():
    """Test simple n-gram with only one n value.

    Every motto is split on single spaces and passed through ``text.Ngram``
    with ``n=3`` and ``" "`` as separator (no padding arguments given). The
    decoded output of each row must equal the hand-written expectation, and
    the pipeline must produce exactly one row per motto.
    """
    plates_mottos = ["Friendly Manitoba", "Yours to Discover", "Land of Living Skies",
                     "Birthplace of the Confederation"]
    n_gram_mottos = [
        # Two words only: the expected result is a single empty string.
        [""],
        ["Yours to Discover"],
        ['Land of Living', 'of Living Skies'],
        ['Birthplace of the', 'of the Confederation'],
    ]

    def gen(texts):
        # One single-column row per motto: a bytes ('S') array of its words.
        for line in texts:
            yield (np.array(line.split(" "), dtype='S'),)

    dataset = ds.GeneratorDataset(gen(plates_mottos), column_names=["text"])
    dataset = dataset.map(input_columns=["text"], operations=text.Ngram(3, separator=" "))

    produced = [[d.decode("utf8") for d in data["text"]]
                for data in dataset.create_dict_iterator()]

    # Without this check the test would pass vacuously if the pipeline
    # yielded fewer rows than expected (including none at all).
    assert len(produced) == len(n_gram_mottos), len(produced)
    for i, (actual, expected) in enumerate(zip(produced, n_gram_mottos)):
        assert actual == expected, i