def test_semanticizer_nlwiki():
    """Compare candidates for each nlwiki input file with the expected output.

    A model is created from the bundled nlwiki sample dump into a temporary
    file. For every file under ``nlwiki/in`` the whitespace-separated tokens
    are passed to ``Semanticizer.all_candidates``; the stringified candidates
    are written, one per line, to the same file name under ``nlwiki/actual``
    and then compared with the file of that name under ``nlwiki/expected``.
    """
    here = dirname(__file__)

    # The context manager closes (and, with the default delete=True, removes)
    # the temporary model file even when an assertion below fails.
    with NamedTemporaryFile() as model_file:
        # The return value is not used further; it stays bound so the
        # returned object lives for the duration of the test, as before.
        db = create_model(
            join(here, 'nlwiki-20140927-pages-articles-sample.xml'),
            model_file.name)
        sem = Semanticizer(model_file.name)

        dirs = {d: join(here, 'nlwiki', d)
                for d in "in expected actual".split()}

        # glob() gives no ordering guarantee; sort so the cases always run
        # (and fail) in the same order.
        input_test_cases = sorted(glob(join(dirs['in'], '*')))
        assert_equal(len(input_test_cases), 20,
                     msg=("number of input test cases in %r should be 20"
                          % dirs['in']))

        for doc in input_test_cases:
            fname = basename(doc)
            actual_path = join(dirs['actual'], fname)

            with open(doc) as f:
                tokens = f.read().split()

            # The actual output is written to disk so that a failing case
            # can be inspected or diffed after the run.
            with open(actual_path, 'w') as out:
                out.write("\n".join(str(cand)
                                    for cand in sem.all_candidates(tokens)))

            with open(join(dirs['expected'], fname)) as f:
                expected = f.read()
            with open(actual_path) as f:
                actual = f.read()

            assert_multi_line_equal(expected,
                                    actual)
def test_semanticizer_nlwiki_no_ngrams():
    """Smoke test: create a model with ``N=None`` and load it.

    Nothing about the resulting model is checked; the test passes as long as
    ``create_model`` and ``Semanticizer`` complete without raising.
    NOTE(review): going by the test name, ``N=None`` presumably turns off
    n-gram collection -- confirm against ``create_model``.
    """
    # The context manager closes (and, with the default delete=True, removes)
    # the temporary model file even if model creation or loading raises.
    with NamedTemporaryFile() as model_file:
        # Both results are intentionally unused; constructing them is the
        # test. They stay bound so the objects live until the test ends.
        db = create_model(join(dirname(__file__),
                               'nlwiki-20140927-pages-articles-sample.xml'),
                          model_file.name, N=None)
        sem = Semanticizer(model_file.name)

        assert_true(True)
import re
from os.path import join, dirname
from tempfile import NamedTemporaryFile
from glob import glob
from os.path import basename

from nose.tools import assert_equal, assert_multi_line_equal, assert_true

from semanticizest import Semanticizer
from semanticizest._semanticizer import create_model

# Module-level fixture, executed once at import time: create a model from the
# bundled nlwiki sample dump into a temporary file and open a Semanticizer on
# that file. test_semanticizer() queries the model through ``sem``.
# ``tempfile`` must stay referenced for as long as the model is needed: with
# the default delete=True the file is removed when this object is closed.
tempfile = NamedTemporaryFile()
db = create_model(join(dirname(__file__),
                       'nlwiki-20140927-pages-articles-sample.xml'),
                  tempfile.name)
sem = Semanticizer(tempfile.name)


def test_semanticizer():
    """Check the set of candidate strings found in a short Dutch passage.

    The passage is split on runs of non-word characters, the tokens are
    passed to the module-level ``sem.all_candidates``, and the third field
    of every returned 4-tuple is collected and compared with the expected
    set of names.
    """
    text = """Aangezien de aarde een planeet is, kunnen de aardwetenschappen
ook als een tak van de planetologie beschouwd worden. Aardwetenschappelijke
kennis, met name geomorfologie, wordt bijvoorbeeld ook toegepast voor de
zoektocht naar sporen van water, sneeuw en ijs op de planeet Mars."""
    non_word_run = re.compile(r'\W+')
    tokens = non_word_run.split(text)

    found = {name for _, _, name, _ in sem.all_candidates(tokens)}
    wanted = {
        'Planeet',
        'Planetologie',
        'Kennis (wetenschap)',
        'Geomorfologie',
        'Mars (planeet)',
    }

    assert_equal(wanted, found)