Example No. 1
import utils  # assumed: the module under test, importable from the test's path


def test_split_on_first_whitespace():
    # Only two words.
    assert ('hello', 'world') == utils.split_on_first_whitespace('hello world')

    # More than two words.
    assert ('hello', 'world two') == \
        utils.split_on_first_whitespace('hello world two')

    # Should not fail on empty string.
    assert ('', '') == utils.split_on_first_whitespace('')
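
utils.split_on_first_whitespace itself does not appear in these excerpts. The sketch below is a hypothetical implementation that satisfies all three assertions, built on str.split(None, 1); it is an assumption inferred from the tests, not the project's actual code.

def split_on_first_whitespace(text):
    """Split text into (head, rest) at the first run of whitespace."""
    # NOTE: hypothetical implementation inferred from the tests above.
    parts = text.split(None, 1)  # split at most once, on any whitespace
    head = parts[0] if len(parts) > 0 else ''
    rest = parts[1] if len(parts) > 1 else ''
    return (head, rest)

str.split(None, 1) splits on any run of whitespace at most once and returns an empty list for the empty string, which covers all three cases, including the empty-string one.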
Example No. 2
import model  # assumed: project module that defines the Model class
import utils  # assumed: project module with split_on_first_whitespace


def build_LM(in_file):
    """
    Build a language model for each label.

    Each line in in_file contains a label and a URL separated by a tab (\t).
    """
    print('Building language models...')
    tokenizer = get_tokenizer()  # tokenizer factory defined elsewhere in this module
    language_models = {}  # label -> model.Model
    all_grams = set()  # every gram seen across all labels

    with open(in_file) as in_file_contents:
        for line in in_file_contents:
            # The first field is the label; the rest is the training text.
            lang, text = utils.split_on_first_whitespace(line)
            language_model = language_models.setdefault(lang, model.Model())

            # Count each gram for this label and remember it globally.
            for gram in tokenizer(text):
                all_grams.add(gram)
                language_model.incr_gram_count(gram)

    # Make sure every model knows about every gram seen in any label.
    for language_model in language_models.values():
        for gram in all_grams:
            language_model.register_gram(gram)

    return language_models
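
build_LM relies on two methods of model.Model that are not shown here: incr_gram_count and register_gram. Below is a minimal counter-backed sketch that would support those calls; the class body is an assumption, not the project's actual Model.

import collections


class Model(object):
    """Hypothetical gram-count store matching the calls in build_LM."""

    def __init__(self):
        self.gram_counts = collections.Counter()

    def incr_gram_count(self, gram):
        # One more occurrence of this gram in the label's training text.
        self.gram_counts[gram] += 1

    def register_gram(self, gram):
        # Give the gram an entry even if this label never produced it,
        # so every model ends up with the same vocabulary.
        self.gram_counts.setdefault(gram, 0)

Under that assumption, build_LM returns a dict mapping each label to a Model whose gram_counts cover every gram observed in any label.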