Example #1
def test_normalization():
    stopwords = tokenization.load_stopwords("ru")

    input_str = "Я увидел! 32,    тестирование 14:32 в по-новому."
    true_result = ["увидеть", "32", "тестирование", "14", "32", "новый"]
    result = tokenization.preprocess_text(input_str, stopwords)
    assert result == true_result
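The preprocess_text implementation itself is not shown on this page. A minimal sketch that would satisfy the test above, assuming pymorphy2 for lemmatization (the actual gooppe/ir-20 code may differ), could look like this:

import re

import pymorphy2

morph = pymorphy2.MorphAnalyzer()


def preprocess_text(text, stopwords):
    # Lowercase and split on runs of non-word characters ("14:32" -> "14", "32").
    tokens = [t for t in re.split(r"\W+", text.lower().strip()) if t]
    # Reduce every token to its normal form ("увидел" -> "увидеть").
    lemmas = [morph.parse(t)[0].normal_form for t in tokens]
    # Drop stopwords such as "я" and "в" after lemmatization.
    return [t for t in lemmas if t not in stopwords]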
Example #2
File: index.py Project: gooppe/ir-20
def index_json(
    filename: str,
    index_name: str,
    target_collumn: int,
    buffer_size: int = 10000,
    lang: str = "en",
):
    """Start indexing json file.

    Args:
        filename (str): name of source file.
        index_name (str): name of output index file.
        target_collumn (int): index of the processed column of json.
        buffer_size (int, optional): default buffer size. Defaults to 10000.
        lang (str, optional): stopword language code. Defaults to "en".
    """
    stopwords = load_stopwords(lang)

    with open(filename, encoding="utf8") as file:
        total = sum(1 for line in file)

    data = iter_json(filename)
    data = ((i, preprocess_text(tup[target_collumn], stopwords))
            for i, tup in enumerate(data))
    data = tqdm(data, desc="Building Index", total=total)

    build_index(data, index_name, buffer_size)
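iter_json and build_index are defined elsewhere in the project. Judging only from how iter_json is called here, a plausible sketch is a generator over a JSON-lines file where each line decodes to one indexable row; this layout is an assumption, not the project's documented format:

import json


def iter_json(filename):
    # Yield one decoded row per line of a JSON-lines file.
    with open(filename, encoding="utf8") as file:
        for line in file:
            yield json.loads(line)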
Example #3
def test_drop_stopwords():
    stopwords = tokenization.load_stopwords("ru")

    test_text = ["я", "ходить", "в", "университет", "каждый", "день"]
    true_dropped = ["ходить", "университет", "каждый", "день"]
    dropped = tokenization.drop_stopwords(test_text, stopwords)

    assert dropped == true_dropped
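drop_stopwords is easy to reconstruct from this test: a likely implementation simply filters the token list against the stopword set while preserving order (again an assumption, since the original source is not shown here):

def drop_stopwords(tokens, stopwords):
    # Keep tokens in their original order, skipping anything in the stopword set.
    return [t for t in tokens if t not in stopwords]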
Example #4
File: search.py Project: gooppe/ir-20
def cli_text_search(
    text: str,
    dump_dir: str,
    data_file: str,
    n_results: int = 10,
    suggestion: bool = True,
    lang: str = "en",
):
    if suggestion:
        text = sample_text(text)
        print(f"Auto suggestion: {text}")

    stopwords = tokenization.load_stopwords(lang)
    tokens = [
        t for t in re.split(r"\W+", text.lower().strip())
        if t not in stopwords
    ]
    if len(tokens) > 1:
        query = Or(*symbols(",".join(tokens)))
    else:
        query = Symbol(tokens[0])
    cli_search(query, dump_dir, data_file, n_results, text)
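A hypothetical invocation of the helper above; the query text and paths are placeholders for illustration, not files from the project:

cli_text_search(
    "information retrieval",
    dump_dir="dumps/",
    data_file="data/articles.json",
    n_results=5,
    suggestion=False,
    lang="en",
)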
Example #5
def test_load_stopwords():
    stopwords = tokenization.load_stopwords("ru")

    assert isinstance(stopwords, set)
    assert len(stopwords) > 0
    assert "и" in stopwords