Example #1
def update_tf(id_):
    result = gql_client.execute(note_from_id, variable_values={"id": id_})
    note = result["note"]
    html = note["contentHtml"]
    text = html_text.extract_text(html)
    tokenizer = WordTokenizer("sudachi", mode="C", with_postag=True)
    words = tokenizer.tokenize(text)
    hiragana_re = re.compile("[\u3041-\u309F]+")
    number_re = re.compile("[0-9,.]+")
    filtered_words = list(
        filter(
            # lambda x: len(x) > 3 or not hiragana_re.fullmatch(x),
            lambda x: (len(x) > 3 if hiragana_re.fullmatch(x) else len(x) > 1)
            and not number_re.fullmatch(x),
            # map(lambda x: x.normalized_form, filter(lambda x: x.postag in ["名詞", "動詞"], words)),
            map(lambda x: x.surface, filter(lambda x: x.postag in ["名詞"],
                                            words)),
        ))
    num_words = len(filtered_words)
    word_count = Counter(filtered_words)
    word_freq_list = list(
        map(lambda k: (k, word_count[k] / num_words), word_count))
    word_freq = dict(word_freq_list)
    tf_tsv_key = get_tf_tsv_key(id_)
    tf_tsv = "\n".join(map(lambda x: "\t".join(map(str, x)), word_freq_list))
    private_bucket.put_object(Body=tf_tsv.encode("utf-8"), Key=tf_tsv_key)
    return word_freq
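The `word_freq` dictionary returned above holds raw term frequencies for a single note. Below is a minimal sketch of combining it with a precomputed IDF table to rank keywords; the `idf` mapping and its fallback weight are assumptions, not part of the example.

def top_keywords(word_freq, idf, k=10):
    # idf is a hypothetical {word: inverse document frequency} mapping;
    # words missing from it fall back to a neutral weight of 1.0.
    scores = {word: tf * idf.get(word, 1.0) for word, tf in word_freq.items()}
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)[:k]

# e.g. top_keywords(update_tf(note_id), idf) returns the k highest-scoring terms.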
Example #2
def tokenize(params: TokenizeParameter, request: Request):
    if params.texts is not None:
        message = (
            "A parameter `texts` is now unacceptable for /api/v1/tokenize."
            " Please use /api/v1/batch_tokenize instead.")
        raise HTTPException(status_code=400, detail=message)

    if params.text is None:
        raise HTTPException(status_code=400,
                            detail="text or texts is required.")

    cache_key = generate_cache_key(params)
    if cache_key in request.app.state.cache:
        logging.info(f"Hit cache: {cache_key}")
        tokenizer = request.app.state.cache[cache_key]
    else:
        logging.info(f"Create tokenizer: {cache_key}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                user_dictionary_path=params.user_dictionary_path,
                system_dictionary_path=params.system_dictionary_path,
                model_path=params.model_path,
                mode=params.mode,
                dictionary_format=params.dictionary_format,
            )
            request.app.state.cache[cache_key] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="fail to initialize tokenizer")

    tokens = [token.dict() for token in tokenizer.tokenize(params.text)]
    return {"tokens": tokens}
Example #3
async def batch_tokenize(params: TokenizeParameter, request: Request):
    if params.texts is None:
        raise HTTPException(status_code=400, detail="texts is required.")

    cache_key = generate_cache_key(params)
    if cache_key in request.app.state.cache:
        logging.info(f"Hit cache: {cache_key}")
        tokenizer = request.app.state.cache[cache_key]
    else:
        logging.info(f"Create tokenizer: {cache_key}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                user_dictionary_path=params.user_dictionary_path,
                system_dictionary_path=params.system_dictionary_path,
                model_path=params.model_path,
                mode=params.mode,
                dictionary_format=params.dictionary_format,
            )
            request.app.state.cache[cache_key] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="fail to initialize tokenizer")

    tokens_list = [[token.dict() for token in tokenizer.tokenize(text)]
                   for text in params.texts]
    return {"tokens_list": tokens_list}
Example #4
def test_batch_tokenize_with_character(raw_texts: List[str],
                                       tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
Example #5
def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
Example #6
def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    if "AWS_ACCESS_KEY_ID" not in os.environ and tokenizer_params[
            "system_dictionary_path"].startswith("s3://"):
        pytest.skip("AWS credentials not found.")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
Example #7
def test_batch_tokenize_with_character(raw_texts: List[str],
                                       tokenizer_params: Dict):
    if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
        pytest.skip("KyTea doesn't work in Python3.6")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
Example #8
def preprocess_data(df):
    # normalize text, encode labels, then split into train/test
    df['text'] = df['text'].apply(neologdn.normalize)
    le = preprocessing.LabelEncoder()
    df['label'] = le.fit_transform(df['label'])

    df_train, df_test, y_train, y_test = train_test_split(df,
                                                          df['label'].values,
                                                          test_size=0.2,
                                                          random_state=42,
                                                          stratify=df['label'])

    # tokenize
    tokenizer = WordTokenizer('MeCab')
    docs_train = np.array([
        ' '.join(map(str, tokenizer.tokenize(text)))
        for text in df_train['text']
    ])
    docs_test = np.array([
        ' '.join(map(str, tokenizer.tokenize(text)))
        for text in df_test['text']
    ])

    # tfidf: Don't use df_test for fitting
    count_vec = CountVectorizer(min_df=2,
                                max_features=20000,
                                ngram_range=(1, 3))
    bags_train = count_vec.fit_transform(docs_train)
    bags_test = count_vec.transform(docs_test)

    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    tf_idf_train = tfidf.fit_transform(bags_train)
    tf_idf_test = tfidf.transform(bags_test)

    X_train = pd.DataFrame(tf_idf_train.toarray())
    X_test = pd.DataFrame(tf_idf_test.toarray())

    return X_train.reset_index(drop=True), X_test.reset_index(
        drop=True), y_train, y_test
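Since the function returns TF-IDF feature frames together with the label arrays, a classifier can be fitted directly on its output; a minimal sketch with scikit-learn's LogisticRegression, assuming `df` has already been loaded with `text` and `label` columns.

from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = preprocess_data(df)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print("test accuracy:", clf.score(X_test, y_test))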
Example #9
def tokenize(params: TokenizeParameter, request: Request):
    if params.text is not None:
        texts = [params.text]
    elif params.texts is not None:
        texts = params.texts
    else:
        raise HTTPException(status_code=400,
                            detail="text or texts is required.")

    mode = params.mode.lower()
    model_path = ("data/model.spm" if params.tokenizer.lower()
                  == "sentencepiece" else None)  # NOQA

    signature = f"{params.tokenizer}.{model_path}.{mode}"
    if signature in request.app.tokenizers:
        logging.info(f"Hit cache: {signature}")
        tokenizer = request.app.tokenizers[signature]
    else:
        logging.info(f"Create tokenizer: {signature}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                with_postag=True,
                model_path=model_path,
                mode=mode,
            )
            request.app.tokenizers[signature] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="fail to initialize tokenizer")

    results = [[{
        "surface": t.surface,
        "part_of_speech": t.postag
    } for t in tokenizer.tokenize(text)] for text in texts]

    return {"tokens": results}
Example #10
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            X = X * self._idf_diag

        return X


if __name__ == '__main__':
    df = load_pandas_df(nrows=10)

    # Normalization
    df['text'] = df['text'].apply(neologdn.normalize)

    tokenizer = WordTokenizer('MeCab')
    docs = np.array([
        ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text']
    ])
    print(docs.shape)
    # (10,)

    count_vec = CountVectorizer(min_df=2,
                                max_features=20000,
                                ngram_range=(1, 3))
    bags = count_vec.fit_transform(docs)

    print(bags.toarray().shape)
    print(bags.toarray())
    """
    (10, 445)
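To see which n-grams the 445 columns correspond to, the fitted vectorizer exposes its vocabulary; `get_feature_names_out` assumes scikit-learn >= 1.0 (older releases use `get_feature_names`).

    # Continuing inside the __main__ block above, after the fitted CountVectorizer.
    feature_names = count_vec.get_feature_names_out()
    print(len(feature_names))  # 445
    print(feature_names[:20])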
Example #11
import sys

sys.path.append("nn4nlp-code/01-intro-pytorch")

import torch
from torch import nn
from torch.autograd import Variable
from model import DeepCBoW
import pickle
import pandas as pd
import random
import time
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from konoha import WordTokenizer

konoha_tokenizer = WordTokenizer('Sentencepiece',
                                 model_path="/home/icaro/konoha_model.spm")

nlayers, emb_size, hid_size = 3, 6, 6

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
w2i = defaultdict(lambda: len(w2i))  #word to index
w2i["<unk>"]
t2i = defaultdict(lambda: len(t2i))  #tag to index


def read_dataset(df):
    for i, row in df.iterrows():
        tag = row["c"]
        words = row["p"].lower().strip()
        yield ([w2i[str(x)]
                for x in konoha_tokenizer.tokenize(words)], t2i[tag])
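A sketch of feeding the generator into model construction; `df_train` is a hypothetical dataframe with the same `c` and `p` columns, and the `DeepCBoW(nwords, ntags, nlayers, emb_size, hid_size)` constructor signature is assumed from the nn4nlp-code repository rather than shown here.

# Hypothetical training dataframe with "c" (tag) and "p" (text) columns.
train = list(read_dataset(df_train))

# Freeze the vocabulary: unseen words now map to <unk> instead of growing w2i.
UNK = w2i["<unk>"]
w2i = defaultdict(lambda: UNK, w2i)

nwords, ntags = len(w2i), len(t2i)
model = DeepCBoW(nwords, ntags, nlayers, emb_size, hid_size).to(device)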
Example #12
from konoha import SentenceTokenizer
from konoha import WordTokenizer


if __name__ == "__main__":
    sentence_tokenizer = SentenceTokenizer()
    tokenizers = ["MeCab", "KyTea", "Janome", "Character"]
    tokenizers_support_postag = ["MeCab", "KyTea", "Janome"]

    word_tokenizers = []
    for word_tokenizer_name in tokenizers:
        try:
            _tokenizer = WordTokenizer(word_tokenizer_name)
            word_tokenizers.append(_tokenizer)

            if word_tokenizer_name in tokenizers_support_postag:
                _tokenizer = WordTokenizer(word_tokenizer_name, with_postag=True)
                word_tokenizers.append(_tokenizer)

        except (ImportError, RuntimeError):
            print("Skip: ", word_tokenizer_name)

    try:
        _tokenizer = WordTokenizer(
            "Sentencepiece", model_path="./data/model.spm"
        )  # NOQA
        word_tokenizers.append(_tokenizer)

    except (ImportError, OSError, RuntimeError):
        print("Skip: ", "Sentencepiece")
Example #13
from konoha import WordTokenizer

from utils_nlp.dataset.livedoor import load_pandas_df

if __name__ == '__main__':
    df = load_pandas_df(nrows=10)
    text = df['text'][0][:30]
    print(text)
    # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン

    tokenizer_m = WordTokenizer('MeCab')
    print(tokenizer_m.tokenize(text))
    # [友人, 代表, の, スピーチ, 、, 独, 女, は, どう, こなし, て, いる, ?, もうすぐ, ジューン]

    tokenizer_s = WordTokenizer('Sudachi', mode='A', with_postag=True)
    print(tokenizer_s.tokenize(text))
    # [友人 (名詞), 代表 (名詞), の (助詞), スピーチ (名詞), 、 (補助記号), 独女 (名詞), は (助詞), どう (副詞), こなし (動詞), て (助詞), いる (動詞), ? (補助記号), もう (副詞), すぐ (副詞), ジューン (名詞)]

    df['sep_text'] = [tokenizer_m.tokenize(text) for text in df['text']]
    print(df.head())
Example #14
from konoha import SentenceTokenizer
from konoha import WordTokenizer


if __name__ == "__main__":
    sentence_tokenizer = SentenceTokenizer()
    tokenizers = ["MeCab", "KyTea", "Janome", "nagisa", "Character"]
    tokenizers_support_postag = ["MeCab", "KyTea", "Janome", "nagisa"]

    word_tokenizers = []
    for word_tokenizer_name in tokenizers:
        try:
            _tokenizer = WordTokenizer(word_tokenizer_name)
            word_tokenizers.append(_tokenizer)

            if word_tokenizer_name in tokenizers_support_postag:
                _tokenizer = WordTokenizer(word_tokenizer_name)
                word_tokenizers.append(_tokenizer)

        except (ImportError, RuntimeError):
            print("Skip: ", word_tokenizer_name)

    try:
        _tokenizer = WordTokenizer(
            "Sentencepiece", model_path="./data/model.spm"
        )  # NOQA
        word_tokenizers.append(_tokenizer)

    except (ImportError, OSError, RuntimeError):
        print("Skip: ", "Sentencepiece")
Example #15
from konoha import WordTokenizer
import neologdn
import numpy as np

from utils_nlp.dataset.livedoor import load_pandas_df
from utils_nlp.features import scdv
from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors

if __name__ == '__main__':
    df = load_pandas_df(nrows=10)

    # Normalization
    df['text'] = df['text'].apply(neologdn.normalize)

    tokenizer = WordTokenizer('MeCab')
    # Materialize the token lists; a bare map object cannot be reused downstream.
    docs = np.array(
        [list(map(str, tokenizer.tokenize(text))) for text in df['text']],
        dtype=object)
    print(docs.shape)
    # (10,)

    word_vec = load_pretrained_vectors('data')
    scdv_features = scdv.create(docs, word_vec, n_components=10)  # avoid shadowing the scdv module
    print(scdv_features.shape)
    # (10, 3000)