def update_tf(id_):
    # Fetch the note's HTML via GraphQL and extract its plain text.
    result = gql_client.execute(note_from_id, variable_values={"id": id_})
    note = result["note"]
    html = note["contentHtml"]
    text = html_text.extract_text(html)

    # Tokenize with Sudachi (mode C) and keep informative tokens only:
    # nouns that are not pure numbers, dropping single-character tokens
    # and hiragana-only tokens of three characters or fewer.
    tokenizer = WordTokenizer("sudachi", mode="C", with_postag=True)
    words = tokenizer.tokenize(text)
    hiragana_re = re.compile("[\u3041-\u309F]+")
    number_re = re.compile("[0-9,.]+")
    filtered_words = list(
        filter(
            # lambda x: len(x) > 3 or not hiragana_re.fullmatch(x),
            lambda x: (len(x) > 3 if hiragana_re.fullmatch(x) else len(x) > 1)
            and not number_re.fullmatch(x),
            # map(lambda x: x.normalized_form, filter(lambda x: x.postag in ["名詞", "動詞"], words)),
            map(lambda x: x.surface, filter(lambda x: x.postag in ["名詞"], words)),
        ))

    # Relative term frequency for each remaining surface form.
    num_words = len(filtered_words)
    word_count = Counter(filtered_words)
    word_freq_list = list(
        map(lambda k: (k, word_count[k] / num_words), word_count))
    word_freq = dict(word_freq_list)

    # Persist the TF table as TSV in the private bucket and return the mapping.
    tf_tsv_key = get_tf_tsv_key(id_)
    tf_tsv = "\n".join(map(lambda x: "\t".join(map(str, x)), word_freq_list))
    private_bucket.put_object(Body=tf_tsv.encode("utf-8"), Key=tf_tsv_key)
    return word_freq
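# Hedged sketch (not from the original source): the term-frequency step of
# update_tf in isolation, without the GraphQL client or S3 bucket dependencies.
# The token list below is a made-up example.
from collections import Counter

filtered_words = ["自然", "言語", "処理", "言語"]
num_words = len(filtered_words)
word_count = Counter(filtered_words)
word_freq = {word: count / num_words for word, count in word_count.items()}
print(word_freq)  # {'自然': 0.25, '言語': 0.5, '処理': 0.25}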
def tokenize(params: TokenizeParameter, request: Request):
    if params.texts is not None:
        message = (
            "A parameter `texts` is now unacceptable for /api/v1/tokenize."
            " Please use /api/v1/batch_tokenize instead.")
        raise HTTPException(status_code=400, detail=message)

    if params.text is None:
        raise HTTPException(status_code=400, detail="text or texts is required.")

    cache_key = generate_cache_key(params)
    if cache_key in request.app.state.cache:
        logging.info(f"Hit cache: {cache_key}")
        tokenizer = request.app.state.cache[cache_key]
    else:
        logging.info(f"Create tokenizer: {cache_key}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                user_dictionary_path=params.user_dictionary_path,
                system_dictionary_path=params.system_dictionary_path,
                model_path=params.model_path,
                mode=params.mode,
                dictionary_format=params.dictionary_format,
            )
            request.app.state.cache[cache_key] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="fail to initialize tokenizer")

    tokens = [token.dict() for token in tokenizer.tokenize(params.text)]
    return {"tokens": tokens}
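# Hedged client sketch (not from the original source). Assumptions: the handler
# above is mounted at /api/v1/tokenize, as its error message suggests, and the
# server is reachable on localhost:8000; neither is shown in this snippet.
import requests

response = requests.post(
    "http://localhost:8000/api/v1/tokenize",
    json={"tokenizer": "mecab", "text": "自然言語処理"},
)
print(response.json()["tokens"])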
async def batch_tokenize(params: TokenizeParameter, request: Request):
    if params.texts is None:
        raise HTTPException(status_code=400, detail="texts is required.")

    cache_key = generate_cache_key(params)
    if cache_key in request.app.state.cache:
        logging.info(f"Hit cache: {cache_key}")
        tokenizer = request.app.state.cache[cache_key]
    else:
        logging.info(f"Create tokenizer: {cache_key}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                user_dictionary_path=params.user_dictionary_path,
                system_dictionary_path=params.system_dictionary_path,
                model_path=params.model_path,
                mode=params.mode,
                dictionary_format=params.dictionary_format,
            )
            request.app.state.cache[cache_key] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="fail to initialize tokenizer")

    tokens_list = [[token.dict() for token in tokenizer.tokenize(text)]
                   for text in params.texts]
    return {"tokens_list": tokens_list}
def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    if "AWS_ACCESS_KEY_ID" not in os.environ and tokenizer_params[
            "system_dictionary_path"].startswith("s3://"):
        pytest.skip("AWS credentials not found.")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
        pytest.skip("KyTea doesn't work in Python3.6")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [[Token.from_dict(token_param) for token_param in token_params]
              for token_params in read_lines(tokenizer_name)]
    result = tokenizer.batch_tokenize(raw_texts)
    assert expect == result
def preprocess_data(df):
    # split
    df['text'] = df['text'].apply(neologdn.normalize)
    le = preprocessing.LabelEncoder()
    df['label'] = le.fit_transform(df['label'])
    df_train, df_test, y_train, y_test = train_test_split(df,
                                                          df['label'].values,
                                                          test_size=0.2,
                                                          random_state=42,
                                                          stratify=df['label'])

    # tokenize
    tokenizer = WordTokenizer('MeCab')
    docs_train = np.array([
        ' '.join(map(str, tokenizer.tokenize(text)))
        for text in df_train['text']
    ])
    docs_test = np.array([
        ' '.join(map(str, tokenizer.tokenize(text)))
        for text in df_test['text']
    ])

    # tfidf: Don't use df_test for fitting
    count_vec = CountVectorizer(min_df=2,
                                max_features=20000,
                                ngram_range=(1, 3))
    bags_train = count_vec.fit_transform(docs_train)
    bags_test = count_vec.transform(docs_test)

    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    tf_idf_train = tfidf.fit_transform(bags_train)
    tf_idf_test = tfidf.transform(bags_test)

    X_train = pd.DataFrame(tf_idf_train.toarray())
    X_test = pd.DataFrame(tf_idf_test.toarray())
    return X_train.reset_index(drop=True), X_test.reset_index(
        drop=True), y_train, y_test
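# Hedged usage sketch (not from the original source): feeding the TF-IDF
# features returned by preprocess_data into a scikit-learn classifier.
# `df` is assumed to be a DataFrame with 'text' and 'label' columns, as
# preprocess_data expects.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = preprocess_data(df)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))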
def tokenize(params: TokenizeParameter, request: Request):
    if params.text is not None:
        texts = [params.text]
    elif params.texts is not None:
        texts = params.texts
    else:
        raise HTTPException(status_code=400, detail="text or texts is required.")

    mode = params.mode.lower()
    model_path = ("data/model.spm"
                  if params.tokenizer.lower() == "sentencepiece" else None)  # NOQA

    signature = f"{params.tokenizer}.{model_path}.{mode}"
    if signature in request.app.tokenizers:
        logging.info(f"Hit cache: {signature}")
        tokenizer = request.app.tokenizers[signature]
    else:
        logging.info(f"Create tokenizer: {signature}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                with_postag=True,
                model_path=model_path,
                mode=mode,
            )
            request.app.tokenizers[signature] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="fail to initialize tokenizer")

    results = [[{
        "surface": t.surface,
        "part_of_speech": t.postag
    } for t in tokenizer.tokenize(text)] for text in texts]
    return {"tokens": results}
    if n_features != expected_n_features:
        raise ValueError("Input has n_features=%d while the model"
                         " has been trained with n_features=%d" %
                         (n_features, expected_n_features))

    X = X * self._idf_diag
    return X


if __name__ == '__main__':
    df = load_pandas_df(nrows=10)

    # Normalization
    df['text'] = df['text'].apply(neologdn.normalize)

    tokenizer = WordTokenizer('MeCab')
    docs = np.array([
        ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text']
    ])
    print(docs.shape)  # (10,)

    count_vec = CountVectorizer(min_df=2,
                                max_features=20000,
                                ngram_range=(1, 3))
    bags = count_vec.fit_transform(docs)
    print(bags.toarray().shape)
    print(bags.toarray())
    """
    (10, 445)
import sys

sys.path.append("nn4nlp-code/01-intro-pytorch")

import torch
from torch import nn
from torch.autograd import Variable
from model import DeepCBoW
import pickle
import pandas as pd
import random
import time
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from konoha import WordTokenizer

konoha_tokenizer = WordTokenizer('Sentencepiece',
                                 model_path="/home/icaro/konoha_model.spm")

nlayers, emb_size, hid_size = 3, 6, 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

w2i = defaultdict(lambda: len(w2i))  # word to index
w2i["<unk>"]
t2i = defaultdict(lambda: len(t2i))  # tag to index


def read_dataset(df):
    for i, row in df.iterrows():
        tag = row["c"]
        words = row["p"].lower().strip()
        yield ([w2i[str(x)] for x in konoha_tokenizer.tokenize(words)],
               t2i[tag])
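# Hedged usage sketch (not from the original source): materialising the
# read_dataset generator into training examples. `df_train` is assumed to be
# a DataFrame with the "c" (tag) and "p" (text) columns read_dataset expects.
train_data = list(read_dataset(df_train))
nwords = len(w2i)  # vocabulary size after reading the data
ntags = len(t2i)   # number of distinct tags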
from konoha import SentenceTokenizer
from konoha import WordTokenizer

if __name__ == "__main__":
    sentence_tokenizer = SentenceTokenizer()

    tokenizers = ["MeCab", "KyTea", "Janome", "Character"]
    tokenizers_support_postag = ["MeCab", "KyTea", "Janome"]

    word_tokenizers = []
    for word_tokenizer_name in tokenizers:
        try:
            _tokenizer = WordTokenizer(word_tokenizer_name)
            word_tokenizers.append(_tokenizer)

            if word_tokenizer_name in tokenizers_support_postag:
                _tokenizer = WordTokenizer(word_tokenizer_name, with_postag=True)
                word_tokenizers.append(_tokenizer)

        except (ImportError, RuntimeError):
            print("Skip: ", word_tokenizer_name)

    try:
        _tokenizer = WordTokenizer("Sentencepiece",
                                   model_path="./data/model.spm")  # NOQA
        word_tokenizers.append(_tokenizer)
    except (ImportError, OSError, RuntimeError):
        print("Skip: ", "Sentencepiece")
from konoha import WordTokenizer

from utils_nlp.dataset.livedoor import load_pandas_df

if __name__ == '__main__':
    df = load_pandas_df(nrows=10)
    text = df['text'][0][:30]
    print(text)
    # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン

    tokenizer_m = WordTokenizer('MeCab')
    print(tokenizer_m.tokenize(text))
    # [友人, 代表, の, スピーチ, 、, 独, 女, は, どう, こなし, て, いる, ?, もうすぐ, ジューン]

    tokenizer_s = WordTokenizer('Sudachi', mode='A', with_postag=True)
    print(tokenizer_s.tokenize(text))
    # [友人 (名詞), 代表 (名詞), の (助詞), スピーチ (名詞), 、 (補助記号), 独女 (名詞), は (助詞), どう (副詞), こなし (動詞), て (助詞), いる (動詞), ? (補助記号), もう (副詞), すぐ (副詞), ジューン (名詞)]

    df['sep_text'] = [tokenizer_m.tokenize(text) for text in df['text']]
    print(df.head())
from konoha import SentenceTokenizer
from konoha import WordTokenizer

if __name__ == "__main__":
    sentence_tokenizer = SentenceTokenizer()

    tokenizers = ["MeCab", "KyTea", "Janome", "nagisa", "Character"]
    tokenizers_support_postag = ["MeCab", "KyTea", "Janome", "nagisa"]

    word_tokenizers = []
    for word_tokenizer_name in tokenizers:
        try:
            _tokenizer = WordTokenizer(word_tokenizer_name)
            word_tokenizers.append(_tokenizer)

            if word_tokenizer_name in tokenizers_support_postag:
                _tokenizer = WordTokenizer(word_tokenizer_name)
                word_tokenizers.append(_tokenizer)

        except (ImportError, RuntimeError):
            print("Skip: ", word_tokenizer_name)

    try:
        _tokenizer = WordTokenizer("Sentencepiece",
                                   model_path="./data/model.spm")  # NOQA
        word_tokenizers.append(_tokenizer)
    except (ImportError, OSError, RuntimeError):
        print("Skip: ", "Sentencepiece")
from konoha import WordTokenizer
import neologdn
import numpy as np

from utils_nlp.dataset.livedoor import load_pandas_df
from utils_nlp.features import scdv
from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors

if __name__ == '__main__':
    df = load_pandas_df(nrows=10)

    # Normalization
    df['text'] = df['text'].apply(neologdn.normalize)

    tokenizer = WordTokenizer('MeCab')
    # Materialize the token strings as lists; the original used bare map
    # objects, which can only be iterated once.
    docs = np.array(
        [list(map(str, tokenizer.tokenize(text))) for text in df['text']],
        dtype=object)
    print(docs.shape)  # (10,)

    word_vec = load_pretrained_vectors('data')
    scdv = scdv.create(docs, word_vec, n_components=10)
    print(scdv.shape)  # (10, 3000)