Example 1
def mturk_test():

    embedding_dim = 300

    # Load the MTurk outcome variables and z-score each column
    Y = get_mturk_outcomes()
    Y = (Y - np.mean(Y, axis=0)) / np.std(Y, axis=0)

    # Pre-computed sentence embeddings for the MTurk text (arrays arr_0 ... arr_3)
    X = np.load('../data/mturk_embedded.npz')

    # X0 = X['arr_0']
    # X1 = X['arr_1']
    X2 = X['arr_2']
    # X3 = X['arr_3']

    m = SWEM(embedding_dimension=embedding_dim,
             num_outputs=2,
             learning_rate=1e-4,
             activation_fn=tf.nn.elu,
             embedding_mlp_depth=2,
             prediction_mlp_layers=(120, 24))

    # m.train(X0, Y, plotfile='../img/X0_training.png')
    m.train(X2,
            Y[:, :2],
            plotfile='../img/X2_Y01_training.png',
            batch_size=100,
            epochs=20)  # m.train(X2, Y, plotfile='../img/X2_training.png')
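SWEM here refers to the Simple Word-Embedding-based Model family: a sentence is reduced to a fixed-size vector by pooling its word embeddings before a small MLP. Below is a minimal numpy sketch of the two pooling variants, purely illustrative; the SWEM class used above is project code and also trains the MLPs configured via embedding_mlp_depth and prediction_mlp_layers.

import numpy as np

def swem_average_pooling(word_vectors):
    # word_vectors: (num_tokens, embedding_dim) -> (embedding_dim,)
    return word_vectors.mean(axis=0)

def swem_max_pooling(word_vectors):
    # element-wise max over the token axis
    return word_vectors.max(axis=0)

tokens = np.random.randn(7, 300)             # 7 tokens, 300-dim embeddings
sentence_vec = swem_average_pooling(tokens)  # shape: (300,)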
Example 2
def random_noise_test():

    embedding_dim = 300
    data_size = 1000

    # Synthetic inputs: variable-length sequences of random "word vectors"
    X = [
        np.random.randn(np.random.randint(10, 100), embedding_dim)
        for _ in range(data_size)
    ]

    # Noisy scalar targets centered on 0.5
    Y = .2 * np.random.randn(data_size) + .5

    m = SWEM(embedding_dimension=embedding_dim)

    m.train(X, Y, plotfile='../img/test_training.png')
Example 3
def get(ddir: str, ft_path: str, split: str):
    random.seed(1111)
    ddir = Path(ddir)

    ft_model = fastText.load_model(ft_path)
    swem = SWEM(ft_model)

    # Each pair has a quality label and two sentences; sentences are mapped to SWEM vectors
    quality = lf.TextDataset(str(ddir / f'quality.{split}.txt')).map(int)
    sent1 = lf.TextDataset(str(ddir / f'sent1.{split}.txt')).map(sent_preprocess(swem))
    sent2 = lf.TextDataset(str(ddir / f'sent2.{split}.txt')).map(sent_preprocess(swem))

    ds = lf.zip(quality, sent1, sent2)
    return ds
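sent_preprocess is not shown in this listing. A hypothetical sketch of what it could look like, assuming the SWEM wrapper exposes an average_pooling method as in the later examples: it returns a callable that lineflow maps over each raw sentence line.

def sent_preprocess(swem):
    def _process(sentence):
        # Hypothetical: encode one raw sentence line as a fixed-size SWEM vector
        return swem.average_pooling(sentence.strip())
    return _process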
Example 4
def test_get(ddir: str, savedir: str, bsize: int, ft_path: str):
    ddir = Path(ddir)
    savedir = Path(savedir)

    ft_model = fastText.load_model(ft_path)
    swem = SWEM(ft_model)

    # Test split: quality label plus two SWEM-encoded sentences per pair
    quality = lf.TextDataset(str(ddir / 'quality.test.txt')).map(int)
    sent1 = lf.TextDataset(str(ddir / 'sent1.test.txt')).map(sent_preprocess(swem))
    sent2 = lf.TextDataset(str(ddir / 'sent2.test.txt')).map(sent_preprocess(swem))

    ds = lf.zip(quality, sent1, sent2)

    # Cache the preprocessed dataset to disk, then wrap it in a PyTorch DataLoader
    test_dataloader = DataLoader(
        ds.save(savedir / 'swem.test.cache'),
        batch_size=bsize,
        shuffle=False,
        num_workers=4,
        collate_fn=get_collate_fn()
    )

    return test_dataloader
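get_collate_fn is likewise project code. A hedged sketch, assuming each item yielded by the zipped dataset is a (quality, sent1_vec, sent2_vec) tuple of an integer label and two fixed-size numpy vectors: the collate function stacks them into batched torch tensors.

import numpy as np
import torch

def get_collate_fn():
    def collate(batch):
        # batch: list of (quality, sent1_vec, sent2_vec) tuples
        quality, sent1, sent2 = zip(*batch)
        return (torch.tensor(quality, dtype=torch.long),
                torch.tensor(np.stack(sent1), dtype=torch.float32),
                torch.tensor(np.stack(sent2), dtype=torch.float32))
    return collate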
Example 5
def main(name_list_file, postfix, w2v_model, pkl_filename):
    # The name list CSV has columns: name, nickname, attribute
    f = open(name_list_file)
    reader = csv.reader(f)
    header = next(reader)
    nouns_set = build_nouns_set(name_list_file, postfix)

    # When loading the Word2Vec model, adjust the loading procedure to match
    # the format of the model file.

    # For KeyedVectors (saved with save_word2vec_format)
    from gensim.models import KeyedVectors
    w2v = KeyedVectors.load_word2vec_format(w2v_model, binary=True)  # for a .bin file
    # w2v = KeyedVectors.load_word2vec_format(w2v_model, binary=False)  # for a .txt file

    # For a model saved with Word2Vec.save
    # import gensim.models.doc2vec as doc2vec
    # w2v = doc2vec.Doc2Vec.load("pixiv/doc2vec.model")

    # For a fastText model
    # from gensim.models.wrappers.fasttext import FastText
    # w2v = FastText.load_fasttext_format('pixiv/fasttext-model.bin')

    tokenizer = MeCabTokenizerWithStopWord(
        mecab_args=f"-O wakati -d {mecab_system_dic}", nouns=nouns_set)
    swem = SWEM(w2v, tokenizer)

    names = []
    vecs = []
    attributes = {}
    for row in reader:
        name = row[0]
        nickname = row[1]
        attribute = row[2]
        if attribute not in attributes:
            attributes[attribute] = []

        # Read the character's text, normalize it, and pool it into a single vector
        with open("data/" + name + ".txt") as n:
            text = n.read()
            text = normalize(text)

            vec = swem.average_pooling(text)
            # vec = swem.max_pooling(text)

            if postfix in name:
                name = name.replace(postfix, "")
            names.append(name)
            vecs.append(vec)
            attributes[attribute].append(vec)

    # Build the average vector for each attribute
    for key in attributes:
        ave = np.average([v for v in attributes[key]], axis=0)
        names.append(key)
        vecs.append(ave)

    f.close()
    idolvecs = [names, vecs]

    # Save the [names, vectors] pair to a pickle file
    with open(pkl_filename, 'wb') as pkl:
        pickle.dump(idolvecs, pkl)
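A possible follow-up, not part of the original example: load the pickle written above and rank the stored names by cosine similarity to one of the vectors. The filename is whatever was passed as pkl_filename; 'idolvecs.pkl' below is assumed for illustration.

import pickle

import numpy as np

with open('idolvecs.pkl', 'rb') as pkl:  # assumed filename
    names, vecs = pickle.load(pkl)

vecs = np.array(vecs)
query = 0  # compare everything against the first stored name
norms = np.linalg.norm(vecs, axis=1)
sims = vecs @ vecs[query] / (norms * norms[query] + 1e-9)
for i in np.argsort(-sims)[:5]:
    print(names[i], float(sims[i]))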
import gzip

from elasticsearch import Elasticsearch
from gensim.models import KeyedVectors

# SWEM and MeCabTokenizer are imported from the project's own modules (not shown here).


def get_request(doc):
    # Build one bulk-index action, embedding the document text with SWEM
    return {
        "_op_type": "index",
        "_index": INDEX_NAME,
        "text": doc["text"],
        "title": doc["title"],
        "text_vector": swem.average_pooling(doc["text"]).tolist()
    }


# embedding
w2v_path = "jawiki.word_vectors.200d.txt"
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
tokenizer = MeCabTokenizer("-O wakati")
swem = SWEM(w2v, tokenizer)

# elasticsearch
client = Elasticsearch("http://es-study:9200")
BATCH_SIZE = 1000
INDEX_NAME = "wikipedia"

# Recreate the index using the mapping/settings in index.json
client.indices.delete(index=INDEX_NAME, ignore=[404])
with open("index.json") as index_file:
    source = index_file.read().strip()
    client.indices.create(index=INDEX_NAME, body=source)

docs = []
count = 0
# Stream documents from the gzipped cirrussearch dump
with gzip.open("jawikisource-20210510-cirrussearch-content.json.gz") as f:
    for line in f: