Example #1
def main():
    data_dir = _LOG_DIR
    pos_dir = _POS_DIR
    rne_dir = _RNE_DIR
    # Build the vocabulary and map every token to an integer id.
    word_list = cm.generate_wordlist(data_dir)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    # Trigram co-occurrence features over the corpus.
    feature = fs.extract_feature('trigram', data_dir, pos_dir, rne_dir)

    # Reweight the raw co-occurrence counts with positive PMI.
    ppmi_score = ppmi(feature)
    print('ppmi')
    print(ppmi_score)

    plot_vector(ppmi_score, word_to_id)

    # Rank the vocabulary by similarity to the query word.
    queries = ['切']
    rank_similarity(ppmi_score, queries, word_to_id, id_to_word)
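ppmi, plot_vector, and rank_similarity are project-local helpers that are not shown on this page. A minimal sketch of what ppmi and rank_similarity typically compute, with the real signatures treated as assumptions rather than taken from the project:

import numpy as np

def ppmi(C, eps=1e-8):
    # Positive pointwise mutual information over a co-occurrence count matrix C.
    total = C.sum()
    row = C.sum(axis=1, keepdims=True)
    col = C.sum(axis=0, keepdims=True)
    pmi = np.log2(C * total / (row @ col + eps) + eps)
    return np.maximum(pmi, 0)  # clamp negative PMI to zero

def rank_similarity(score, queries, word_to_id, id_to_word, top=5):
    # Cosine similarity between each query row and every row of the matrix.
    norm = score / (np.linalg.norm(score, axis=1, keepdims=True) + 1e-8)
    for q in queries:
        sims = norm @ norm[word_to_id[q]]
        for i in np.argsort(-sims)[1:top + 1]:  # skip the query itself
            print(q, id_to_word[i], sims[i])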
Example #2
def main():
    # ---------------
    # create feature
    # ---------------
    word_list = cm.generate_wordlist(_LOG_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    matrix = fs.extract_feature('agent', _LOG_DIR, _POS_DIR, _RNE_DIR)
    print('matrix')
    print(matrix)

    # -----------
    # evaluation
    # -----------
    file_list = [os.path.join(_RESULT_DIR, f) for f in os.listdir(_RESULT_DIR)]
    filenames = [os.path.basename(f) for f in file_list]
    # One output directory per result file, named after the file minus its extension.
    dirnames = [os.path.join(_DST_DIR, split_fname_ext(f)) for f in filenames]
    print('dirnames')
    print(dirnames)
    # dirnames already include _DST_DIR, so create them as-is.
    for d in dirnames:
        mkdir_if_not_exists(d)
    evaluation = [eval_run(_RESULT_DIR, f, word_to_id, matrix, d)
                  for f, d in zip(file_list, dirnames)]
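mkdir_if_not_exists and split_fname_ext are project helpers; under the obvious reading they reduce to standard-library calls. A sketch, with both implementations assumed rather than taken from the project:

import os

def mkdir_if_not_exists(path):
    # os.makedirs with exist_ok=True is a no-op when the directory already exists.
    os.makedirs(path, exist_ok=True)

def split_fname_ext(filename):
    # 'run_01.json' -> 'run_01'
    return os.path.splitext(filename)[0]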
Example #3
def main():
    word_list = cm.generate_wordlist(_CORPUS_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)

    # Labeled word pairs: columns 'org', 'dst', 'label'.
    df = pd.read_csv('lr_train_20190425.csv')
    print(df.head())

    # # ------------------------------
    # # adjust the number of samples
    # # ------------------------------
    # df_0 = df[df['label'] == 0]
    # df_1 = df[df['label'] == 1]
    # print('0')
    # print(len(df_0))
    # print('1')
    # print(len(df_1))

    # X_0 = df_0[:4000]
    # X_1 = df_1

    # df = pd.concat([X_0, X_1])
    # print(len(df))
    # # ---------------------------

    # -------
    # train
    # -------
    X_org_word = df['org'].values
    X_dst_word = df['dst'].values
    y = df['label'].values

    X_org_to_id = np.array([word_to_id[x] for x in X_org_word])
    X_dst_to_id = np.array([word_to_id[x] for x in X_dst_word])
    print('X_org_to_id')
    print(type(X_org_to_id))
    print('X_dst_to_id')
    print(type(X_dst_to_id))

    print('X_org_to_id')
    print(X_org_to_id)
    print('X_dst_to_id')
    print(X_dst_to_id)

    del df
    del X_org_word, X_dst_word
    del word_to_id, id_to_word
    gc.collect()

    # Feature matrix; rows are indexed by word id.
    matrix = fs.extract_feature(_CORPUS_DIR, 'procedure')
    print('matrix')
    print(matrix)
    print('matrix shape')
    print(matrix.shape)
    # Process the id arrays in 10 chunks to keep memory use down.
    org_split_ids = np.array_split(X_org_to_id, 10)
    dst_split_ids = np.array_split(X_dst_to_id, 10)
    # One feature per labeled pair: the dot product of the two word vectors.
    X = np.zeros(len(y))
    sample_idx = 0
    for org_ids, dst_ids in zip(org_split_ids, dst_split_ids):
        for org, dst in zip(org_ids, dst_ids):
            X[sample_idx] = np.dot(matrix[org], matrix[dst])
            sample_idx += 1

    # LogisticRegression expects a 2-D design matrix: (n_samples, 1).
    X = X[:, np.newaxis]
    print('X')
    print(X)
    print(X.shape)
    print('y')
    print(y.shape)

    # Scale the single feature into [0, 1].
    scaler = MinMaxScaler()
    X_scaler = scaler.fit_transform(X)
    print('MinMaxScaler')
    print(X_scaler)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaler, y, test_size=0.2, random_state=0
    )

    clf = LogisticRegression(
        random_state=0,
        solver='liblinear',
    ).fit(X_train, y_train)

    # Persist the trained model for later reuse.
    joblib.dump(clf, 'lr.pkl')

    # ------
    # eval
    # ------
    print(clf.score(X_test, y_test))
    pred = clf.predict(X_test)
    # sklearn metrics expect (y_true, y_pred).
    print(accuracy_score(y_test, pred))
    print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))

    print(clf.predict_proba(X_test))
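The pipeline above reduces each word pair to a single dot-product feature and fits a logistic regression on it. A self-contained sketch of the same idea on synthetic data (all shapes and values here are illustrative, not the project's):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
emb = rng.normal(size=(100, 16))      # 100 word vectors, 16 dimensions
org = rng.integers(0, 100, size=500)  # word ids for one side of each pair
dst = rng.integers(0, 100, size=500)  # word ids for the other side
y = rng.integers(0, 2, size=500)      # synthetic binary labels

# One feature per pair: the dot product of the two word vectors.
X = np.einsum('ij,ij->i', emb[org], emb[dst])[:, np.newaxis]
X = MinMaxScaler().fit_transform(X)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(solver='liblinear', random_state=0).fit(X_tr, y_tr)
print(clf.score(X_te, y_te))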
Example #4
def main():
    data_dir = _LOG_DIR
    pos_dir = _POS_DIR
    rne_dir = _RNE_DIR
    # Build the vocabulary and trigram co-occurrence features, as in Example #1.
    word_list = cm.generate_wordlist(data_dir)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    feature = fs.extract_feature('trigram', data_dir, pos_dir, rne_dir)

    ppmi_score = vec.ppmi(feature)
    print('ppmi')
    print(ppmi_score)

    # Reduce the PPMI matrix to 2-dimensional word vectors.
    vectors = compute_svd(ppmi_score, 2)
    print('vectors')
    print(vectors)
    print(vectors.shape)

    # vec.plot_vector(ppmi_score, word_to_id)

    # Rank similar words for a sample query, then look up its 2-D vector.
    sample_queries = ['油抜き']
    query = '油抜き'
    vec.rank_similarity(ppmi_score, sample_queries, word_to_id, id_to_word)
    query_id = word_to_id[query]
    query_vec = vectors[query_id]
    print('query_vec')
    print(query_vec)

    # Build one 2-D vector per recipe as a weighted sum of its word vectors.
    json_list = os.listdir(_AC_DIR)
    row = len(json_list)
    column = vectors.shape[1]
    print(row)
    print(column)

    all_recipe_vector = np.zeros((row, column))
    for idx, j in enumerate(json_list):
        recipe_score = np.zeros(column)
        jsonfile = os.path.join(_AC_DIR, j)
        with open(jsonfile, 'r', encoding='utf-8') as r:
            jsondata = json.load(r)
        print('idx')
        print(idx)
        print(jsonfile)
        print('jsondata')
        print(jsondata)
        for k, v in jsondata.items():
            print('key')
            print(k)
            print('value')
            print(v)
            #####################################################
            # TODO: joined words may be missing from word_to_id #
            #####################################################
            try:
                query_id = word_to_id[k]
                query_vector = vectors[query_id]
                print('query_vector')
                print(query_vector)
            except KeyError:
                print('{} is not included in word_to_id'.format(k))
                time.sleep(3)
                continue

            print('recipe_score', recipe_score)
            print('v', v)
            # Accumulate the word vector weighted by its value in the recipe JSON.
            recipe_score += query_vector * v
        all_recipe_vector[idx] = recipe_score
    print(all_recipe_vector)

    # Cluster the recipe vectors and plot them colored by cluster label.
    kmeans = KMeans(n_clusters=3,
                    random_state=_RANDOM_SEED).fit(all_recipe_vector)
    print('label')
    print(kmeans.labels_)
    labels = kmeans.labels_

    # Scatter each recipe at its 2-D coordinates, colored by cluster.
    for point, label in zip(all_recipe_vector, labels):
        plt.scatter(point[0], point[1], c=_COLOR[label])
    plt.show()
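compute_svd is also project-local. A plausible minimal version built on scikit-learn's TruncatedSVD, with the signature inferred from the call compute_svd(ppmi_score, 2) above:

from sklearn.decomposition import TruncatedSVD

def compute_svd(matrix, n_components):
    # Rows stay aligned with word ids; columns are the reduced dimensions.
    svd = TruncatedSVD(n_components=n_components, random_state=0)
    return svd.fit_transform(matrix)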