Example #1
def main():
    # ---------------
    # create corpus
    # ---------------
    word_list = cm.generate_wordlist(_CORPUS_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)

    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print('matrix')
    print(matrix)

    # -----------
    # evaluation
    # -----------
    file_list = [os.path.join(_RESULT_DIR, f) for f in os.listdir(_RESULT_DIR)]
    filenames = [os.path.basename(f) for f in file_list]
    dirnames = [os.path.join(_DST_DIR, split_fname_ext(f)) for f in filenames]
    print('dirnames')
    print(dirnames)
    make_dirs = [mkdir_if_not_exists(d) for d in dirnames]
    evaluation = [
        eval_run(_RESULT_DIR, f, word_to_id, matrix, d)
        for f, d in zip(file_list, dirnames)
    ]
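
Note: split_fname_ext, mkdir_if_not_exists, and eval_run are not defined in this excerpt. A minimal sketch of what the two path helpers are assumed to do (hypothetical implementations, not the original code):

import os


def split_fname_ext(fname):
    # Assumed helper: drop the file extension, e.g. 'result_01.txt' -> 'result_01'.
    return os.path.splitext(fname)[0]


def mkdir_if_not_exists(path):
    # Assumed helper: create the directory only if it does not exist yet.
    os.makedirs(path, exist_ok=True)
    return path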
Example #2
def main():
    # -------------
    # load dataset
    # -------------
    make_dirs = [_LIKELIHOOD_DIR, _TRAIN_DIR]
    for i in make_dirs:
        if not os.path.isdir(i):
            os.makedirs(i)

    category_data = _CATEGORY_DATA
    rne_map = convert_txt_to_dict(category_data)
    print('rne_map')
    print(rne_map)

    train_data_list = os.listdir(_LOG_DIR)
    print('train_data')
    dst_filepath = os.path.join(_LOG_DIR, 'all.csv')

    all_df = pd.DataFrame({})
    for f in train_data_list:
        # if f == 'all.csv':
        #     print('already exist all.csv')
        #     sys.exit(1)
        print(f)
        read_filepath = os.path.join(_LOG_DIR, f)
        preprocess_df = data_preprocessing(read_filepath, _COLUMNS)
        df_dependency_tag = convert_id_to_rne(preprocess_df)
        print(df_dependency_tag)
        df_concat = pd.concat([preprocess_df, df_dependency_tag], axis=1)
        print(df_concat.tail())
        target_list = [
            'new_tag', 'new_word', 'dependency_tag', 'dependency_dst'
        ]
        target_df = df_concat[target_list]
        all_df = pd.concat([all_df, target_df], axis=0)
    all_df.to_csv(dst_filepath, index=False)

    df = all_df
    print('all_df')
    print(all_df)
    print('df')
    print(df.head())
    del all_df

    # ----------------------------
    # create corpus and co-matrix
    # ----------------------------
    word_list = cm.generate_wordlist(_RECIPE_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print(matrix)
Example #3
def extract_feature(feature_name, data_dir=None, pos_dir=None, rne_dir=None):
    '''
    -----
    Input
    -----
    feature_name:
        'trigram': feature_by_trigram(vocab_size, corpus),
        'sentence': feature_by_sentence(vocab_size, word_to_id, data_dir),
        'procedure': feature_by_procedure(vocab_size, word_to_id, data_dir),
        'agent': feature_by_pos(vocab_size, word_to_id, data_dir, pos_dir, 'agent'),
        'target': feature_by_pos(vocab_size, word_to_id, data_dir, pos_dir, 'target'),
        'dest': feature_by_pos(vocab_size, word_to_id, data_dir, pos_dir, 'dest'),
        'comp': feature_by_pos(vocab_size, word_to_id, data_dir, pos_dir, 'comp'),
        'action': feature_by_action(vocab_size, word_to_id, data_dir, rne_dir),
    data_dir:
        directory containing the results of morphological analysis
    pos_dir:
        directory containing the word-segmented text
    rne_dir:
        directory containing the results of RNE analysis
    ------
    Output
    ------
    One-hot vector: np.ndarray((vocab_size, vocab_size))
    '''
    word_list = cm.generate_wordlist(data_dir)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)

    print('feature_name')
    print(feature_name)

    feature_fn = get_feature_fn(feature_name)

    if feature_name == 'trigram':
        feature = feature_fn(vocab_size, corpus)
    elif feature_name == 'sentence' or feature_name == 'procedure':
        feature = feature_fn(vocab_size, word_to_id, data_dir)
    elif feature_name == 'action':
        feature = feature_fn(vocab_size, word_to_id, data_dir, rne_dir)
    else:
        feature = feature_fn(vocab_size, word_to_id, data_dir, pos_dir,
                             feature_name)

    return feature
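
get_feature_fn is not shown in these examples; from the dispatch above it presumably maps a feature name to the corresponding feature_by_* function. A sketch under that assumption:

def get_feature_fn(feature_name):
    # Assumed dispatch table: feature name -> feature function.
    feature_fns = {
        'trigram': feature_by_trigram,
        'sentence': feature_by_sentence,
        'procedure': feature_by_procedure,
        'agent': feature_by_pos,
        'target': feature_by_pos,
        'dest': feature_by_pos,
        'comp': feature_by_pos,
        'action': feature_by_action,
    }
    return feature_fns[feature_name]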
Example #4
def main():
    data_dir = _LOG_DIR
    pos_dir = _POS_DIR
    rne_dir = _RNE_DIR
    word_list = cm.generate_wordlist(data_dir)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    feature = fs.extract_feature('trigram', data_dir, pos_dir, rne_dir)

    ppmi_score = ppmi(feature)
    print('ppmi')
    print(ppmi_score)

    plot_vector(ppmi_score, word_to_id)

    querys = ['切']
    rank_similarity(ppmi_score, querys, word_to_id, id_to_word)
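
ppmi is not defined in this excerpt; positive pointwise mutual information over a co-occurrence (or feature) matrix is usually computed along these lines (a sketch, assuming C holds co-occurrence counts):

import numpy as np


def ppmi(C, eps=1e-8):
    # PPMI(i, j) = max(0, log2(C[i, j] * N / (S[i] * S[j]))),
    # where N is the total count and S the per-word marginal counts.
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[i] * S[j]) + eps)
            M[i, j] = max(0, pmi)
    return M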
Example #5
def feature_by_sentence(vocab_size, word_to_id, data_dir):
    '''
    # ---------------------------
    # feature separate sentence
    # ---------------------------
    # separate by sentence
    '''
    print('################ sentence ################')
    word_list = cm.generate_wordlist(data_dir)
    sentence_array = separate_sentence(word_list)
    sentence_array_id = np.array(
        [np.array([word_to_id[w] for w in l]) for l in sentence_array],
        dtype=object)  # sentences have different lengths, so keep an object array

    feature = is_exist_word(
        vocab_size,
        sentence_array_id,
    )
    del sentence_array, sentence_array_id, word_list
    gc.collect()

    return feature
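
separate_sentence and is_exist_word are not included in the snippet. Judging from the (vocab_size, vocab_size) output described for extract_feature, is_exist_word presumably marks which word ids appear together in the same sentence; a rough sketch under that assumption:

import numpy as np


def is_exist_word(vocab_size, sentence_array_id):
    # Assumed behaviour: indicator matrix of word pairs that share a sentence.
    feature = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    for sentence_ids in sentence_array_id:
        for i in sentence_ids:
            for j in sentence_ids:
                feature[i, j] = 1
    return feature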
Example #6
def main():
    # ---------------
    # create feature
    # ---------------
    word_list = cm.generate_wordlist(_LOG_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    matrix = fs.extract_feature('agent', _LOG_DIR, _POS_DIR, _RNE_DIR)
    print('matrix')
    print(matrix)

    # -----------
    # evaluation
    # -----------
    file_list = [os.path.join(_RESULT_DIR, f) for f in os.listdir(_RESULT_DIR)]
    filenames = [os.path.basename(f) for f in file_list]
    dirnames = [os.path.join(_DST_DIR, split_fname_ext(f)) for f in filenames]
    print('dirnames')
    print(dirnames)
    make_dirs = [mkdir_if_not_exists(d) for d in dirnames]
    evaluation = [eval_run(_RESULT_DIR, f, word_to_id, matrix, d)
                  for f, d in zip(file_list, dirnames)]
Example #7
def main():
    word_list = cm.generate_wordlist(_CORPUS_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)

    df = pd.read_csv('lr_train_20190425.csv')
    print(df.head())

    # # ---------------------------
    # # adjust the number of data
    # # ---------------------------
    # df_0 = df[df['label'] == 0]
    # df_1 = df[df['label'] == 1]
    # print('0')
    # print(len(df_0))
    # print('1')
    # print(len(df_1))

    # X_0 = df_0[:4000]
    # X_1 = df_1

    # df = pd.concat([X_0, X_1])
    # print(len(df))
    # # ---------------------------

    # -------
    # train
    # -------
    X_org_word = df['org'].values
    X_dst_word = df['dst'].values
    y = df['label'].values

    X_org_to_id = np.array([word_to_id[x] for x in X_org_word])
    X_dst_to_id = np.array([word_to_id[x] for x in X_dst_word])
    print('X_org_to_id')
    print(type(X_org_to_id))
    print('X_dst_to_id')
    print(type(X_dst_to_id))

    print('X_org_to_id')
    print(X_org_to_id)
    print('X_dst_to_id')
    print(X_dst_to_id)

    del df
    del X_org_word, X_dst_word
    del word_to_id, id_to_word
    gc.collect()

    matrix = fs.extract_feature('procedure', _CORPUS_DIR)
    print('matrix')
    print(matrix)
    print('matrix shape')
    print(matrix.shape)
    org_split_ids = np.array_split(X_org_to_id, 10)
    dst_split_ids = np.array_split(X_dst_to_id, 10)
    # print('org_split_ids')
    # print(org_split_ids)
    # print(len(org_split_ids))
    # print('dst_split_ids')
    # print(dst_split_ids)
    # print(len(dst_split_ids))
    X = np.zeros((len(y)))
    # print('X')
    # print(X)
    # print(X.shape)
    offset = 0
    for org_ids, dst_ids in zip(org_split_ids, dst_split_ids):
        # process the id pairs chunk by chunk
        for i, (org, dst) in enumerate(zip(org_ids, dst_ids)):
            # feature for a pair: dot product of the two co-occurrence rows
            X_org_feature = matrix[org]
            X_dst_feature = matrix[dst]
            X[offset + i] = np.dot(X_org_feature, X_dst_feature)
        offset += len(org_ids)

    X = X[:, np.newaxis]
    print('np.newaxis')
    print(X)

    print('X')
    print(X)
    print(X.shape)
    print('y')
    print(y.shape)

    scaler = MinMaxScaler()
    X_scaler = scaler.fit_transform(X)
    print('MinMaxScaler')
    print(X_scaler)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaler, y, test_size=0.2, random_state=0
    )

    clf = LogisticRegression(
        random_state=0,
        solver='liblinear',
    ).fit(X_train, y_train)

    joblib.dump(clf, 'lr.pkl')

    # ------
    # eval
    # ------
    print(clf.score(X_test, y_test))
    pred = clf.predict(X_test)
    print(accuracy_score(pred, y_test))
    print(classification_report(pred, y_test))
    print(confusion_matrix(pred, y_test))


    print(clf.predict_proba(X_test))
Example #8
def main():
    word_list = cm.generate_wordlist(_CORPUS_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)

    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print('matrix')
    print(matrix)

    df = pd.read_csv('lr_train_20190425.csv')
    print(df.head())

    # # ---------------------------
    # # adjust the number of data
    # # ---------------------------
    # df_0 = df[df['label'] == 0]
    # df_1 = df[df['label'] == 1]
    # print('0')
    # print(len(df_0))
    # print('1')
    # print(len(df_1))

    # X_0 = df_0[:4000]
    # X_1 = df_1

    # df = pd.concat([X_0, X_1])
    # print(len(df))
    # # ---------------------------

    # -------
    # train
    # -------
    X_org_word = df['org'].values
    X_dst_word = df['dst'].values
    y = df['label'].values

    # print('X_org_word')
    # print(X_org_word)
    # print('X_dst_word')
    # print(X_dst_word)
    # print('y')
    # print(y)

    X_org_to_id = np.array([word_to_id[x] for x in X_org_word])
    X_dst_to_id = np.array([word_to_id[x] for x in X_dst_word])

    # print('X_org_to_id')
    # print(X_org_to_id)
    # print('X_dst_to_id')
    # print(X_dst_to_id)

    X_org_feature = np.array([matrix[x] for x in X_org_to_id])
    X_dst_feature = np.array([matrix[x] for x in X_dst_to_id])

    # print('X_org_feature')
    # print(X_org_feature)
    # print('X_dst_feature')
    # print(X_dst_feature)

    X = np.array([np.dot(x, y) for x, y in zip(X_org_feature, X_dst_feature)])
    X = X[:, np.newaxis]

    # print('X')
    # print(X)
    # print(len(X))

    scaler = MinMaxScaler()
    X_scaler = scaler.fit_transform(X)
    print('MinMaxScaler')
    print(X_scaler)

    X_train, X_test, y_train, y_test = train_test_split(X_scaler,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    clf = LogisticRegression(
        random_state=0,
        solver='liblinear',
    ).fit(X_train, y_train)

    joblib.dump(clf, 'lr.pkl')

    # ------
    # eval
    # ------
    print(clf.score(X_test, y_test))
    pred = clf.predict(X_test)
    print(accuracy_score(pred, y_test))
    print(classification_report(pred, y_test))
    print(confusion_matrix(pred, y_test))

    print(clf.predict_proba(X_test))
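
Both variants persist the classifier with joblib.dump(clf, 'lr.pkl') but not the fitted MinMaxScaler. A minimal sketch of how the saved model could be reused on a new word pair (assuming the scaler is refit or saved separately, and that matrix is the same feature matrix used during training; predict_pair is a hypothetical helper, not part of the original code):

import joblib
import numpy as np


def predict_pair(org_id, dst_id, matrix, scaler, model_path='lr.pkl'):
    # Rebuild the training-time feature (dot product of the two feature rows),
    # scale it with the same scaler, then predict with the loaded classifier.
    clf = joblib.load(model_path)
    x = np.dot(matrix[org_id], matrix[dst_id])
    x = scaler.transform(np.array([[x]]))
    return clf.predict(x), clf.predict_proba(x)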
Example #9
def main():
    # -----------------------------
    # create corpus and co-matrix
    # -----------------------------
    word_list = cm.generate_wordlist(_RECIPE_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    # output corpus to txt
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print('matrix')
    print(matrix)

    # ---------------------
    # generate label data
    # ---------------------
    label = np.array([x for x in id_to_word.values()])
    label = label[:, np.newaxis]
    print('label')
    print(label)

    # ------------------------
    # generate category data
    # ------------------------
    category_label_data = generate_arc_category_data(_ANNOTATION_DIR)
    unique_category = category_label_data['arclabel'].unique()
    print(category_label_data.head())
    print(category_label_data.tail())
    print(unique_category)

    # ----------------------------
    # generate feature and label
    # ----------------------------
    category_label_data['feature_org_idx'] = category_label_data['new_word']\
      .apply(lambda x: word_to_id[x])
    category_label_data['feature_dst_idx'] = category_label_data['dependency_dst']\
      .apply(lambda x: word_to_id[x])
    category_label_data['feature_org'] = category_label_data['feature_org_idx']\
      .apply(lambda x: matrix[x])
    category_label_data['feature_dst'] = category_label_data['feature_dst_idx']\
      .apply(lambda x: matrix[x])
    print('category_label_data')
    print(category_label_data)

    extend_feature = extend_columns(
        category_label_data['feature_org'], category_label_data['feature_dst']
    )
    print('extend_feature')
    print(extend_feature)
    print(extend_feature.shape)
    X = extend_feature

    category_map = category_mapping(unique_category)
    print('category_map')
    print(category_map)
    category_label = category_label_data['arclabel'].values
    category_label = category_label.flatten()
    print('category_label')
    print(category_label)
    y = convert_category_to_numerical(category_label, category_map)
    print('y')
    print(y)

    # ----------
    # training
    # ----------
    print('dataset size')
    print('X: {0} , y:{1}'.format(X.shape, y.shape))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    clf = SVC(kernel='linear', C=1)
    t0 = time.time()
    clf.fit(X_train, y_train)
    joblib.dump(word_to_id, 'word_to_id.pkl')
    joblib.dump(matrix, 'matrix.pkl')
    joblib.dump(clf, 'svc.pkl')
    t1 = time.time()
    print('exec time : {}'.format(t1 - t0))

    # ------------
    # validation
    # ------------
    prediction_map = {k: v for v, k in category_map.items()}
    joblib.dump(prediction_map, 'prediction_map.pkl')
    print(clf.score(X_test, y_test))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    print(classification_report(
        y_test,
        clf.predict(X_test),
        # target_names=category_map.values()
    ))

    # # tamanegi test
    # print('**************** tamanegi-surioro ****************')
    # onion_id = word_to_id['玉ねぎ']
    # print('onion_id')
    # print(onion_id)
    # suri_id = word_to_id['すりおろ']
    # print('suri_id')
    # print(suri_id)
    # onion_feature = matrix[0]
    # suri_feature = matrix[2]
    # sample_feature = np.hstack((onion_feature, suri_feature)).flatten()
    # print('sample_feature')
    # print(sample_feature)
    # print(clf.predict([sample_feature]))
    # pred = clf.predict([sample_feature])
    # print(prediction_map[pred[0]])

    # model load
    load_model = joblib.load('svc.pkl')
    print('load_model')
    print(load_model)
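
extend_columns, category_mapping, and convert_category_to_numerical are not shown. The commented-out test above builds a sample feature with np.hstack((...)).flatten(), which suggests implementations roughly like the following (assumed, not the original code):

import numpy as np


def extend_columns(org_features, dst_features):
    # Assumed: concatenate each (org, dst) pair of feature rows into one vector.
    return np.array([np.hstack((o, d)).flatten()
                     for o, d in zip(org_features, dst_features)])


def category_mapping(unique_category):
    # Assumed: arc label -> numeric class id.
    return {label: idx for idx, label in enumerate(unique_category)}


def convert_category_to_numerical(category_label, category_map):
    # Assumed: translate label strings into numeric class ids.
    return np.array([category_map[label] for label in category_label])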
Example #10
def main():
    data_dir = _LOG_DIR
    pos_dir = _POS_DIR
    rne_dir = _RNE_DIR
    word_list = cm.generate_wordlist(data_dir)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    feature = fs.extract_feature('trigram', data_dir, pos_dir, rne_dir)

    ppmi_score = vec.ppmi(feature)
    print('ppmi')
    print(ppmi_score)

    vectors = compute_svd(ppmi_score, 2)
    print('vectors')
    print(vectors)
    print(vectors.shape)

    # vec.plot_vector(ppmi_score, word_to_id)

    sample_querys = ['油抜き']
    querys = '油抜き'
    vec.rank_similarity(ppmi_score, sample_querys, word_to_id, id_to_word)
    query_id = word_to_id[querys]
    query_vec = vectors[query_id]
    print('query_vec')
    print(query_vec)

    json_list = os.listdir(_AC_DIR)
    row = len(json_list)
    column = vectors.shape[1]
    print(row)
    print(column)

    all_recipe_vector = np.zeros((row, column))
    for idx, j in enumerate(json_list):
        recipe_score = np.zeros(column)
        jsonfile = os.path.join(_AC_DIR, j)
        with open(jsonfile, 'r', encoding='utf-8') as r:
            jsondata = json.load(r)
        print('idx')
        print(idx)
        print(jsonfile)
        print('jsondata')
        print(jsondata)
        for k, v in jsondata.items():
            print('key')
            print(k)
            print('value')
            print(v)
            #####################################################
            # TODO: joined words are not included in word_to_id #
            #####################################################
            try:
                query_id = word_to_id[k]
                query_vector = vectors[query_id]
                print('query_vector')
                print(query_vector)
            except KeyError:
                print('{} is not included in word_to_id'.format(k))
                time.sleep(3)
                continue

            print('recipe_score', recipe_score)
            print('v', v)
            recipe_score += query_vector * v
        all_recipe_vector[idx][0] = recipe_score[0]
        all_recipe_vector[idx][1] = recipe_score[1]
    print(all_recipe_vector)

    kmeans = KMeans(n_clusters=3,
                    random_state=_RANDOM_SEED).fit(all_recipe_vector)
    print('label')
    print(kmeans.labels_)
    labels = kmeans.labels_

    for feature, label in zip(all_recipe_vector, labels):
        plt.scatter(feature[0], feature[1], c=_COLOR[label])
    plt.show()
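
compute_svd is not listed here; reducing the PPMI matrix to k dimensions per word is typically done with a singular value decomposition, along these lines (a sketch; the original implementation may differ):

import numpy as np


def compute_svd(matrix, k):
    # Keep the first k left-singular directions as the word vectors.
    U, S, Vt = np.linalg.svd(matrix)
    return U[:, :k]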
Example #11
def main():
    # -------------
    # load dataset
    # -------------
    make_dirs = [_LIKELIHOOD_DIR, _TRAIN_DIR]
    for i in make_dirs:
        if not os.path.isdir(i):
            os.makedirs(i)

    category_data = _CATEGORY_DATA
    rne_map = convert_txt_to_dict(category_data)
    print('rne_map')
    print(rne_map)

    train_data_list = os.listdir(_LOG_DIR)
    print('train_data')
    dst_filepath = os.path.join(_LOG_DIR, 'all.csv')

    all_df = pd.DataFrame({})
    for f in train_data_list:
        # if f == 'all.csv':
        #     print('already exist all.csv')
        #     sys.exit(1)
        print(f)
        read_filepath = os.path.join(_LOG_DIR, f)
        preprocess_df = data_preprocessing(read_filepath, _COLUMNS)
        df_dependency_tag = convert_id_to_rne(preprocess_df)
        print(df_dependency_tag)
        df_concat = pd.concat([preprocess_df, df_dependency_tag], axis=1)
        print(df_concat.tail())
        target_list = [
            'new_tag', 'new_word', 'dependency_tag', 'dependency_dst'
        ]
        target_df = df_concat[target_list]
        all_df = pd.concat([all_df, target_df], axis=0)
    all_df.to_csv(dst_filepath, index=False)

    dst_file = os.path.join(_TRAIN_DIR, 'lr_train.csv')
    df = all_df
    print('all_df')
    print(all_df)
    print('df')
    print(df.head())
    del all_df

    # ----------------------------
    # create corpus and co-matrix
    # ----------------------------
    word_list = cm.generate_wordlist(_RECIPE_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print(matrix)

    # -------------------------
    # label to one-hot-encode
    # -------------------------
    enc = OneHotEncoder()
    label_data = df['new_tag'].values
    label_reshape = label_data[:, np.newaxis]
    print('label_data')
    print(label_data)
    enc.fit(label_reshape)
    onehotlabel = enc.transform(label_reshape).toarray()
    print('onehotlabel')
    print(onehotlabel)

    # ------------------------------------
    # join feature and one-hot-encode
    # ------------------------------------
    category_label_data = df
    category_label_data['feature_org_idx'] = category_label_data['new_word']\
      .apply(lambda x: word_to_id[x])
    category_label_data['feature_org'] = category_label_data['feature_org_idx']\
      .apply(lambda x: matrix[x])

    feature_matrix = category_label_data['feature_org'].values
    train_feature_matrix = np.array([x.flatten() for x in feature_matrix])
    print('train_feature_matrix')
    print(train_feature_matrix)
    print(train_feature_matrix.shape)

    print(onehotlabel.shape)
    train_data = np.hstack((train_feature_matrix, onehotlabel))
    print(train_data)
    print(train_data.shape)