def main():
    # ---------------
    # create corpus
    # ---------------
    word_list = cm.generate_wordlist(_CORPUS_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print('matrix')
    print(matrix)

    # -----------
    # evaluation
    # -----------
    file_list = [os.path.join(_RESULT_DIR, f) for f in os.listdir(_RESULT_DIR)]
    filenames = [os.path.basename(f) for f in file_list]
    dirnames = [os.path.join(_DST_DIR, split_fname_ext(f)) for f in filenames]
    print('dirnames')
    print(dirnames)
    # dirnames already include _DST_DIR, so pass them to mkdir as-is
    make_dirs = [mkdir_if_not_exists(d) for d in dirnames]
    evaluation = [
        eval_run(_RESULT_DIR, f, word_to_id, matrix, d)
        for f, d in zip(file_list, dirnames)
    ]
def main():
    # -------------
    # load dataset
    # -------------
    make_dirs = [_LIKELIHOOD_DIR, _TRAIN_DIR]
    for i in make_dirs:
        if not os.path.isdir(i):
            os.makedirs(i)

    category_data = _CATEGORY_DATA
    rne_map = convert_txt_to_dict(category_data)
    print('rne_map')
    print(rne_map)

    train_data_list = os.listdir(_LOG_DIR)
    print('train_data')
    dst_filepath = os.path.join(_LOG_DIR, 'all.csv')
    all_df = pd.DataFrame({})
    for f in train_data_list:
        # if f == 'all.csv':
        #     print('already exist all.csv')
        #     sys.exit(1)
        print(f)
        read_filepath = os.path.join(_LOG_DIR, f)
        preprocess_df = data_preprocessing(read_filepath, _COLUMNS)
        df_dependency_tag = convert_id_to_rne(preprocess_df)
        print(df_dependency_tag)
        df_concat = pd.concat([preprocess_df, df_dependency_tag], axis=1)
        print(df_concat.tail())
        target_list = [
            'new_tag', 'new_word', 'dependency_tag', 'dependency_dst'
        ]
        target_df = df_concat[target_list]
        all_df = pd.concat([all_df, target_df], axis=0)
    all_df.to_csv(dst_filepath, index=False)

    df = all_df
    print('all_df')
    print(all_df)
    print('df')
    print(df.head())
    del all_df

    # ----------------------------
    # create corpus and co-matrix
    # ----------------------------
    word_list = cm.generate_wordlist(_RECIPE_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print(matrix)
def extract_feature(feature_name, data_dir=None, pos_dir=None, rne_dir=None):
    '''
    -----
    Input
    -----
    feature_name:
        'trigram':   feature_by_trigram(vocab_size, corpus)
        'sentence':  feature_by_sentence(vocab_size, word_to_id, data_dir)
        'procedure': feature_by_procedure(vocab_size, word_to_id, data_dir)
        'agent':     feature_by_pos(vocab_size, word_to_id, data_dir, pos_dir, 'agent')
        'target':    feature_by_pos(vocab_size, word_to_id, data_dir, pos_dir, 'target')
        'dest':      feature_by_pos(vocab_size, word_to_id, data_dir, pos_dir, 'dest')
        'comp':      feature_by_pos(vocab_size, word_to_id, data_dir, pos_dir, 'comp')
        'action':    feature_by_action(vocab_size, word_to_id, data_dir, rne_dir)
    data_dir: directory containing the results of morphological analysis
    pos_dir: directory containing the texts split into words
    rne_dir: directory containing the results of RNE analysis

    ------
    Output
    ------
    one-hot vectors: np.ndarray of shape (vocab_size, vocab_size)
    '''
    word_list = cm.generate_wordlist(data_dir)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    print('feature_name')
    print(feature_name)

    # dispatch to the feature builder that matches feature_name
    feature_fn = get_feature_fn(feature_name)
    if feature_name == 'trigram':
        feature = feature_fn(vocab_size, corpus)
    elif feature_name in ('sentence', 'procedure'):
        feature = feature_fn(vocab_size, word_to_id, data_dir)
    elif feature_name == 'action':
        feature = feature_fn(vocab_size, word_to_id, data_dir, rne_dir)
    else:
        feature = feature_fn(vocab_size, word_to_id, data_dir, pos_dir, feature_name)
    return feature
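# get_feature_fn() is referenced above but not shown in this snippet. Given the
# name-to-function mapping listed in the docstring, a minimal dispatch sketch
# could look like the following (the feature_by_* helpers are assumed to be
# defined elsewhere in this module):
def get_feature_fn(feature_name):
    # 'agent', 'target', 'dest' and 'comp' all map to feature_by_pos; the
    # caller passes the concrete tag as the last argument.
    feature_fn_map = {
        'trigram': feature_by_trigram,
        'sentence': feature_by_sentence,
        'procedure': feature_by_procedure,
        'agent': feature_by_pos,
        'target': feature_by_pos,
        'dest': feature_by_pos,
        'comp': feature_by_pos,
        'action': feature_by_action,
    }
    return feature_fn_map[feature_name]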
def main():
    data_dir = _LOG_DIR
    pos_dir = _POS_DIR
    rne_dir = _RNE_DIR

    word_list = cm.generate_wordlist(data_dir)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)

    feature = fs.extract_feature('trigram', data_dir, pos_dir, rne_dir)
    ppmi_score = ppmi(feature)
    print('ppmi')
    print(ppmi_score)
    plot_vector(ppmi_score, word_to_id)

    querys = ['切']
    rank_similarity(ppmi_score, querys, word_to_id, id_to_word)
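# ppmi() is imported from the vector utilities and is not shown here. For
# reference, a minimal positive PMI sketch over a co-occurrence matrix C; this
# is an assumption about what the helper computes, not its actual code:
def ppmi_sketch(C, eps=1e-8):
    # PPMI(i, j) = max(0, log2(P(i, j) / (P(i) * P(j))))
    N = np.sum(C)                    # total co-occurrence count
    word_counts = np.sum(C, axis=0)  # marginal count per word
    M = np.zeros_like(C, dtype=np.float32)
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (word_counts[i] * word_counts[j]) + eps)
            M[i, j] = max(0.0, pmi)
    return M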
def feature_by_sentence(vocab_size, word_to_id, data_dir):
    '''
    Sentence-level feature: split the corpus into sentences and build a
    per-word feature from sentence membership (see is_exist_word).
    '''
    print('################ sentence ################')
    word_list = cm.generate_wordlist(data_dir)
    sentence_array = separate_sentence(word_list)
    sentence_array_id = np.array(
        [np.array([word_to_id[w] for w in l]) for l in sentence_array])
    feature = is_exist_word(vocab_size, sentence_array_id)
    # free the intermediate arrays before returning the large feature matrix
    del sentence_array, sentence_array_id, word_list
    gc.collect()
    return feature
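# separate_sentence() and is_exist_word() are defined elsewhere. Based on the
# (vocab_size, vocab_size) output documented in extract_feature(), is_exist_word
# presumably marks, for each word, the words that share a sentence with it.
# A minimal sketch under that assumption:
def is_exist_word_sketch(vocab_size, sentence_array_id):
    feature = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    for sentence_ids in sentence_array_id:
        for i in sentence_ids:
            # mark every word id that co-occurs with word i in this sentence
            feature[i, sentence_ids] = 1
    return feature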
def main():
    # ---------------
    # create feature
    # ---------------
    word_list = cm.generate_wordlist(_LOG_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    matrix = fs.extract_feature('agent', _LOG_DIR, _POS_DIR, _RNE_DIR)
    print('matrix')
    print(matrix)

    # -----------
    # evaluation
    # -----------
    file_list = [os.path.join(_RESULT_DIR, f) for f in os.listdir(_RESULT_DIR)]
    filenames = [os.path.basename(f) for f in file_list]
    dirnames = [os.path.join(_DST_DIR, split_fname_ext(f)) for f in filenames]
    print('dirnames')
    print(dirnames)
    # dirnames already include _DST_DIR, so pass them to mkdir as-is
    make_dirs = [mkdir_if_not_exists(d) for d in dirnames]
    evaluation = [
        eval_run(_RESULT_DIR, f, word_to_id, matrix, d)
        for f, d in zip(file_list, dirnames)
    ]
def main():
    word_list = cm.generate_wordlist(_CORPUS_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)

    df = pd.read_csv('lr_train_20190425.csv')
    print(df.head())

    # # ---------------------------
    # # adjust the number of data
    # # ---------------------------
    # df_0 = df[df['label'] == 0]
    # df_1 = df[df['label'] == 1]
    # print('0')
    # print(len(df_0))
    # print('1')
    # print(len(df_1))
    # X_0 = df_0[:4000]
    # X_1 = df_1
    # df = pd.concat([X_0, X_1])
    # print(len(df))
    # # ---------------------------

    # -------
    # train
    # -------
    X_org_word = df['org'].values
    X_dst_word = df['dst'].values
    y = df['label'].values
    X_org_to_id = np.array([word_to_id[x] for x in X_org_word])
    X_dst_to_id = np.array([word_to_id[x] for x in X_dst_word])
    print('X_org_to_id')
    print(type(X_org_to_id))
    print('X_dst_to_id')
    print(type(X_dst_to_id))
    print('X_org_to_id')
    print(X_org_to_id)
    print('X_dst_to_id')
    print(X_dst_to_id)

    del df
    del X_org_word, X_dst_word
    del word_to_id, id_to_word
    gc.collect()

    matrix = fs.extract_feature('procedure', _CORPUS_DIR)
    print('matrix')
    print(matrix)
    print('matrix shape')
    print(matrix.shape)

    # process the word-id pairs in 10 chunks to keep memory usage down
    org_split_ids = np.array_split(X_org_to_id, 10)
    dst_split_ids = np.array_split(X_dst_to_id, 10)

    # one scalar feature per sample: dot product of the org and dst vectors
    X = np.zeros((len(y)))
    sample_idx = 0
    for org_ids, dst_ids in zip(org_split_ids, dst_split_ids):
        for org, dst in zip(org_ids, dst_ids):
            X_org_feature = matrix[org]
            X_dst_feature = matrix[dst]
            X[sample_idx] = np.dot(X_org_feature, X_dst_feature)
            sample_idx += 1

    X = X[:, np.newaxis]
    print('np.newaxis')
    print(X)
    print('X')
    print(X)
    print(X.shape)
    print('y')
    print(y.shape)

    scaler = MinMaxScaler()
    X_scaler = scaler.fit_transform(X)
    print('MinMaxScaler')
    print(X_scaler)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaler, y, test_size=0.2, random_state=0
    )
    clf = LogisticRegression(
        random_state=0,
        solver='liblinear',
    ).fit(X_train, y_train)
    joblib.dump(clf, 'lr.pkl')

    # ------
    # eval
    # ------
    print(clf.score(X_test, y_test))
    pred = clf.predict(X_test)
    print(accuracy_score(y_test, pred))
    print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))
    print(clf.predict_proba(X_test))
def main():
    word_list = cm.generate_wordlist(_CORPUS_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print('matrix')
    print(matrix)

    df = pd.read_csv('lr_train_20190425.csv')
    print(df.head())

    # # ---------------------------
    # # adjust the number of data
    # # ---------------------------
    # df_0 = df[df['label'] == 0]
    # df_1 = df[df['label'] == 1]
    # print('0')
    # print(len(df_0))
    # print('1')
    # print(len(df_1))
    # X_0 = df_0[:4000]
    # X_1 = df_1
    # df = pd.concat([X_0, X_1])
    # print(len(df))
    # # ---------------------------

    # -------
    # train
    # -------
    X_org_word = df['org'].values
    X_dst_word = df['dst'].values
    y = df['label'].values
    # print('X_org_word')
    # print(X_org_word)
    # print('X_dst_word')
    # print(X_dst_word)
    # print('y')
    # print(y)
    X_org_to_id = np.array([word_to_id[x] for x in X_org_word])
    X_dst_to_id = np.array([word_to_id[x] for x in X_dst_word])
    # print('X_org_to_id')
    # print(X_org_to_id)
    # print('X_dst_to_id')
    # print(X_dst_to_id)
    X_org_feature = np.array([matrix[x] for x in X_org_to_id])
    X_dst_feature = np.array([matrix[x] for x in X_dst_to_id])
    # print('X_org_feature')
    # print(X_org_feature)
    # print('X_dst_feature')
    # print(X_dst_feature)

    # one scalar feature per sample: dot product of the org and dst vectors
    X = np.array([np.dot(org, dst)
                  for org, dst in zip(X_org_feature, X_dst_feature)])
    X = X[:, np.newaxis]
    # print('X')
    # print(X)
    # print(len(X))

    scaler = MinMaxScaler()
    X_scaler = scaler.fit_transform(X)
    print('MinMaxScaler')
    print(X_scaler)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaler, y, test_size=0.2, random_state=0)
    clf = LogisticRegression(
        random_state=0,
        solver='liblinear',
    ).fit(X_train, y_train)
    joblib.dump(clf, 'lr.pkl')

    # ------
    # eval
    # ------
    print(clf.score(X_test, y_test))
    pred = clf.predict(X_test)
    print(accuracy_score(y_test, pred))
    print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))
    print(clf.predict_proba(X_test))
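# Only the classifier is persisted above; reusing it on new word pairs also
# needs the fitted MinMaxScaler and the word_to_id / matrix objects. A hedged
# inference sketch, assuming the scaler is dumped as well (e.g. as the
# hypothetical 'scaler.pkl'):
def predict_pair(org_word, dst_word, word_to_id, matrix):
    clf = joblib.load('lr.pkl')
    scaler = joblib.load('scaler.pkl')   # hypothetical file, not saved above
    # same single scalar feature as in training: dot product of the two rows
    score = np.dot(matrix[word_to_id[org_word]], matrix[word_to_id[dst_word]])
    X_new = scaler.transform(np.array([[score]]))
    return clf.predict(X_new), clf.predict_proba(X_new)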
def main():
    # -----------------------------
    # create corpus and co-matrix
    # -----------------------------
    word_list = cm.generate_wordlist(_RECIPE_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    # output corpus to txt
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print('matrix')
    print(matrix)

    # ---------------------
    # generate label data
    # ---------------------
    label = np.array([x for x in id_to_word.values()])
    label = label[:, np.newaxis]
    print('label')
    print(label)

    # ------------------------
    # generate category data
    # ------------------------
    category_label_data = generate_arc_category_data(_ANNOTATION_DIR)
    unique_category = category_label_data['arclabel'].unique()
    print(category_label_data.head())
    print(category_label_data.tail())
    print(unique_category)

    # ----------------------------
    # generate feature and label
    # ----------------------------
    category_label_data['feature_org_idx'] = category_label_data['new_word']\
        .apply(lambda x: word_to_id[x])
    category_label_data['feature_dst_idx'] = category_label_data['dependency_dst']\
        .apply(lambda x: word_to_id[x])
    category_label_data['feature_org'] = category_label_data['feature_org_idx']\
        .apply(lambda x: matrix[x])
    category_label_data['feature_dst'] = category_label_data['feature_dst_idx']\
        .apply(lambda x: matrix[x])
    print('category_label_data')
    print(category_label_data)

    extend_feature = extend_columns(
        category_label_data['feature_org'],
        category_label_data['feature_dst']
    )
    print('extend_feature')
    print(extend_feature)
    print(extend_feature.shape)

    X = extend_feature
    category_map = category_mapping(unique_category)
    print('category_map')
    print(category_map)
    category_label = category_label_data['arclabel'].values
    category_label = category_label.flatten()
    print('category_label')
    print(category_label)
    y = convert_category_to_numerical(category_label, category_map)
    print('y')
    print(y)

    # ----------
    # training
    # ----------
    print('dataset size')
    print('X: {0} , y: {1}'.format(X.shape, y.shape))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    clf = SVC(kernel='linear', C=1)
    t0 = time.time()
    clf.fit(X_train, y_train)
    joblib.dump(word_to_id, 'word_to_id.pkl')
    joblib.dump(matrix, 'matrix.pkl')
    joblib.dump(clf, 'svc.pkl')
    t1 = time.time()
    print('exec time : {}'.format(t1 - t0))

    # ------------
    # validation
    # ------------
    prediction_map = {k: v for v, k in category_map.items()}
    joblib.dump(prediction_map, 'prediction_map.pkl')
    print(clf.score(X_test, y_test))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    print(classification_report(
        y_test,
        clf.predict(X_test),
        # target_names=category_map.values()
    ))

    # # tamanegi test
    # print('**************** tamanegi-surioro ****************')
    # onion_id = word_to_id['玉ねぎ']
    # print('onion_id')
    # print(onion_id)
    # suri_id = word_to_id['すりおろ']
    # print('suri_id')
    # print(suri_id)
    # onion_feature = matrix[0]
    # suri_feature = matrix[2]
    # sample_feature = np.hstack((onion_feature, suri_feature)).flatten()
    # print('sample_feature')
    # print(sample_feature)
    # print(clf.predict([sample_feature]))
    # pred = clf.predict([sample_feature])
    # print(prediction_map[pred[0]])

    # model load
    load_model = joblib.load('svc.pkl')
    print('load_model')
    print(load_model)
def main():
    data_dir = _LOG_DIR
    pos_dir = _POS_DIR
    rne_dir = _RNE_DIR

    word_list = cm.generate_wordlist(data_dir)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)

    feature = fs.extract_feature('trigram', data_dir, pos_dir, rne_dir)
    ppmi_score = vec.ppmi(feature)
    print('ppmi')
    print(ppmi_score)

    # reduce the PPMI matrix to 2 dimensions for plotting and clustering
    vectors = compute_svd(ppmi_score, 2)
    print('vectors')
    print(vectors)
    print(vectors.shape)
    # vec.plot_vector(ppmi_score, word_to_id)

    sample_querys = ['油抜き']
    querys = '油抜き'
    vec.rank_similarity(ppmi_score, sample_querys, word_to_id, id_to_word)
    query_id = word_to_id[querys]
    query_vec = vectors[query_id]
    print('query_vec')
    print(query_vec)

    json_list = os.listdir(_AC_DIR)
    row = len(json_list)
    column = vectors.shape[1]
    print(row)
    print(column)

    # accumulate a weighted word-vector sum per recipe
    all_recipe_vector = np.zeros((row, column))
    for idx, j in enumerate(json_list):
        recipe_score = np.zeros(column)
        jsonfile = os.path.join(_AC_DIR, j)
        with open(jsonfile, 'r', encoding='utf-8') as r:
            jsondata = json.load(r)
        print('idx')
        print(idx)
        print(jsonfile)
        print('jsondata')
        print(jsondata)
        for k, v in jsondata.items():
            print('key')
            print(k)
            print('value')
            print(v)
            ############################################
            # TODO: joined words are not in word_to_id #
            ############################################
            try:
                query_id = word_to_id[k]
                query_vector = vectors[query_id]
                print('query_vector')
                print(query_vector)
            except KeyError:
                print('{} is not included in word_to_id'.format(k))
                time.sleep(3)
                continue
            print('recipe_score', recipe_score)
            print('v', v)
            recipe_score += query_vector * v
        all_recipe_vector[idx][0] = recipe_score[0]
        all_recipe_vector[idx][1] = recipe_score[1]
    print(all_recipe_vector)

    # cluster the recipe vectors and plot them coloured by cluster
    kmeans = KMeans(n_clusters=3, random_state=_RANDOM_SEED).fit(all_recipe_vector)
    print('label')
    print(kmeans.labels_)
    labels = kmeans.labels_
    for feature, label in zip(all_recipe_vector, labels):
        plt.scatter(feature[0], feature[1], c=_COLOR[label])
    plt.show()
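# compute_svd() is not shown in this snippet; it appears to reduce the PPMI
# matrix to n_components dimensions. A minimal sketch using numpy's SVD,
# assuming the word vectors are simply the leading left singular vectors:
def compute_svd_sketch(matrix, n_components):
    U, S, V = np.linalg.svd(matrix)
    # keep the first n_components columns as dense word vectors
    return U[:, :n_components]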
def main():
    # -------------
    # load dataset
    # -------------
    make_dirs = [_LIKELIHOOD_DIR, _TRAIN_DIR]
    for i in make_dirs:
        if not os.path.isdir(i):
            os.makedirs(i)

    category_data = _CATEGORY_DATA
    rne_map = convert_txt_to_dict(category_data)
    print('rne_map')
    print(rne_map)

    train_data_list = os.listdir(_LOG_DIR)
    print('train_data')
    dst_filepath = os.path.join(_LOG_DIR, 'all.csv')
    all_df = pd.DataFrame({})
    for f in train_data_list:
        # if f == 'all.csv':
        #     print('already exist all.csv')
        #     sys.exit(1)
        print(f)
        read_filepath = os.path.join(_LOG_DIR, f)
        preprocess_df = data_preprocessing(read_filepath, _COLUMNS)
        df_dependency_tag = convert_id_to_rne(preprocess_df)
        print(df_dependency_tag)
        df_concat = pd.concat([preprocess_df, df_dependency_tag], axis=1)
        print(df_concat.tail())
        target_list = [
            'new_tag', 'new_word', 'dependency_tag', 'dependency_dst'
        ]
        target_df = df_concat[target_list]
        all_df = pd.concat([all_df, target_df], axis=0)
    all_df.to_csv(dst_filepath, index=False)

    dst_file = os.path.join(_TRAIN_DIR, 'lr_train.csv')
    df = all_df
    print('all_df')
    print(all_df)
    print('df')
    print(df.head())
    del all_df

    # ----------------------------
    # create corpus and co-matrix
    # ----------------------------
    word_list = cm.generate_wordlist(_RECIPE_DIR)
    word_to_id, id_to_word = cm.generate_word_id_map(word_list)
    cm.id_to_word_to_txt(id_to_word)
    corpus = np.array([word_to_id[w] for w in word_list])
    vocab_size = len(id_to_word)
    matrix = cm.create_co_matrix(corpus, vocab_size, id_to_word)
    print(matrix)

    # -------------------------
    # label to one-hot-encode
    # -------------------------
    enc = OneHotEncoder()
    label_data = df['new_tag'].values
    label_reshape = label_data[:, np.newaxis]
    print('label_data')
    print(label_data)
    enc.fit(label_reshape)
    onehotlabel = enc.transform(label_reshape).toarray()
    print('onehotlabel')
    print(onehotlabel)

    # ------------------------------------
    # join feature and one-hot-encode
    # ------------------------------------
    category_label_data = df
    category_label_data['feature_org_idx'] = category_label_data['new_word']\
        .apply(lambda x: word_to_id[x])
    category_label_data['feature_org'] = category_label_data['feature_org_idx']\
        .apply(lambda x: matrix[x])
    feature_matrix = category_label_data['feature_org'].values
    train_feature_matrix = np.array([x.flatten() for x in feature_matrix])
    print('train_feature_matrix')
    print(train_feature_matrix)
    print(train_feature_matrix.shape)
    print(onehotlabel.shape)
    train_data = np.hstack((train_feature_matrix, onehotlabel))
    print(train_data)
    print(train_data.shape)
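# The one-hot block appended to train_data can be mapped back to the original
# 'new_tag' strings with the fitted encoder. A small usage sketch (the helper
# name is illustrative, not part of the original code):
def decode_onehot_labels(enc, onehotlabel):
    # enc.categories_ gives the column order of the one-hot block;
    # inverse_transform recovers the original tag strings.
    print(enc.categories_)
    return enc.inverse_transform(onehotlabel)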