# coding: utf-8
# Demo script: preprocess a one-sentence corpus, build its co-occurrence
# matrix with create_co_matrix, and query most_similar for the word 'you'.
import sys
sys.path.append('..')  # extend the import path before the `common` import below
from common.util import preprocess, create_co_matrix, most_similar

# Toy corpus; the last word must be spelled "hello" so that the vocabulary
# holds the intended token rather than a misspelled one.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

# Ask for the 5 best matches (top=5) for the query word 'you'.
most_similar('you', word_to_id, id_to_word, C, top=5)
# Training script: fit SimpleCBOW on a one-sentence corpus with Adam via the
# project's Trainer, then call trainer.plot().
import sys

from common.trainer import Trainer
from common.optimizer import Adam
from common.util import convert_one_hot, create_contexts_target, preprocess
from my_ch03.simple_cbow import SimpleCBOW

# Hyperparameters for the run.
WINDOW_SIZE = 1
HIDDEN_SIZE = 3
BATCH_SIZE = 3
MAX_EPOCH = 1000

sentence = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(sentence)
vocab_size = len(word_to_id)

# Derive the (contexts, target) pair from the corpus, then pass each through
# convert_one_hot using the vocabulary size.
contexts, target = create_contexts_target(corpus, WINDOW_SIZE)
contexts = convert_one_hot(contexts, vocab_size)
target = convert_one_hot(target, vocab_size)

model = SimpleCBOW(vocab_size, HIDDEN_SIZE)
optimizer = Adam()
trainer = Trainer(model, optimizer)

trainer.fit(contexts, target, MAX_EPOCH, BATCH_SIZE)
trainer.plot()
# Demo script: compare a hand-written co-occurrence matrix with the one built
# by the project helper, then print the cosine similarity of 'you' and 'i'.
import sys
sys.path.append('..')  # extend the import path before the project imports below
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): sibling scripts import `create_co_matrix` from common.util;
# confirm that `create_to_matrix` is the intended name here.
from common.util import (
    preprocess,
    create_to_matrix,
    cos_similarity,
    most_similar,
    ppmi,
)
from dataset import ptb

text = 'You say goodbye and I say hello.'
corpus, wordtoid, idtoword = preprocess(text)

# Hand-made matrix (one row and one column per vocabulary entry).
C = np.array(
    [
        [0, 1, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 1, 0],
        [0, 1, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 1, 0, 0],
        [0, 1, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 1, 0],
    ],
    dtype=np.int32,
)

# Show two rows picked by index and one picked through the word lookup.
print(C[0])
print(C[4])
print(C[wordtoid['goodbye']])

# Replace the hand-made matrix with the one produced by the helper.
vocab_size = len(wordtoid)
C = create_to_matrix(corpus, vocab_size, window_size=1)

# Similarity between the rows for 'you' and 'i'.
vec_you = C[wordtoid['you']]
vec_i = C[wordtoid['i']]
print(cos_similarity(vec_you, vec_i))
# Walkthrough script: prepare one-hot contexts/targets for a one-sentence
# corpus, print the contexts array, and stop.
import sys
sys.path.append('..')  # extend the import path before the project imports below
from common.trainer import Trainer
from common.optimizer import Adam
from simple_cbow import SimpleCBOW
from simple_skip_gram import SimpleSkipGram
from common.util import preprocess, create_contexts_target, convert_one_hot

window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000

text = "You say goodbye and I say hello."
# corpus: the label of each word in the sentence ('.' included)
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)  # number of distinct words in the sentence: 7

# contexts: the neighbouring words within window_size; target: the word labels
contexts, target = create_contexts_target(corpus, window_size)
target = convert_one_hot(target, vocab_size)      # (6, 7): target one-hot encoded to vocab_size
contexts = convert_one_hot(contexts, vocab_size)  # (6, 2, 7)

print(contexts)
# Deliberate early stop after inspecting `contexts`; exit status 1 is kept.
# sys.exit is used instead of the bare `exit` helper, which is injected by the
# `site` module for interactive use and is not guaranteed to exist.
# NOTE(review): everything below this line is unreachable while the exit stays.
sys.exit(1)

# model = SimpleSkipGram(vocab_size, hidden_size)
model = SimpleCBOW(vocab_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)
# Demo script: build the PPMI matrix of a one-sentence corpus, decompose it
# with np.linalg.svd, and print every factor with its shape.
import sys
import os
sys.path.append(os.pardir)  # extend the import path before the project imports below
from common.np import np
from common.util import preprocess, create_co_matrix, ppmi

text = 'you say goodbye and I say hello.'
corpus, w2id, id2w = preprocess(text)
vocab_size = len(w2id)

C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

# NOTE(review): if `np` is NumPy, linalg.svd returns the third factor already
# transposed (Vh) - confirm before using V as a matrix of column vectors.
U, S, V = np.linalg.svd(W)

# W is printed as-is; the SVD factors are rounded to 3 decimals for display.
print('W=>' + str(W.shape))
print(W)
for label, factor in (('U=>', U), ('S=>', S), ('V=>', V)):
    print(label + str(factor.shape))
    print(np.round(factor, 3))
def testPreprocess(self):
    """Check util.preprocess on test_files/main.c against a stored expected text.

    The test fails with the message "preprocess failed" when the value
    returned by util.preprocess differs from expected_result in any character.
    """
    # The literal is a raw string: every `\\` below is two literal backslash
    # characters and the `\n` inside the printf call is a backslash followed
    # by `n`, not a newline. Whitespace and line breaks inside the literal are
    # part of the expected value.
    # NOTE(review): the expected text embeds an absolute path
    # (D:\\Machine Learning\\SyntaxErrorRecoveryFramework\\...), so this
    # comparison presumably only holds on a machine where the fake libc
    # headers live at that location - confirm.
    expected_result = r"""# 1 "test_files\\main.c" # 1 "<built-in>" # 1 "<command-line>" # 1 "test_files\\main.c" # 1 "D:\\Machine Learning\\SyntaxErrorRecoveryFramework\\pycparser\\utils\\fake_libc_include/stdio.h" 1 # 1 "D:\\Machine Learning\\SyntaxErrorRecoveryFramework\\pycparser\\utils\\fake_libc_include/_fake_defines.h" 1 # 41 "D:\\Machine Learning\\SyntaxErrorRecoveryFramework\\pycparser\\utils\\fake_libc_include/_fake_defines.h" typedef int va_list; # 2 "D:\\Machine Learning\\SyntaxErrorRecoveryFramework\\pycparser\\utils\\fake_libc_include/stdio.h" 2 # 1 "D:\\Machine Learning\\SyntaxErrorRecoveryFramework\\pycparser\\utils\\fake_libc_include/_fake_typedefs.h" 1 typedef int size_t; typedef int __builtin_va_list; typedef int __gnuc_va_list; typedef int __int8_t; typedef int __uint8_t; typedef int __int16_t; typedef int __uint16_t; typedef int __int_least16_t; typedef int __uint_least16_t; typedef int __int32_t; typedef int __uint32_t; typedef int __int64_t; typedef int __uint64_t; typedef int __int_least32_t; typedef int __uint_least32_t; typedef int __s8; typedef int __u8; typedef int __s16; typedef int __u16; typedef int __s32; typedef int __u32; typedef int __s64; typedef int __u64; typedef int _LOCK_T; typedef int _LOCK_RECURSIVE_T; typedef int _off_t; typedef int __dev_t; typedef int __uid_t; typedef int __gid_t; typedef int _off64_t; typedef int _fpos_t; typedef int _ssize_t; typedef int wint_t; typedef int _mbstate_t; typedef int _flock_t; typedef int _iconv_t; typedef int __ULong; typedef int __FILE; typedef int ptrdiff_t; typedef int wchar_t; typedef int __off_t; typedef int __pid_t; typedef int __loff_t; typedef int u_char; typedef int u_short; typedef int u_int; typedef int u_long; typedef int ushort; typedef int uint; typedef int clock_t; typedef int time_t; typedef int daddr_t; typedef int caddr_t; typedef int ino_t; typedef int off_t; typedef int dev_t; typedef int uid_t; typedef int gid_t; typedef int pid_t; typedef int 
key_t; typedef int ssize_t; typedef int mode_t; typedef int nlink_t; typedef int fd_mask; typedef int _types_fd_set; typedef int clockid_t; typedef int timer_t; typedef int useconds_t; typedef int suseconds_t; typedef int FILE; typedef int fpos_t; typedef int cookie_read_function_t; typedef int cookie_write_function_t; typedef int cookie_seek_function_t; typedef int cookie_close_function_t; typedef int cookie_io_functions_t; typedef int div_t; typedef int ldiv_t; typedef int lldiv_t; typedef int sigset_t; typedef int __sigset_t; typedef int _sig_func_ptr; typedef int sig_atomic_t; typedef int __tzrule_type; typedef int __tzinfo_type; typedef int mbstate_t; typedef int sem_t; typedef int pthread_t; typedef int pthread_attr_t; typedef int pthread_mutex_t; typedef int pthread_mutexattr_t; typedef int pthread_cond_t; typedef int pthread_condattr_t; typedef int pthread_key_t; typedef int pthread_once_t; typedef int pthread_rwlock_t; typedef int pthread_rwlockattr_t; typedef int pthread_spinlock_t; typedef int pthread_barrier_t; typedef int pthread_barrierattr_t; typedef int jmp_buf; typedef int rlim_t; typedef int sa_family_t; typedef int sigjmp_buf; typedef int stack_t; typedef int siginfo_t; typedef int z_stream; typedef int int8_t; typedef int uint8_t; typedef int int16_t; typedef int uint16_t; typedef int int32_t; typedef int uint32_t; typedef int int64_t; typedef int uint64_t; typedef int int_least8_t; typedef int uint_least8_t; typedef int int_least16_t; typedef int uint_least16_t; typedef int int_least32_t; typedef int uint_least32_t; typedef int int_least64_t; typedef int uint_least64_t; typedef int int_fast8_t; typedef int uint_fast8_t; typedef int int_fast16_t; typedef int uint_fast16_t; typedef int int_fast32_t; typedef int uint_fast32_t; typedef int int_fast64_t; typedef int uint_fast64_t; typedef int intptr_t; typedef int uintptr_t; typedef int intmax_t; typedef int uintmax_t; typedef _Bool bool; typedef int va_list; typedef void* MirEGLNativeWindowType; 
typedef void* MirEGLNativeDisplayType; typedef struct MirConnection MirConnection; typedef struct MirSurface MirSurface; typedef struct MirSurfaceSpec MirSurfaceSpec; typedef struct MirScreencast MirScreencast; typedef struct MirPromptSession MirPromptSession; typedef struct MirBufferStream MirBufferStream; typedef struct MirPersistentId MirPersistentId; typedef struct MirBlob MirBlob; typedef struct MirDisplayConfig MirDisplayConfig; typedef struct xcb_connection_t xcb_connection_t; typedef uint32_t xcb_window_t; typedef uint32_t xcb_visualid_t; # 3 "D:\\Machine Learning\\SyntaxErrorRecoveryFramework\\pycparser\\utils\\fake_libc_include/stdio.h" 2 # 2 "test_files\\main.c" 2 int main() { printf("a test\n"); return 0; } """
    # Exact string comparison; the input path is built with os.path.join so
    # the separator follows the host platform.
    self.assertEqual( util.preprocess(os.path.join('test_files', 'main.c'), ), expected_result, "preprocess failed")