def __init__(self):
    """Thai NER: load the pretrained CRF model, downloading it if absent.

    Sets ``self.data_path`` (local path of the 'thainer' model file) and
    ``self.crf`` (an sklearn-crfsuite CRF configured to read that file).
    """
    self.data_path = get_file('thainer')
    if self.data_path is None:  # not cached locally yet -> fetch once
        download('thainer')
        self.data_path = get_file('thainer')
    self.crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=500,
        all_possible_transitions=True,
        model_filename=self.data_path,
    )
def get_path(fname):
    """Return the local path of data file *fname*, downloading it if absent.

    :param fname: registered name of the data file
    :return: local filesystem path reported by ``get_file``
    """
    path = get_file(fname)
    if path is None:  # not cached locally yet -> fetch once
        download(fname)
        path = get_file(fname)
    return path
def download():
    """Ensure the 'thai2vec02' data file is cached locally and return its path.

    :return: local filesystem path reported by ``get_file``
    """
    path = get_file('thai2vec02')
    if path is None:  # not cached locally yet -> fetch once
        download_data('thai2vec02')
        path = get_file('thai2vec02')
    return path
def get_path(fname):
    """Return the local path of data file *fname*, downloading it if absent.

    :param fname: registered name of the data file
    :return: local filesystem path reported by ``get_file``
    """
    path = get_file(fname)
    if path is None:  # not cached locally yet -> fetch once
        download(fname)
        path = get_file(fname)
    return path
def download():
    """Ensure the 'thai2vec02' data file is cached locally and return its path.

    :return: local filesystem path reported by ``get_file``
    """
    path = get_file('thai2vec02')
    if path is None:  # not cached locally yet -> fetch once
        download_data('thai2vec02')
        path = get_file('thai2vec02')
    return path
def get_path(fname):
    """Return the local path for data file *fname*, fetching it on first use."""
    located = get_file(fname)
    if located:
        return located
    # Not available locally: download, then resolve the path again.
    download(fname)
    return get_file(fname)
def __init__(self):
    """Thai2Rom: Thai-to-Roman transliteration via a Keras seq2seq model.

    Loads the character-level dataset (downloading it on first use),
    builds the character vocabularies and the one-hot encoder input
    tensor, then restores the trained model and splits it into separate
    encoder and decoder models for step-wise inference.
    """
    self.batch_size = 64
    self.epochs = 100
    self.latent_dim = 256
    self.num_samples = 648241
    self.data_path = get_file('thai2rom-dataset')
    if self.data_path is None:  # not cached locally yet -> fetch once
        download('thai2rom-dataset')
        self.data_path = get_file('thai2rom-dataset')
    self.input_texts = []
    self.target_texts = []
    self.input_characters = set()
    self.target_characters = set()
    # Dataset is one "<thai>\t<roman>" pair per line.
    with open(self.data_path, 'r', encoding='utf-8-sig') as f:
        lines = f.read().split('\n')
    for line in lines[: min(self.num_samples, len(lines) - 1)]:
        input_text, target_text = line.split('\t')
        # Skip overly long pairs to bound sequence lengths.
        if len(input_text) < 30 and len(target_text) < 90:
            # '\t' marks start-of-sequence, '\n' end-of-sequence.
            target_text = '\t' + target_text + '\n'
            self.input_texts.append(input_text)
            self.target_texts.append(target_text)
            self.input_characters.update(input_text)
            self.target_characters.update(target_text)
    self.input_characters = sorted(self.input_characters)
    self.target_characters = sorted(self.target_characters)
    self.num_encoder_tokens = len(self.input_characters)
    self.num_decoder_tokens = len(self.target_characters)
    self.max_encoder_seq_length = max(len(t) for t in self.input_texts)
    self.max_decoder_seq_length = max(len(t) for t in self.target_texts)
    self.input_token_index = {
        char: i for i, char in enumerate(self.input_characters)}
    self.target_token_index = {
        char: i for i, char in enumerate(self.target_characters)}
    # One-hot encode every input text: (sample, timestep, character).
    self.encoder_input_data = np.zeros(
        (len(self.input_texts), self.max_encoder_seq_length,
         self.num_encoder_tokens), dtype='float32')
    for i, input_text in enumerate(self.input_texts):
        # BUG FIX: the original iterated `self.input_text` (the last
        # parsed line) here, so every row encoded the same text.
        for t, char in enumerate(input_text):
            self.encoder_input_data[i, t, self.input_token_index[char]] = 1.
    # Restore the trained model and construct the encoder and decoder.
    self.filemodel = get_file('thai2rom')
    if self.filemodel is None:  # not cached locally yet -> fetch once
        download('thai2rom')
        self.filemodel = get_file('thai2rom')
    self.model = load_model(self.filemodel)
    self.encoder_inputs = self.model.input[0]  # input_1
    self.encoder_outputs, self.state_h_enc, self.state_c_enc = \
        self.model.layers[2].output  # lstm_1
    self.encoder_states = [self.state_h_enc, self.state_c_enc]
    self.encoder_model = Model(self.encoder_inputs, self.encoder_states)
    self.decoder_inputs = self.model.input[1]  # input_2
    self.decoder_state_input_h = Input(shape=(self.latent_dim,),
                                       name='input_3')
    self.decoder_state_input_c = Input(shape=(self.latent_dim,),
                                       name='input_4')
    self.decoder_states_inputs = [self.decoder_state_input_h,
                                  self.decoder_state_input_c]
    self.decoder_lstm = self.model.layers[3]
    self.decoder_outputs, self.state_h_dec, self.state_c_dec = \
        self.decoder_lstm(self.decoder_inputs,
                          initial_state=self.decoder_states_inputs)
    self.decoder_states = [self.state_h_dec, self.state_c_dec]
    self.decoder_dense = self.model.layers[4]
    self.decoder_outputs = self.decoder_dense(self.decoder_outputs)
    self.decoder_model = Model(
        [self.decoder_inputs] + self.decoder_states_inputs,
        [self.decoder_outputs] + self.decoder_states)
    # Reverse lookups to decode token indices back to characters.
    self.reverse_input_char_index = {
        i: char for char, i in self.input_token_index.items()}
    self.reverse_target_char_index = {
        i: char for char, i in self.target_token_index.items()}
def __init__(self):
    """Thai2Rom: Thai-to-Roman transliteration via a Keras seq2seq model.

    Loads the character-level dataset (downloading it on first use),
    builds the character vocabularies and the one-hot encoder input
    tensor, then restores the trained model and splits it into separate
    encoder and decoder models for step-wise inference.
    """
    self.batch_size = 64
    self.epochs = 100
    self.latent_dim = 256
    self.num_samples = 648241
    self.data_path = get_file('thai2rom-dataset')
    if self.data_path is None:  # not cached locally yet -> fetch once
        download('thai2rom-dataset')
        self.data_path = get_file('thai2rom-dataset')
    self.input_texts = []
    self.target_texts = []
    self.input_characters = set()
    self.target_characters = set()
    # Dataset is one "<thai>\t<roman>" pair per line.
    with open(self.data_path, 'r', encoding='utf-8-sig') as f:
        lines = f.read().split('\n')
    for line in lines[:min(self.num_samples, len(lines) - 1)]:
        input_text, target_text = line.split('\t')
        # Skip overly long pairs to bound sequence lengths.
        if len(input_text) < 30 and len(target_text) < 90:
            # '\t' marks start-of-sequence, '\n' end-of-sequence.
            target_text = '\t' + target_text + '\n'
            self.input_texts.append(input_text)
            self.target_texts.append(target_text)
            self.input_characters.update(input_text)
            self.target_characters.update(target_text)
    self.input_characters = sorted(self.input_characters)
    self.target_characters = sorted(self.target_characters)
    self.num_encoder_tokens = len(self.input_characters)
    self.num_decoder_tokens = len(self.target_characters)
    self.max_encoder_seq_length = max(
        len(t) for t in self.input_texts)
    self.max_decoder_seq_length = max(
        len(t) for t in self.target_texts)
    self.input_token_index = {
        char: i for i, char in enumerate(self.input_characters)
    }
    self.target_token_index = {
        char: i for i, char in enumerate(self.target_characters)
    }
    # One-hot encode every input text: (sample, timestep, character).
    self.encoder_input_data = np.zeros(
        (len(self.input_texts), self.max_encoder_seq_length,
         self.num_encoder_tokens),
        dtype='float32')
    for i, input_text in enumerate(self.input_texts):
        # BUG FIX: the original iterated `self.input_text` (the last
        # parsed line) here, so every row encoded the same text.
        for t, char in enumerate(input_text):
            self.encoder_input_data[i, t, self.input_token_index[char]] = 1.
    # Restore the trained model and construct the encoder and decoder.
    self.filemodel = get_file('thai2rom')
    if self.filemodel is None:  # not cached locally yet -> fetch once
        download('thai2rom')
        self.filemodel = get_file('thai2rom')
    self.model = load_model(self.filemodel)
    self.encoder_inputs = self.model.input[0]  # input_1
    self.encoder_outputs, self.state_h_enc, self.state_c_enc = self.model.layers[
        2].output  # lstm_1
    self.encoder_states = [self.state_h_enc, self.state_c_enc]
    self.encoder_model = Model(self.encoder_inputs, self.encoder_states)
    self.decoder_inputs = self.model.input[1]  # input_2
    self.decoder_state_input_h = Input(shape=(self.latent_dim, ),
                                       name='input_3')
    self.decoder_state_input_c = Input(shape=(self.latent_dim, ),
                                       name='input_4')
    self.decoder_states_inputs = [
        self.decoder_state_input_h, self.decoder_state_input_c
    ]
    self.decoder_lstm = self.model.layers[3]
    self.decoder_outputs, self.state_h_dec, self.state_c_dec = self.decoder_lstm(
        self.decoder_inputs, initial_state=self.decoder_states_inputs)
    self.decoder_states = [self.state_h_dec, self.state_c_dec]
    self.decoder_dense = self.model.layers[4]
    self.decoder_outputs = self.decoder_dense(self.decoder_outputs)
    self.decoder_model = Model(
        [self.decoder_inputs] + self.decoder_states_inputs,
        [self.decoder_outputs] + self.decoder_states)
    # Reverse lookups to decode token indices back to characters.
    self.reverse_input_char_index = {
        i: char for char, i in self.input_token_index.items()
    }
    self.reverse_target_char_index = {
        i: char for char, i in self.target_token_index.items()
    }