Code example #1
0
 def __init__(self):
     """
     Thai NER: load the pretrained CRF model, downloading it on first use.
     """
     # Locate the model file locally; fetch it if it is not cached yet.
     self.data_path = get_file('thainer')
     if self.data_path is None:  # fixed: was `== None` (PEP 8: compare to None with `is`)
         download('thainer')
         self.data_path = get_file('thainer')
     # CRF tagger backed by the pretrained model file.
     self.crf = sklearn_crfsuite.CRF(
         algorithm='lbfgs',
         c1=0.1,   # L1 regularisation coefficient
         c2=0.1,   # L2 regularisation coefficient
         max_iterations=500,
         all_possible_transitions=True,
         model_filename=self.data_path)
Code example #2
0
def get_path(fname):
    """Return the local path of dataset/model *fname*.

    Downloads the file first if it is not already present locally;
    returns whatever ``get_file`` yields after that (may still be None
    if the download failed).
    """
    path = get_file(fname)
    if path is None:  # fixed: was `== None`
        download(fname)
        path = get_file(fname)
    return path  # fixed: dropped redundant parentheses around the value
Code example #3
0
File: thai2vec-checkpoint.py  Project: zkan/pythainlp
def download():
    """Return the local path of the 'thai2vec02' model.

    Fetches the file with ``download_data`` on first use, then looks it
    up again; may return None if the download failed.
    """
    path = get_file('thai2vec02')
    if path is None:  # fixed: was `== None`
        download_data('thai2vec02')
        path = get_file('thai2vec02')
    return path
Code example #4
0
File: utils.py  Project: zkan/pythainlp
def get_path(fname):
    """Return the local path of dataset/model *fname*.

    Downloads the file first if it is missing locally.
    """
    path = get_file(fname)
    if path is None:  # fixed: was `== None`; also tabs -> 4-space indent
        download(fname)
        path = get_file(fname)
    return path  # fixed: dropped redundant parentheses
Code example #5
0
File: thai2vec.py  Project: zkan/pythainlp
def download():
    """Return the local path of the 'thai2vec02' model, fetching it
    with ``download_data`` on first use."""
    path = get_file('thai2vec02')
    if path is None:  # fixed: was `== None`; also tabs -> 4-space indent
        download_data('thai2vec02')
        path = get_file('thai2vec02')
    return path
Code example #6
0
def get_path(fname):
    """Locate *fname* locally, triggering a download when it is missing."""
    located = get_file(fname)
    if located:
        return located
    # Not cached yet: fetch it and look it up again.
    download(fname)
    return get_file(fname)
Code example #7
0
File: thai2rom.py  Project: zkan/pythainlp
    def __init__(self):
        """
        Thai2Rom: Thai-to-Roman transliteration via a Keras seq2seq model.

        Loads the character-level dataset (to rebuild the token indices)
        and the pretrained model file, downloading either on first use,
        then splits the trained model into standalone encoder and decoder
        models for inference.
        """
        self.batch_size = 64
        self.epochs = 100
        self.latent_dim = 256      # LSTM hidden-state size
        self.num_samples = 648241  # max dataset rows to read

        # Fetch the dataset, downloading it on first use.
        self.data_path = get_file('thai2rom-dataset')
        if self.data_path is None:  # fixed: was `== None`
            download('thai2rom-dataset')
            self.data_path = get_file('thai2rom-dataset')

        self.input_texts = []
        self.target_texts = []
        self.input_characters = set()
        self.target_characters = set()
        # NOTE: loop variables are plain locals here; the original bound
        # them as throwaway instance attributes (self.f, self.line, ...).
        with open(self.data_path, 'r', encoding='utf-8-sig') as fh:
            lines = fh.read().split('\n')
        for line in lines[: min(self.num_samples, len(lines) - 1)]:
            input_text, target_text = line.split('\t')
            if len(input_text) < 30 and len(target_text) < 90:
                # '\t' marks sequence start, '\n' marks sequence end.
                target_text = '\t' + target_text + '\n'
                self.input_texts.append(input_text)
                self.target_texts.append(target_text)
                self.input_characters.update(input_text)
                self.target_characters.update(target_text)
        self.input_characters = sorted(self.input_characters)
        self.target_characters = sorted(self.target_characters)
        self.num_encoder_tokens = len(self.input_characters)
        self.num_decoder_tokens = len(self.target_characters)
        self.max_encoder_seq_length = max(len(txt) for txt in self.input_texts)
        self.max_decoder_seq_length = max(len(txt) for txt in self.target_texts)
        self.input_token_index = {char: i for i, char in enumerate(self.input_characters)}
        self.target_token_index = {char: i for i, char in enumerate(self.target_characters)}

        # One-hot encode every input text.
        self.encoder_input_data = np.zeros(
            (len(self.input_texts), self.max_encoder_seq_length, self.num_encoder_tokens),
            dtype='float32')
        for i, input_text in enumerate(self.input_texts):
            # fixed: the original iterated `self.input_text` — the *last*
            # line parsed above — for every sample, instead of the current
            # `input_text`, so all rows encoded the same string.
            for t, char in enumerate(input_text):
                self.encoder_input_data[i, t, self.input_token_index[char]] = 1.

        # Restore the trained model, downloading it on first use.
        self.filemodel = get_file('thai2rom')
        if self.filemodel is None:  # fixed: was `== None`
            download('thai2rom')
            self.filemodel = get_file('thai2rom')
        self.model = load_model(self.filemodel)

        # Rebuild a standalone encoder: inputs -> final LSTM states.
        self.encoder_inputs = self.model.input[0]   # input_1
        self.encoder_outputs, self.state_h_enc, self.state_c_enc = self.model.layers[2].output   # lstm_1
        self.encoder_states = [self.state_h_enc, self.state_c_enc]
        self.encoder_model = Model(self.encoder_inputs, self.encoder_states)

        # Rebuild a standalone decoder fed by the encoder states.
        self.decoder_inputs = self.model.input[1]   # input_2
        self.decoder_state_input_h = Input(shape=(self.latent_dim,), name='input_3')
        self.decoder_state_input_c = Input(shape=(self.latent_dim,), name='input_4')
        self.decoder_states_inputs = [self.decoder_state_input_h, self.decoder_state_input_c]
        self.decoder_lstm = self.model.layers[3]
        self.decoder_outputs, self.state_h_dec, self.state_c_dec = self.decoder_lstm(
            self.decoder_inputs, initial_state=self.decoder_states_inputs)
        self.decoder_states = [self.state_h_dec, self.state_c_dec]
        self.decoder_dense = self.model.layers[4]
        self.decoder_outputs = self.decoder_dense(self.decoder_outputs)
        self.decoder_model = Model(
            [self.decoder_inputs] + self.decoder_states_inputs,
            [self.decoder_outputs] + self.decoder_states)

        # Lookup tables for mapping one-hot indices back to characters.
        self.reverse_input_char_index = dict((i, char) for char, i in self.input_token_index.items())
        self.reverse_target_char_index = dict((i, char) for char, i in self.target_token_index.items())
Code example #8
0
    def __init__(self):
        """
        Thai2Rom: Thai-to-Roman transliteration via a Keras seq2seq model.

        Loads the character-level dataset (to rebuild the token indices)
        and the pretrained model file, downloading either on first use,
        then splits the trained model into standalone encoder and decoder
        models for inference.
        """
        self.batch_size = 64
        self.epochs = 100
        self.latent_dim = 256      # LSTM hidden-state size
        self.num_samples = 648241  # max dataset rows to read

        # Fetch the dataset, downloading it on first use.
        self.data_path = get_file('thai2rom-dataset')
        if self.data_path is None:  # fixed: was `== None`
            download('thai2rom-dataset')
            self.data_path = get_file('thai2rom-dataset')

        self.input_texts = []
        self.target_texts = []
        self.input_characters = set()
        self.target_characters = set()
        # NOTE: loop variables are plain locals here; the original bound
        # them as throwaway instance attributes (self.f, self.line, ...).
        with open(self.data_path, 'r', encoding='utf-8-sig') as fh:
            lines = fh.read().split('\n')
        for line in lines[:min(self.num_samples, len(lines) - 1)]:
            input_text, target_text = line.split('\t')
            if len(input_text) < 30 and len(target_text) < 90:
                # '\t' marks sequence start, '\n' marks sequence end.
                target_text = '\t' + target_text + '\n'
                self.input_texts.append(input_text)
                self.target_texts.append(target_text)
                self.input_characters.update(input_text)
                self.target_characters.update(target_text)
        self.input_characters = sorted(self.input_characters)
        self.target_characters = sorted(self.target_characters)
        self.num_encoder_tokens = len(self.input_characters)
        self.num_decoder_tokens = len(self.target_characters)
        self.max_encoder_seq_length = max(
            len(txt) for txt in self.input_texts)
        self.max_decoder_seq_length = max(
            len(txt) for txt in self.target_texts)
        self.input_token_index = {
            char: i for i, char in enumerate(self.input_characters)
        }
        self.target_token_index = {
            char: i for i, char in enumerate(self.target_characters)
        }

        # One-hot encode every input text.
        self.encoder_input_data = np.zeros(
            (len(self.input_texts), self.max_encoder_seq_length,
             self.num_encoder_tokens),
            dtype='float32')
        for i, input_text in enumerate(self.input_texts):
            # fixed: the original iterated `self.input_text` — the *last*
            # line parsed above — for every sample, instead of the current
            # `input_text`, so all rows encoded the same string.
            for t, char in enumerate(input_text):
                self.encoder_input_data[i, t,
                                        self.input_token_index[char]] = 1.

        # Restore the trained model, downloading it on first use.
        self.filemodel = get_file('thai2rom')
        if self.filemodel is None:  # fixed: was `== None`
            download('thai2rom')
            self.filemodel = get_file('thai2rom')
        self.model = load_model(self.filemodel)

        # Rebuild a standalone encoder: inputs -> final LSTM states.
        self.encoder_inputs = self.model.input[0]  # input_1
        self.encoder_outputs, self.state_h_enc, self.state_c_enc = self.model.layers[
            2].output  # lstm_1
        self.encoder_states = [self.state_h_enc, self.state_c_enc]
        self.encoder_model = Model(self.encoder_inputs, self.encoder_states)

        # Rebuild a standalone decoder fed by the encoder states.
        self.decoder_inputs = self.model.input[1]  # input_2
        self.decoder_state_input_h = Input(shape=(self.latent_dim, ),
                                           name='input_3')
        self.decoder_state_input_c = Input(shape=(self.latent_dim, ),
                                           name='input_4')
        self.decoder_states_inputs = [
            self.decoder_state_input_h, self.decoder_state_input_c
        ]
        self.decoder_lstm = self.model.layers[3]
        self.decoder_outputs, self.state_h_dec, self.state_c_dec = self.decoder_lstm(
            self.decoder_inputs, initial_state=self.decoder_states_inputs)
        self.decoder_states = [self.state_h_dec, self.state_c_dec]
        self.decoder_dense = self.model.layers[4]
        self.decoder_outputs = self.decoder_dense(self.decoder_outputs)
        self.decoder_model = Model(
            [self.decoder_inputs] + self.decoder_states_inputs,
            [self.decoder_outputs] + self.decoder_states)

        # Lookup tables for mapping one-hot indices back to characters.
        self.reverse_input_char_index = dict(
            (i, char) for char, i in self.input_token_index.items())
        self.reverse_target_char_index = dict(
            (i, char) for char, i in self.target_token_index.items())