Example #1
def load_model_to_app():
    # app.predictor = load_model('./static/POS_BiLSTM_CRF_WSJ_new.h5')
    with open('static/POS/tokenizer.json') as f1:
        data1 = json.load(f1)
        tokenizer = tokenizer_from_json(data1)
    app.tokenizer = tokenizer
    with open('static/POS/tag_tokenizer.json') as f2:
        data2 = json.load(f2)
        tag_tokenizer = tokenizer_from_json(data2)
    app.tag_tokenizer = tag_tokenizer

    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1
    tag_index = tag_tokenizer.word_index
    app.index_tag = {i: t for t, i in tag_index.items()}
    tag_size = len(tag_index) + 1

    model = create_model(vocab_size, max_length, embedding_dim, word_index,
                         tag_index)
    model.load_weights('static/POS/POS_BiLSTM_CRF_WSJ_new.h5')
    app.pos_tagger = model

    # sentiment analysis model
    with open('static/SA/tokenizer.json') as f3:
        data3 = json.load(f3)
        tokenizer3 = tokenizer_from_json(data3)
    app.sa_tokenizer = tokenizer3
    model3 = load_model('static/SA/model.h5')
    app.sa_model = model3
Example #2
def predict_sentiment(tweets):
    #load tokenizer
    with open(config.DATA / 'sentiment' / 'tokenizer_200k.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)

    #compile the keras model
    embedding_dim = 100
    max_words = 200000
    max_length = 50
    lstm_model4 = Sequential()
    lstm_model4.add(
        Embedding(max_words, embedding_dim, input_length=max_length))
    lstm_model4.add(LSTM(64, return_sequences=True))
    lstm_model4.add(LSTM(32))
    lstm_model4.add(Dense(32, activation='relu'))
    #output layer
    lstm_model4.add(Dense(1, activation='sigmoid'))
    lstm_model4.compile(optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['acc'])
    lstm_model4.load_weights(config.MODELS / 'sentiment' /
                             'LSTM_model5_nostop.h5')
    #pre-process tweets to remove mentions and hashtags
    political_tweets_proc = list(map(preprocess_tweet, tweets))
    #transform the tweets to sequences of numbers
    pol_seqs = tokenizer.texts_to_sequences(political_tweets_proc)
    #pad with zeros
    pol_seqs_padded = pad_sequences(pol_seqs, maxlen=max_length)
    return lstm_model4.predict(pol_seqs_padded)
Example #3
    def load(self, cache):
        """Load trained model."""

        self.model = load_model(self.model_cache)
        with open(self.tokenizer_cache) as f:
            self.tokenizer = tokenizer_from_json(json.load(f))
        super().load(cache)
Example #4
def Vectorize2(news_list, json_string, max_words=100):
    
    from keras.preprocessing.text import tokenizer_from_json
    from keras.preprocessing.sequence import pad_sequences
    
    maxlen = 100 
    training_samples = 200
    validation_samples = 10000

    
    # rebuild the tokenizer from its JSON configuration
    tokenizer = tokenizer_from_json(json_string)
    
    # convert the strings into lists of token indices
    sequences = tokenizer.texts_to_sequences(news_list)
    # print(len(sequences[0]))
    # dictionary mapping tokens to their indices
    word_index = tokenizer.word_index
    # print(len(word_index.keys()))
    # print('Found unique tokens')

    # this converts the list of sequences into a 2D matrix
    data = pad_sequences(sequences, maxlen=1494)
    
    # print('Shape of data tensor:', data.shape)
    # print('Shape of label:',labels.shape)

    # random permutation of the features
    # indices = np.arange(data.shape[0])
    # np.random.shuffle(indices)
    # data = data[indices]
    # labels = labels[indices]
    # print(labels)
    return data
Example #5
    def test_tokenizer_serde_fitting(self):
        sample_texts = [
            "There was a time that the pieces fit, but I watched them fall away",
            "Mildewed and smoldering, strangled by our coveting",
            "I've done the math enough to know the dangers of our second guessing",
        ]
        tokenizer = text.Tokenizer(num_words=100)
        tokenizer.fit_on_texts(sample_texts)

        seq_generator = tokenizer.texts_to_sequences_generator(sample_texts)
        sequences = [seq for seq in seq_generator]
        tokenizer.fit_on_sequences(sequences)

        tokenizer_json = tokenizer.to_json()
        recovered = text.tokenizer_from_json(tokenizer_json)

        self.assertEqual(tokenizer.char_level, recovered.char_level)
        self.assertEqual(tokenizer.document_count, recovered.document_count)
        self.assertEqual(tokenizer.filters, recovered.filters)
        self.assertEqual(tokenizer.lower, recovered.lower)
        self.assertEqual(tokenizer.num_words, recovered.num_words)
        self.assertEqual(tokenizer.oov_token, recovered.oov_token)

        self.assertEqual(tokenizer.word_docs, recovered.word_docs)
        self.assertEqual(tokenizer.word_counts, recovered.word_counts)
        self.assertEqual(tokenizer.word_index, recovered.word_index)
        self.assertEqual(tokenizer.index_word, recovered.index_word)
        self.assertEqual(tokenizer.index_docs, recovered.index_docs)
Example #6
def get_sequence_of_tokens(corpus, refresh=True):
    """
    :param corpus:
    :param refresh:
    :return:
    """
    # tokenization
    if refresh:
        tokenizer = Tokenizer()
        # fit the tokenizer on the text
        tokenizer.fit_on_texts(corpus)
    else:
        with open("tokenizer.json", 'r') as tj:
            tokenizer = tokenizer_from_json(json.load(tj))

    tokenizer_json = tokenizer.to_json()

    with open('tokenizer.json', 'w') as fobj:
        json.dump(tokenizer_json, fobj)

    index_dict = tokenizer.word_index
    seq = tokenizer.texts_to_sequences(corpus)
    # calculate the vocab size
    total_words = len(tokenizer.word_index) + 1
    print(total_words)
    return total_words, seq
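Side note on the save/load convention used in this example: tokenizer.to_json() returns a JSON string, so json.dump serializes that string as a JSON value and json.load returns it unchanged, which is exactly what tokenizer_from_json expects. A minimal round-trip sketch of that convention (file name and corpus are illustrative, not from the original code):

# minimal round-trip sketch of the json.dump / json.load convention above
import json
from keras.preprocessing.text import Tokenizer, tokenizer_from_json

tok = Tokenizer()
tok.fit_on_texts(["a small example corpus"])

with open("tokenizer.json", "w") as fobj:
    json.dump(tok.to_json(), fobj)          # to_json() is a string; json.dump writes it as a JSON value

with open("tokenizer.json") as fobj:
    restored = tokenizer_from_json(json.load(fobj))  # json.load returns the original string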
    def open_txt_tokeznizer(path):
        # open the tokenizer
        with open(str(path) + 'tokenizer.json') as f:
            data = json.load(f)
            tokenizer = tokenizer_from_json(data)

        return tokenizer
def tokenize(df):
    with open('tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)

    text_sequences = tokenizer.texts_to_sequences(df)
    text_sequences = pad_sequences(text_sequences, 200)
    return text_sequences
def load_pretrained():
    model = load_model('./model/model.h5')
    with open('./model/tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    print('loaded')
    sys.stdout.flush()
    return model, tokenizer
Example #10
def preprocess(input):
    input = [input]
    with open('tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    tokenized = tokenizer.texts_to_sequences(input)
    padded = sequence.pad_sequences(tokenized, maxlen=MAX_SEQUENCE_LENGTH)
    return padded
Example #11
    def train(self, df, verbose=False, cache=None):
        """Train the neural network."""

        labels = df['label'].to_numpy()
        self.model = load_model(self.model_cache)
        with open(self.tokenizer_cache) as f:
            self.tokenizer = tokenizer_from_json(json.load(f))
        super().train(df, labels, verbose, cache)
def loadSentenceTokenizer(filepath):
    '''
    Load the sentences tokenizer
    '''
    with open(filepath) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    return tokenizer
def loadLabelTokenizer(filepath):
    '''
    Load the label tokenizer
    '''
    with open(filepath) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    return tokenizer
def vqa():
    read = request.get_json()
    if type(read) == str:
        read = json.loads(read)
    img = read['image']

    img_name = 'image' + str(random.randint(1,1001)) + '.jpg'
    with open(img_name, "wb") as fh:
        fh.write(base64.b64decode(img))

    with open('vqa/tokenizer2.json') as f:
        data = json.load(f)
        vqa_ques_tokenizer = tokenizer_from_json(data)

    vqa_model = tf.keras.models.load_model('vqa/vqa_model.h5')
    vqa_image_model = tf.keras.applications.xception.Xception(weights='imagenet', include_top=False)

    topAnsIndexWord = pickle.load(open('vqa/topAnsIndexWord2.pkl', 'rb'))

    img = image.load_img(img_name, target_size=(299, 299))
    os.remove(img_name)

    # Obtaining features from image using Xception model like the one used in training
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = tf.keras.applications.xception.preprocess_input(x)

    features = vqa_image_model.predict(x)
    X1 = features.reshape((1, 10*10, -1))

    # Cleaning the input question the same way used with the model
    ques = read['question']
    ques = clean_str(ques)
    X2 = vqa_ques_tokenizer.texts_to_sequences([ques])
    X2 = tf.keras.preprocessing.sequence.pad_sequences(X2, padding='post', truncating='post', maxlen=15)

    data = {}
    data['question'] = read['question']

    # Obtaining model prediction then converting it with the index-to-word mapper that was built with the model
    pred = vqa_model.predict([X1, X2])
    pred2 = pred[0].argsort()[-5:][::-1]
    data['answers'] = {}
    txt = ""
    for i in pred2:
        if pred[0][i] > 0.01:
            # Arabic: "بنسبة" means "with probability"
            txt += topAnsIndexWord[i] + " بنسبة " + str(pred[0][i])[:4] + ". \n"
            data['answers'][topAnsIndexWord[i]] = str(pred[0][i])
            break
        else:
            # Arabic: "Sorry, I cannot answer this question"
            txt = "عفوا، لا يمكنني الإجابة على هذا السؤال"
    data['text'] = txt
    data['sound'] = read_text(txt)
    return jsonify(data)
    def __init__(self, seq_len):
        with open('./pre-trained/tokenizer.json', 'r', encoding='utf-8') as f1:
            tokenizer_config = json.load(f1)
        with open('./pre-trained/label_tokenizer_json.json',
                  'r',
                  encoding='utf-8') as f2:
            label_tokenizer_config = json.load(f2)
        self.tokenizer = tokenizer_from_json(tokenizer_config)
        self.label_tokenizer = tokenizer_from_json(label_tokenizer_config)
        self.train_sequences = np.load('./pre-trained/train_sequences.npy')
        self.train_label = np.load('./pre-trained/train_label.npy')
        self.test_sequences = np.load('./pre-trained/test_sequences.npy')
        self.test_label = np.load('./pre-trained/test_label.npy')
        self.embeddings_matrix = np.load('./pre-trained/embeddings_matrix.npy')
        self.embedding_dim = 100
        self.word_index = self.tokenizer.word_index
        self.vocab_size = len(self.word_index)
        self.max_len = seq_len
        self.rnn_units = self.embedding_dim
        self.category_num = len(set(self.test_label[:, 0]))
Example #16
    def _load_jsons(self):
        print("Loading jsons...")
        loaded = read_json(rootpath + "yval_tokens.json")
        self.y_tokenizer = tokenizer_from_json(loaded)

        raw_word2int = read_json(rootpath + "xval_man_tokens.json")
        self.word2int = ast.literal_eval(raw_word2int)
        # print(self.word2int["_NA"],self.word2int["社保"])
        self.reverse_word_map = dict(
            map(reversed, self.y_tokenizer.word_index.items()))
        print("Done with jsons")
        return
Example #17
    def test_tokenizer_serde_no_fitting(self):
        tokenizer = text.Tokenizer(num_words=100)

        tokenizer_json = tokenizer.to_json()
        recovered = text.tokenizer_from_json(tokenizer_json)

        self.assertEqual(tokenizer.get_config(), recovered.get_config())

        self.assertEqual(tokenizer.word_docs, recovered.word_docs)
        self.assertEqual(tokenizer.word_counts, recovered.word_counts)
        self.assertEqual(tokenizer.word_index, recovered.word_index)
        self.assertEqual(tokenizer.index_word, recovered.index_word)
        self.assertEqual(tokenizer.index_docs, recovered.index_docs)
Example #18
def load_utils(tokenizer_path, labels_path, index_path):
    with open(tokenizer_path, 'r', encoding='utf-8') as jsonfile:
        tokenizer_data = json.load(jsonfile)

    tokenizer = tokenizer_from_json(tokenizer_data)

    with open(labels_path, 'r', encoding='utf-8') as jsonfile:
        reverse_labels = json.load(jsonfile)

    with open(index_path, 'r') as jsonfile:
        index = json.load(jsonfile)

    return tokenizer, reverse_labels, index
Example #19
    def preprocess(self, tokenizer_string=None):
        """
        Preprocess the textual data.

        Returns
        -------
        x_train: The processed-sequenced training data.
        y_train: Processed training labels
        x_val: The processed-sequenced validation data
        y_val: processed validation labels
        word_index: A dictionary containing the word-tokens and their indices for the sequencing.
        """
        if tokenizer_string is None:
            tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS)
            tokenizer.fit_on_texts(self.texts)
            self.tokenizer_string = tokenizer.to_json()
        else:
            self.tokenizer_string = tokenizer_string
            from keras.preprocessing.text import tokenizer_from_json
            tokenizer = tokenizer_from_json(tokenizer_string)
        sequences = tokenizer.texts_to_sequences(self.texts)

        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))

        data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
        labels = to_categorical(np.asarray(self.labels))

        print('Shape of data tensor:', data.shape)
        print('Shape of label tensor:', labels.shape)

        # split the data into a training set and a validation set
        if (self.VALIDATION_SPLIT):
            indices = np.arange(data.shape[0])
            np.random.shuffle(indices)

            data = data[indices]
            labels = labels[indices]
            num_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])

            x_train = data[:-num_validation_samples]
            y_train = labels[:-num_validation_samples]
            x_val = data[-num_validation_samples:]
            y_val = labels[-num_validation_samples:]
        else:
            x_train = data
            y_train = labels
            x_val = None
            y_val = None

        return x_train, y_train, x_val, y_val, word_index
Example #20
def load_tokenizer():
    if os.path.exists(config.TOKENIZER_PATH):
        with open(config.TOKENIZER_PATH) as f:
            return tokenizer_from_json(f.read())

    train_sentences, val_sentences, test_sentences = load_sentences()
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_sentences)
    tokenizer.fit_on_texts(val_sentences)
    tokenizer.fit_on_texts(test_sentences)

    # persist the tokenizer to file.
    with open(config.TOKENIZER_PATH, 'w') as f:
        f.write(tokenizer.to_json(ensure_ascii=False))

    return tokenizer
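Unlike the examples above that pass tokenizer.to_json() through json.dump/json.load, this variant writes the JSON string verbatim and feeds the raw file contents straight to tokenizer_from_json. A minimal sketch of that second convention (paths and corpus are illustrative, not from the original code):

# minimal sketch: store the raw JSON string and read it back directly
from keras.preprocessing.text import Tokenizer, tokenizer_from_json

tok = Tokenizer()
tok.fit_on_texts(["another tiny corpus"])

with open("tokenizer.json", "w") as f:   # write the raw JSON string
    f.write(tok.to_json(ensure_ascii=False))

with open("tokenizer.json") as f:        # read it back unchanged
    restored = tokenizer_from_json(f.read())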
Example #21
    def predict(self, model):
        text = self.statement_text.toPlainText()
        if text.isspace():
            return
        text = " ".join(self.preprocess().split("\n"))
        model_path = Path(__file__).parent.absolute() / "model"
        with open(model_path / "tokenizer.json", "r") as f:
            tokenizer_json = f.read()
        if not tokenizer_json:
            raise IOError("Cannot read tokenizer")
        tokenizer = tokenizer_from_json(tokenizer_json)
        x = tokenizer.texts_to_matrix([text], mode="binary")
        p = model.predict(x)
        y = argmax(p, axis=-1)
        pred = " ".join([word.capitalize()
                         for word in types[y[0].item()].split("-")])
        prob = f"{round(amax(p, axis=-1)[0].item() * 100, 2)}%"
        self.label_3.setText(f"Prediction: {pred}\nProbability: {prob}")
Example #22
def load_tokenizer(texts=None, num_words=MAX_WORDS):
    file = os.path.join(DATA_HOME, SAVE_DIR, __TOKENIZER_FILE.format(num_words))
    # tokenizer config file exists: load it and return the tokenizer
    if os.path.exists(file):
        print('loading tokenizer')
        with open(file, 'r') as f:
            return tokenizer_from_json(f.readline())

    if texts is None:
        texts, _ = load_raw_text()  # load the review data
    tokenizer = Tokenizer(num_words=num_words)
    print('fitting tokenizer')
    tokenizer.fit_on_texts(texts)
    tokenizer_json = tokenizer.to_json()  # avoid shadowing the json module
    print('saving tokenizer')
    with open(file, 'w') as f:
        f.write(tokenizer_json)

    return tokenizer
    def __init__(self,
                 epochs=5,
                 batch_size=36,
                 max_seq_len=25,
                 fit_verbose=2,
                 print_summary=True,
                 load_model_path=None,
                 tokenizer_path=None):
        self.epochs = epochs
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len
        self.fit_verbose = fit_verbose
        self.print_summary = print_summary
        self.encoder = LabelEncoder()

        if load_model_path:
            self.model = load_model(load_model_path)
            with open(tokenizer_path) as f:
                data = json.load(f)
                self.tokenizer = tokenizer_from_json(data)
        else:
            self.model = self.model_1b
            self.tokenizer = Tokenizer()
    def textPreproc(self, text_in):
        ''' Delete Special Character '''
        print('[model.py] Deleting Special Character..')
        temp_text = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z ]", "", str(text_in))
        print('[model.py] >>> ', temp_text)
        ''' Tokenization '''
        print('[model.py] Tokenizing..')
        okt = Okt()
        tocken_text = okt.morphs(temp_text, stem=True)
        print('[model.py] >>> ', tocken_text)
        ''' Delete Stopword '''
        print('[model.py] Deleting Stopword..')
        stopwords = [
            '의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로',
            '자', '에', '와', '한', '하다'
        ]  # Stopword List
        tocken_text = [word for word in tocken_text if not word in stopwords]
        tocken_text = [tocken_text]
        print('[model.py] >>> ', tocken_text)
        ''' Load tokenizer '''
        print('[model.py] Loading tokenizer..')
        with open('./../model/tokenizer.json') as f:
            data = json.load(f)
            tokenizer = tokenizer_from_json(data)
        tocken_text = tokenizer.texts_to_sequences(tocken_text)
        print('[model.py] >>> ', tocken_text)
        ''' Array Size Synch '''
        print('[model.py] Syncing array size..')
        max_array_len = 30
        preprocessed_data = pad_sequences(tocken_text, maxlen=max_array_len)
        print('[model.py] >>> ', preprocessed_data)

        print('[model.py] Preprocessing Done!')
        return preprocessed_data
Example #25
def WordtoInt(arr: List[str]) -> List[List[int]]:
    with open('/app/ml_controller/tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)

    return tokenizer.texts_to_sequences([arr])
Example #26
aa_list = aa_list_pre[30001:38001]

# Encrypt DNA, AA sequences into separate 'words' by adding spaces every 3 or 1 characters
aa_spaces = []
for aa_seq in aa_list:
    aa_current = encrypt(aa_seq,1)
    aa_spaces.append(aa_current)
dna_spaces = []
for dna_seq in dna_list:
    dna_current = encrypt(dna_seq,3)
    dna_spaces.append(dna_current)

# Load tokenizers from JSON (must be the same tokenizers used during training)
with open('aa_tokenizer.json') as f:
    aa_json = json.load(f)
aa_tokenizer = tokenizer_from_json(aa_json)

with open('dna_tokenizer.json') as f:
    dna_json = json.load(f)
dna_tokenizer = tokenizer_from_json(dna_json)

# Preprocess DNA and AA sequences (tokenize and pad)
preproc_aa, preproc_dna = preprocess(aa_spaces, dna_spaces)

# Ensure correct dimensionality
tmp_x = pad(preproc_aa, preproc_dna.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_dna.shape[-2]))

# Evaluate the test sequences on the trained model
results = model.evaluate(preproc_aa,preproc_dna, batch_size=16)
import pickle
import re
import json

import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import tokenizer_from_json

with open('label_set.pkl', 'rb') as f:
    label_set = pickle.load(f)

with open('tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

with open('finalized_model.sav', 'rb') as handle:
    model = pickle.load(handle)

def cleanPunctuation(sentence):  # function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned)
    cleaned = cleaned.replace("\n", " ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent
def load_tokenizer():
    with open('Data/tokenized-chars.json') as json_file:
        tokenizer_conf = json.load(json_file)

    tokenizer = tokenizer_from_json(tokenizer_conf)
    return tokenizer
Example #29
def load_bow(path):
    with open(path) as f:
        _bow = json.load(f)
    bow = tokenizer_from_json(_bow)

    return bow
x_anomalous_len = len(x_anomalous)
x = x_normal + x_anomalous

no_yy = [0 for i in range(x_normal_len)]
an_yy = [1 for i in range(x_anomalous_len)]
y = no_yy + an_yy

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=21)
print('len x_test: {}, len y_test: {}'.format(len(x_test), len(y_test)))

with open('data/tokenized-chars.json') as json_file:
    tokenizer_conf = json.load(json_file)
tokenizer = tokenizer_from_json(tokenizer_conf)
char_index = tokenizer.word_index

to_predict = x_test

# create the numerical sequences by mapping characters to their indices
sequences = tokenizer.texts_to_sequences(to_predict)
char_index = tokenizer.word_index
maxlen = 1000  # length of the longest sequence (= input_length of the model)
xx = pad_sequences(sequences, maxlen=maxlen)

model = models.load_model('model/lstm-model.h5')
model.load_weights('model/lstm-weights.h5')
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
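A possible continuation, not part of the original snippet: assuming the saved LSTM ends in a single sigmoid output (consistent with the binary_crossentropy loss above), class labels could be derived roughly as follows.

# hypothetical continuation: threshold the sigmoid scores into 0 (normal) / 1 (anomalous)
probs = model.predict(xx)
preds = (probs > 0.5).astype('int32').ravel()
print('predicted anomalous fraction:', preds.mean())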