def get_tokenizer_with_missing_words(text, tk_defined):
    rm_chars = '!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n'

    missing_words = list()
    for word in text.lower().translate(str.maketrans('', '',
                                                     rm_chars)).split():
        if word not in tk_defined.word_index:
            missing_words.append(word)

    # Create tokenizer of missing words
    tk_missing = Tokenizer(filters=rm_chars)
    tk_missing.fit_on_texts(missing_words)

    # Increase all items of a dictionary with a given value
    def increase_key_value(value, word_index):
        word_index.update(
            {key: word_index[key] + value
             for key in word_index.keys()})
        return word_index

    # Merge two dictionaries together
    def merge_dictionaries(d1, d2):
        return {**d1, **d2}

    tk_missing.word_index = increase_key_value(1379, tk_missing.word_index)

    # Create a tokenizer that combines tk_defined and tk_missing
    tk = Tokenizer(filters=rm_chars)
    tk.word_index = merge_dictionaries(tk_defined.word_index,
                                       tk_missing.word_index)

    return tk
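
A minimal usage sketch for the helper above, assuming `Tokenizer` comes from `keras.preprocessing.text` in this module and that the hard-coded offset of 1379 matches the size of the predefined vocabulary; the corpus and sentence below are invented for illustration.

from keras.preprocessing.text import Tokenizer

base_corpus = ["the quick brown fox", "jumps over the lazy dog"]
tk_defined = Tokenizer()
tk_defined.fit_on_texts(base_corpus)

new_text = "the quick crimson fox vaults over the sleepy dog"
tk = get_tokenizer_with_missing_words(new_text, tk_defined)

# Words missing from tk_defined are indexed above the 1379 offset.
print(tk.texts_to_sequences([new_text]))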
Example #2
    def predict_one_sentence(self, sentence):
        self.__setup_model()

        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy')
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy')

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentence)
        sentence = en_tokenizer.texts_to_sequences([sentence],
                                                   search_related_word=True)
        print(sentence)
        sentence = pad_sequences(sentence,
                                 maxlen=self.params['MAX_SEQ_LEN'],
                                 padding='post',
                                 truncating='post')
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
        print(sentence)

        prediction = self.M.predict(sentence)

        predicted_sentence = ""
        reverse_word_index = dict(
            (i, word) for word, i in self.de_word_index.items())
        for sentence in prediction:
            for token in sentence:
                max_idx = np.argmax(token)
                if max_idx == 0:
                    print("id of max token = 0")
                    print(
                        "second best prediction is ",
                        reverse_word_index[np.argmax(np.delete(token,
                                                               max_idx))])
                else:
                    next_word = reverse_word_index[max_idx]
                    if next_word == self.END_TOKEN:
                        break
                    elif next_word == self.START_TOKEN:
                        continue
                    predicted_sentence += next_word + " "

        return predicted_sentence
Example #3
def shuffleData(datas, labels, tokenflag=True):
    if tokenflag:
        tokenizer = Tokenizer(num_words=num_words + 1, oov_token='UNK')
        tokenizer.fit_on_texts(datas)
        tokenizer.word_index = {
            e: i
            for e, i in tokenizer.word_index.items() if i <= num_words
        }
        tokenizer.word_index[tokenizer.oov_token] = 1
        print(len(tokenizer.word_index))
        joblib.dump(tokenizer, 'dataFile.pkl')

    tokenizer = joblib.load('dataFile.pkl')
    all_text_seq = tokenizer.texts_to_sequences(datas)
    all_text_test = pad_sequences(all_text_seq,
                                  maxlen=sequence_length,
                                  padding='pre',
                                  value=0)
    np.random.seed(100)
    shuffle_indices = np.random.permutation(np.arange(len(labels)))
    x_shuffled = np.array(all_text_test)[shuffle_indices.astype(int)]
    y_shuffled = np.array(labels)[shuffle_indices.astype(int)]

    # Split train/test set
    dev_sample_index = -1 * int(dev_sample_percentage * len(labels))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    del datas, labels, x_shuffled, y_shuffled, all_text_test
    return x_train, x_dev, y_train, y_dev, tokenizer
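
A usage sketch for shuffleData, assuming the module-level constants it references (`num_words`, `sequence_length`, `dev_sample_percentage`) are defined next to it and that `joblib`, `numpy` and the Keras imports are available; the texts, labels and constant values are placeholders.

# Placeholder module-level constants referenced inside shuffleData.
num_words = 20000
sequence_length = 100
dev_sample_percentage = 0.25

texts = ["great movie", "terrible plot", "loved it", "would not watch again"]
labels = [1, 0, 1, 0]

# Note: this writes the fitted tokenizer to 'dataFile.pkl' as a side effect.
x_train, x_dev, y_train, y_dev, tok = shuffleData(texts, labels, tokenflag=True)
print(x_train.shape, x_dev.shape)  # (3, 100) (1, 100)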
Example #4
def get_multilabel_train_data(input_length, path="train_data_after_cut.xlsx"):
    df = pd.read_excel(path, encoding="utf-8")
    content = df["content"]
    filters = '!?"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n\r!@#¥%…&*():“”’‘;《》?,。'
    tokenizer = Tokenizer(filters=filters,
                          lower=True,
                          split=" ",
                          oov_token="UNK")
    if os.path.exists("vocab.json"):
        with open("vocab.json", encoding="utf-8") as f:

            vocab = json.load(f)
            tokenizer.word_index = vocab
    else:
        tokenizer.fit_on_texts(content)
        vocab = tokenizer.word_index
        with open("vocab.json", encoding="utf-8", mode="w") as f:

            json.dump(vocab, f)

    content_list_seq = tokenizer.texts_to_sequences(content)
    # print(sum([len(c) for c in content_list_seq])/len(content_list_seq))
    content_list_seq_pad = pad_sequences(content_list_seq, maxlen=input_length)

    return df, content_list_seq_pad, 4, len(vocab)
Example #5
def main():

    ### read training and testing data
    tag_list = pickle.load(open("label_mapping.p", "rb"))
    (_, X_test, _) = read_data(test_path, False)

    ### tokenizer for all data
    tokenizer = Tokenizer()
    word_index = pickle.load(open("word_index.p", "rb"))
    tokenizer.word_index = word_index
    ### convert word sequences to index sequence
    test_sequences = tokenizer.texts_to_sequences(X_test)

    ### padding to equal length
    test_sequences = pad_sequences(test_sequences, maxlen=306)

    ### split data into training set and validation set

    model = load_model('best_model.hdf5',
                       custom_objects={'f1_score': f1_score})
    Y_pred = model.predict(test_sequences)
    thresh = 0.4
    with open(output_path, 'w') as output:
        print('\"id\",\"tags\"', file=output)
        Y_pred_thresh = (Y_pred > thresh).astype('int')
        for index, labels in enumerate(Y_pred_thresh):
            labels = [
                tag_list[i] for i, value in enumerate(labels) if value == 1
            ]
            labels_original = ' '.join(labels)
            print('\"%d\",\"%s\"' % (index, labels_original), file=output)
    def calculate_hiddenstate_after_encoder(self, sentence):
        self.__setup_model()

        tokenizer = Tokenizer()
        self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
        self.word_index = self.word_index.item()
        tokenizer.word_index = self.word_index
        self.num_words = self.params['MAX_WORDS'] + 3
        tokenizer.num_words = self.num_words

        try:
            self.word_index[self.START_TOKEN]
            self.word_index[self.END_TOKEN]
            self.word_index[self.UNK_TOKEN]
        except Exception as e:
            print(e, "why")
            exit()

        sentence = tokenizer.texts_to_sequences([sentence])
        sentence = [self.word_index[self.START_TOKEN]] + sentence[0] + [self.word_index[self.END_TOKEN]]
        sentence = pad_sequences([sentence], maxlen=self.params['max_seq_length'], padding='post')
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])

        encoder_name = 'encoder'

        encoder = Model(inputs=self.M.input, outputs=self.M.get_layer(encoder_name).output)

        prediction = encoder.predict(sentence, batch_size=1)
        print(prediction.shape)
        return prediction
    def tokenize_texts(self, corpus=None):
        print('[Preprocess] tokenize texts', flush=True)

        if corpus is None: corpus = self.corpus

        filters = '!"$%&()*+,-./:;<=>?@[\]^_`{|}~'
        tokenizer = Tokenizer(filters=filters)
        wi_path = self.model_dir + 'word_index.json'

        if not osp.exists(wi_path):
            print('[Preprocess] construct word index', flush=True)
            tokenizer.fit_on_texts(corpus)
            word_index = tokenizer.word_index
            with open(wi_path, 'w') as f:
                print('[Preprocess] save word index: ' + wi_path, flush=True)
                json.dump(word_index, f)
        else:
            with open(wi_path, 'r') as f:
                print('[Preprocess] load word index: ' + wi_path, flush=True)
                word_index = json.load(f)
            tokenizer.word_index = word_index

        train_Xi = tokenizer.texts_to_sequences(self.train_X)
        test_Xi = tokenizer.texts_to_sequences(self.test_X)

        self.train_Xi = pad_sequences(train_Xi)
        self.maxlen = self.train_Xi.shape[1]
        self.test_Xi = pad_sequences(test_Xi, maxlen=self.maxlen)

        self.word_index = word_index
Example #8
def main():

    print('==================================================================')
    print('Read test data and categories.')
    test_text = read_test(argv[1])
    categories = np.load('categories.npy')

    print('==================================================================')
    print('Load tokenizer.')
    tokenizer = Tokenizer()
    tokenizer.word_index = np.load(argv[3][:-3] + '_word_index.npy').item()
    test_sequences = tokenizer.texts_to_sequences(test_text)
    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LEN)
    print('Shape of test data:', test_data.shape)

    print('==================================================================')
    print('Load model.')
    model = load_model(argv[3], custom_objects={'f1_score': f1_score})
    model.summary()

    print('Predict.')
    result = model.predict(test_data, verbose=1)

    print('==================================================================')
    print('Output result. threshold: %f' % THRESHOLD)
    output_result(argv[2], result, categories)
def tokenize_sequence(sentences, filters, max_num_words, word_index):
    """
    Tokenizes a given input sequence of words.
    Args:
        sentences: List of sentences
        filters: List of filters/punctuations to omit (for Keras tokenizer)
        max_num_words: Number of words to be considered in the fixed length sequence
        max_vocab_size: Number of most frequently occurring words to be kept in the vocabulary
    Returns:
        x : List of padded/truncated indices created from list of sentences
        word_index: dictionary storing the word-to-index correspondence
    """

    sentences = [' '.join(word_tokenize(s)[:max_num_words]) for s in sentences]

    # NOTE: oov_token=True is a truthy placeholder; unknown words come back as
    # None from texts_to_sequences and are replaced with word_index['UNK'] below.
    tokenizer = Tokenizer(filters=filters, oov_token=True)
    tokenizer.word_index = word_index
    x = tokenizer.texts_to_sequences(list(sentences))

    for i, seq in enumerate(x):
        if any(t is None for t in seq):
            seq = [t if t is not None else word_index['UNK'] for t in seq]
        seq.append(word_index['EOS'])
        x[i] = seq

    x = pad_sequences(x, padding='post', truncating='post', maxlen=max_num_words, value=word_index['PAD'])

    return x
def encode_textdata(df_X_text, tokenizer, mode, max_words, maxlen):
    ## Encode text columns; encoded text features should not be normalized.

    print('Starting to encode text inputs...')

    texts = df_X_text.iloc[:,0].values.astype('U')
    print('Found %s texts.' % len(texts))

    if mode == 'tfidf':
        if tokenizer is None:
            tokenizer = Tokenizer(num_words=max_words)
            tokenizer.fit_on_texts(texts)
        X_text = tokenizer.texts_to_matrix(texts, mode='tfidf')
        print('tfidf X_text shape: {}'.format(X_text.shape))

    elif mode == 'glove':
        # vectorize the text samples into a 2D integer tensor
        if tokenizer is None:
            tokenizer = Tokenizer(num_words=max_words, oov_token='<UNK>')
            tokenizer.fit_on_texts(texts)
            tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= max_words}
            # tokenizer.word_index[tokenizer.oov_token] = max_words + 1

        sequences = tokenizer.texts_to_sequences(texts)
        
        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))

        X_text = pad_sequences(sequences, maxlen=maxlen, padding='post')
    else:
        raise ValueError('Unknown text processing mode: {}'.format(mode))
                
    return X_text, tokenizer  ### need to save embedding_matrix as well
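
An illustrative call to encode_textdata, assuming pandas and the Keras `Tokenizer`/`pad_sequences` imports are available in this module; the DataFrame contents and parameter values are made up.

import pandas as pd

df_X_text = pd.DataFrame({"text": ["good product, works well",
                                   "arrived broken, very disappointed"]})

# First call fits a fresh tokenizer; passing it back in reuses the vocabulary.
X_text, tok = encode_textdata(df_X_text, tokenizer=None, mode='glove',
                              max_words=10000, maxlen=50)
X_again, _ = encode_textdata(df_X_text, tokenizer=tok, mode='glove',
                             max_words=10000, maxlen=50)
print(X_text.shape)  # (2, 50)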
Example #11
def main():
    config = Config()

    char_pred_test_start, char_pred_test_end = create_input_data(config.models)

    df_test = pd.read_csv(
        '../input/tweet-sentiment-extraction/test.csv').fillna('')
    df_test['selected_text'] = ''

    tokenizer = Tokenizer(num_words=None,
                          char_level=True,
                          oov_token='UNK',
                          lower=True)
    tokenizer.word_index = VOCAB

    len_voc = len(tokenizer.word_index) + 1

    X_test = tokenizer.texts_to_sequences(df_test['text'].values)

    test_dataset = TweetCharDataset(df_test,
                                    X_test,
                                    char_pred_test_start,
                                    char_pred_test_end,
                                    max_len=config.max_len_val,
                                    train=False,
                                    n_models=config.n_models)

    pred_tests = k_fold_inference(config, test_dataset, len_voc, seed=42)

    np.save(f"preds_char_test_{config.model_name}.npy", np.array(pred_tests))
Example #12
def predict(corpus_path, model_dir, embeddings):

  with open('word_index.pickle', 'rb') as fin:
    word_index = pickle.load(fin)
  with open('embedding_matrix.pickle', 'rb') as fin:
    embedding_matrix = pickle.load(fin)

  tweets, labels = get_data(corpus_path)

  tokenizer = Tokenizer()
  tokenizer.word_index = word_index
  sequences = tokenizer.texts_to_sequences(tweets)
  sequences = pad_sequences(sequences, maxlen=50)

  adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=10e-9, decay=0.0, amsgrad=False)
  model = get_model(model_dir, corpus_path, embedding_matrix)
  corpus_file = os.path.basename(corpus_path)

  weights_paths = os.listdir(model_dir)
  search = re.sub('2018-', '', corpus_file)
  search = re.sub('.txt', '', search)
  search = re.sub('-dev', '', search)
  #print(search)
  weights_file = [match for match in weights_paths if search in match][0]
  weights_path = os.path.join(model_dir, weights_file)
  #weights_path = os.path.join(model_dir, corpus_file.split('.')[0] + '.hdf5')
  model.load_weights(weights_path)
  model.compile(loss='mean_squared_error', optimizer=adam)

  predictions = model.predict(sequences)
  #print(pearsonr(labels.reshape(-1, 1), predictions))
  return predictions
Example #13
def dataset_preparation(data, num_words=None):
    """Prep the corpus text for training.
    Expect end tokens, start tokens, return tokens to already have been added.
    data--corpus of text
    num_words--max number of words for model to have.
    """
    # An oov_token is required because the vocabulary is capped below
    # (the exact token string is arbitrary).
    tokenizer = Tokenizer(oov_token='<unk>')
    # Want the system to have a way to end a poem, so add another end token.
    corpus = data.lower().replace('<endtoken>',
                                  '<endtoken2><endtoken>').split("<endtoken>")
    tokenizer.fit_on_texts(corpus)
    ###Generate list of words:
    #If max number of words given, find and use just those words:
    if num_words is not None:
        tokenizer.word_index = {
            e: i
            for e, i in tokenizer.word_index.items() if i <= num_words
        }
        #make sure to still have out of vocabulary token:
        tokenizer.word_index[tokenizer.oov_token] = num_words + 1
    total_words = len(tokenizer.word_index) + 1
    ###Generate list of input sequences
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)
    return tokenizer, predictors, label, max_sequence_len, total_words
Example #14
    def __prepareFeatures(self, dataset: str, importIndexes=False):
        sources, languages = self.extractSources(dataset)

        # configs
        max_features: int = self.config['max_features']
        max_len_sequences: int = self.config['max_len_sequences']

        wordsIndexes: dict = {}
        tokenizer = Tokenizer(num_words=max_features,
                              filters=TOKENIZER_CONFIG['filter'],
                              oov_token='UNKNOWN')

        # tokenization
        if not importIndexes:
            tokenizer.fit_on_texts(sources)
            # export vocabulary
            self.exportVocabulary(tokenizer.word_index)
        else:
            # import vocabulary
            tokenizer.word_index = self.importVocabulary()

        # X + Y
        X = tokenizer.texts_to_sequences(sources)
        X = pad_sequences(X, maxlen=max_len_sequences)
        Y = languages

        return np.array(X), np.array(Y)
def main():

    print('==================================================================')    
    print('Read test data and categories.')
    test_text = read_test(argv[1])
    categories = np.load('categories.npy')

    print('==================================================================')    
    print('Predict')
    model_list = read_model_list('model_list.txt')
    nb_models = len(model_list)
    print('Total models: %d' % nb_models)

    sum_result = np.zeros([len(test_text), len(categories)])
    sum_weight = 0
    for (weight, name) in model_list:

        print('model: ' + name + ', weight: %f' % weight)
        tokenizer = Tokenizer()
        tokenizer.word_index = np.load(name[:-3] + '_word_index.npy').item()
        test_sequences = tokenizer.texts_to_sequences(test_text)
        test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LEN)
    
        model = load_model(name, custom_objects={'f1_score': f1_score})
        sum_result += model.predict(test_data, verbose=0)
        sum_weight += weight
    
    sum_result /= sum_weight

    print('==================================================================')    
    print('Output result. threshold: %f' % THRESHOLD)
    output_result(argv[2], sum_result, categories, THRESHOLD)
def get_dict(sentences, filters, max_num_words, max_vocab_size):
    sentences = [' '.join(word_tokenize(s)[:max_num_words]) for s in sentences]

    tokenizer = Tokenizer(filters=filters)
    tokenizer.fit_on_texts(sentences)

    word_index = dict()
    word_index['PAD'] = 0
    word_index['UNK'] = 1
    word_index['GO'] = 2
    word_index['EOS'] = 3

    for i, word in enumerate(dict(tokenizer.word_index).keys()):
        word_index[word] = i + 4

    tokenizer.word_index = word_index
    x = tokenizer.texts_to_sequences(list(sentences))

    for i, seq in enumerate(x):
        if any(t >= max_vocab_size for t in seq):
            seq = [t if t < max_vocab_size else word_index['UNK'] for t in seq]
        seq.append(word_index['EOS'])
        x[i] = seq

    x = pad_sequences(x, padding='post', truncating='post', maxlen=max_num_words, value=word_index['PAD'])

    word_index = {k: v for k, v in word_index.items() if v < max_vocab_size}

    return word_index
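
get_dict builds the word_index that tokenize_sequence (shown earlier) consumes; a small end-to-end sketch, assuming NLTK's `word_tokenize` (with its punkt data) plus the Keras `Tokenizer`/`pad_sequences` imports are available, and using an invented corpus.

corpus = ["hello world", "hello there general kenobi"]
filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

word_index = get_dict(corpus, filters, max_num_words=10, max_vocab_size=50)
x = tokenize_sequence(corpus, filters, max_num_words=10, word_index=word_index)

print(word_index['PAD'], word_index['EOS'])  # 0 3
print(x.shape)                               # (2, 10)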
Example #17
def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration file and returns a
    tokenizer instance.
    # Arguments
        json_string: JSON string encoding a tokenizer configuration.
    # Returns
        A Keras Tokenizer instance
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer indexing gets converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
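
This mirrors the JSON produced by `Tokenizer.to_json()` in recent keras_preprocessing releases; a small round-trip sketch, where the corpus is arbitrary and the availability of `to_json()` is an assumption about the installed version.

from keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=100)
tok.fit_on_texts(["to be or not to be"])

json_string = tok.to_json()  # assumes a keras_preprocessing version with to_json()
restored = tokenizer_from_json(json_string)

# The restored tokenizer encodes text exactly like the original.
assert restored.texts_to_sequences(["to be"]) == tok.texts_to_sequences(["to be"])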
Example #18
def main():
    (_, X_test, _) = read_data(sys.argv[1], False)

    tokenizer = Tokenizer()
    tokenizer.word_index = pickle.load(open('bow_word_index.pickle', 'rb'))

    test_bag = tokenizer.texts_to_matrix(X_test, 'count')

    model = Sequential()

    model.add(Dense(512, activation='relu', input_dim=51867))
    model.add(Dropout(0.5))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.6))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.8))

    model.add(Dense(38, activation='sigmoid'))

    model.load_weights('bow.hdf5')
    Y_pred = model.predict(test_bag)
    thresh = 0.4
    tag_list = pickle.load(open('label_mapping.pickle', 'rb'))
    with open(sys.argv[2], 'w') as output:
        print('\"id\",\"tags\"', file=output)
        Y_pred_thresh = (Y_pred > thresh).astype('int')
        for index, labels in enumerate(Y_pred_thresh):
            labels = [
                tag_list[i] for i, value in enumerate(labels) if value == 1
            ]
            labels_original = ' '.join(labels)
            print('\"%d\",\"%s\"' % (index, labels_original), file=output)
def generate_sequences_from_texts(texts,
                                  indices_list,
                                  textgen,
                                  batch_size=128):
    """
    Generates sequences from the given texts based on the selected configuration
    """
    is_words = textgen.config['word_level']
    is_single = textgen.config['single_text']
    max_length = textgen.config['max_length']
    meta_token = textgen.META_TOKEN

    if is_words:
        new_tokenizer = Tokenizer(filters='', char_level=True)
        new_tokenizer.word_index = textgen.vocab
    else:
        new_tokenizer = textgen.tokenizer

    while True:
        np.random.shuffle(indices_list)

        X_batch = []
        Y_batch = []
        count_batch = 0

        for row in range(indices_list.shape[0]):
            text_index = indices_list[row, 0]
            end_index = indices_list[row, 1]

            text = texts[text_index]

            if not is_single:
                text = [meta_token] + list(text) + [meta_token]

            if end_index > max_length:
                x = text[end_index - max_length:end_index + 1]
            else:
                x = text[0:end_index + 1]

            y = text[end_index + 1]

            if y in textgen.vocab:
                x = process_sequence([x], textgen, new_tokenizer)
                y = text_generation_encode_cat([y], textgen.vocab)

                X_batch.append(x)
                Y_batch.append(y)

                count_batch += 1

                if count_batch % batch_size == 0:
                    X_batch = np.squeeze(np.array(X_batch))
                    Y_batch = np.squeeze(np.array(Y_batch))

                    yield (X_batch, Y_batch)

                    X_batch = []
                    Y_batch = []
                    count_batch = 0
    def predict_batch(self, sentences):
        self.__setup_model()

        tokenizer = Tokenizer()
        self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
        self.word_index = self.word_index.item()
        tokenizer.word_index = self.word_index
        self.num_words = self.params['MAX_WORDS'] + 3
        tokenizer.num_words = self.num_words

        try:
            self.word_index[self.START_TOKEN]
            self.word_index[self.END_TOKEN]
            self.word_index[self.UNK_TOKEN]
        except Exception as e:
            print(e, "why")
            exit()

        sentences = tokenizer.texts_to_sequences(sentences)
        mod_sentences = []
        for sentence in sentences:
            mod_sentences.append([self.word_index[self.START_TOKEN]] + sentence + [self.word_index[self.END_TOKEN]])
        sentences = pad_sequences(mod_sentences, maxlen=self.params['max_seq_length'], padding='post')
        sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

        batch_size = sentences.shape[0]
        if batch_size > 10:
            batch_size = 10

        reverse_word_index = dict((i, word) for word, i in self.word_index.items())
        predicted_sentences = []
        from_idx = 0
        to_idx = batch_size
        while True:
            print("from_idx, to_idx, hm_sentences", from_idx, to_idx, sentences.shape[0])
            current_batch = sentences[from_idx:to_idx]
            prediction = self.M.predict(current_batch, batch_size=batch_size)

            for sentence in prediction:
                predicted_sent = ""
                for token in sentence:
                    max_idx = np.argmax(token)
                    if max_idx == 0:
                        print("id of max token = 0")
                        print("second best prediction is ", reverse_word_index[np.argmax(np.delete(token, max_idx))])
                    else:
                        next_word = reverse_word_index[max_idx]
                        if next_word == self.END_TOKEN:
                            break
                        elif next_word == self.START_TOKEN:
                            continue
                        predicted_sent += next_word + " "
                predicted_sentences.append(predicted_sent)
            from_idx += batch_size
            to_idx += batch_size
            if to_idx > sentences.shape[0]:
                # TODO: also predict the leftover sentences when their count is not a multiple of batch_size
                break
        return predicted_sentences
Example #21
def get_padded_dataset(dataset):
    labels = [x['label'] for x in dataset]
    data = [x['sentence'] for x in dataset]
    # Preprocessing text
    tokenizer = Tokenizer()
    tokenizer.word_index = word_index
    data_seqs = tokenizer.texts_to_sequences(data)
    data_seqs_padded = pad_sequences(data_seqs, maxlen=MAX_SEQUENCE_LENGTH)
    labels = np.array(labels)
    return (data_seqs_padded, labels)
def prepare_tokenizer():
    # An oov_token must be set here because it is added to word_index below.
    tk = Tokenizer(char_level=True, oov_token='UNK')

    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    char_dict = {}
    for i, char in enumerate(alphabet):
        char_dict[char] = i + 1
    tk.word_index = char_dict.copy()
    tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

    return tk
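
A quick check of the character-level tokenizer above, assuming `Tokenizer` is the Keras tokenizer imported in this module; the sample string is arbitrary.

tk = prepare_tokenizer()

print(tk.texts_to_sequences(["hi!"]))  # [[8, 9, 40]] -> indices of 'h', 'i', '!'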
Example #23
def tokenize(sentence, dictionary):
    num_words = len(dictionary)
    tokenizer = Tokenizer(num_words,
                          filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ 1234567890')
    tokenizer.word_index = dictionary
    max_words = 35
    # sentence_vec = text_to_word_sequence(sentence)
    sentence_vec = tokenizer.texts_to_sequences([sentence])
    print(sentence_vec)
    sentence_vec = sequence.pad_sequences(sentence_vec, maxlen=max_words)
    return sentence_vec
def tokenize(dic, data):
    # create a tokenizer and feed in word index
    t = Tokenizer(num_words=None, lower=True, split=' ')
    t.word_index = dic
    # convert words from each call transcription into an index array
    allWords = []
    transcriptions = data['Words']
    for text in transcriptions:
        words = convert_text_to_index_array(text, dic)
        allWords.append(words)
    # convert index array into a matrix and return it
    return t.sequences_to_matrix(allWords, mode='binary')
Example #25
    def calculate_hiddenstate_after_encoder(self, sentence):
        self.__setup_model()

        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy')
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy')

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentence)
        sentence = en_tokenizer.texts_to_sequences([sentence])
        print(sentence)
        sentence = pad_sequences(sentence,
                                 maxlen=self.params['MAX_SEQ_LEN'],
                                 padding='post',
                                 truncating='post')
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
        print(sentence)

        encoder_name = 'encoder'

        encoder = Model(inputs=self.M.input,
                        outputs=self.M.get_layer(encoder_name).output)

        prediction = encoder.predict(sentence, batch_size=1)
        print(prediction.shape)
        return prediction
Example #26
def train(corpus_file,
          model_dir,
          embeddings,
          affect_lexicon,
          fresh_run=False,
          data_dir=None,
          epochs=3,
          batch_size=64,
          val_split=0.2):

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    if fresh_run:
        word_index = prepare_word_index(data_dir)
        embedding_index = prepare_embedding_index(embeddings)
        embedding_matrix = prepare_embedding_matrix(word_index,
                                                    embedding_index)
        affect_index = prepare_affect_index(affect_lexicon)
        affect_matrix = prepare_affect_matrix(word_index, affect_index)
    else:
        with open('word_index.pickle', 'rb') as fin:
            word_index = pickle.load(fin)
        with open('embedding_index.pickle', 'rb') as fin:
            embedding_index = pickle.load(fin)
        with open('embedding_matrix.pickle', 'rb') as fin:
            embedding_matrix = pickle.load(fin)
        with open('affect_index.pickle', 'rb') as fin:
            affect_index = pickle.load(fin)
        with open('affect_matrix.pickle', 'rb') as fin:
            affect_matrix = pickle.load(fin)

    tweets, labels = get_data(corpus_file)
    print(tweets[:5])
    print(labels[:5])

    tokenizer = Tokenizer()
    tokenizer.word_index = word_index
    sequences = tokenizer.texts_to_sequences(tweets)
    sequences = pad_sequences(sequences, maxlen=50)

    x_train, x_val, y_train, y_val = train_test_split(sequences,
                                                      labels,
                                                      test_size=val_split,
                                                      random_state=42)

    scores, models = grid_search(x_train, y_train, x_val, y_val, architectures,
                                 param_grid, 5, corpus_file, embedding_matrix,
                                 model_dir)

    with open('history.pickle', 'wb') as fout:
        pickle.dump((scores, models), fout)
def kerasTokenizer(balanced_texts, max_sentence_length, topbestwords):
    global vector_dim
    vector_dim = max_sentence_length
    global top_words
    top_words = topbestwords
    tokenizer = Tokenizer(num_words=topbestwords)
    tokenizer.fit_on_texts(balanced_texts)
    sequences = tokenizer.texts_to_sequences(balanced_texts)
    data = pad_sequences(sequences, maxlen=max_sentence_length, padding='pre')
    # print(data[:2])
    tokenizer.word_index = OrderedDict(
        sorted(tokenizer.word_index.items(), key=lambda t: t[1]))
    return data, tokenizer.word_index
Example #28
def main():

    ### read training and testing data
    tag_list = pickle.load(open(tags_path, 'rb'))
    (_, X_test, _) = read_data(test_path, False)
    all_corpus = pickle.load(open(corpus_path, 'rb'))
    print('Found %d articles.' % len(all_corpus))
    ### tokenizer for all data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    word_index = pickle.load(open(wIndex_path, 'rb'))
    tokenizer.word_index = word_index

    ### convert word sequences to index sequence
    print('Convert to index sequences.')
    #train_sequences = tokenizer.texts_to_matrix(X_data, mode = 'tfidf')
    test_sequences = tokenizer.texts_to_matrix(X_test, mode='tfidf')

    ### padding to equal length
    print('Padding sequences.')
    #train_sequences = pad_sequences(train_sequences)
    #max_article_length = train_sequences.shape[1]
    test_sequences = pad_sequences(test_sequences, maxlen=51867)

    ### build model
    print('Building model.')
    model = Sequential()
    model.add(Dense(input_dim=51867, units=480, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(38, activation='sigmoid'))
    model.summary()


    model.load_weights(weight_path)
    Y_pred = model.predict(test_sequences)
    thresh = threshold
    with open(output_path, 'w') as output:
        print('\"id\",\"tags\"', file=output)
        Y_pred_thresh = (Y_pred > thresh).astype('int')
        for index, labels in enumerate(Y_pred_thresh):
            labels = [
                tag_list[i] for i, value in enumerate(labels) if value == 1
            ]
            labels_original = ' '.join(labels)
            print('\"%d\",\"%s\"' % (index, labels_original), file=output)
Example #29
def load_tokenizer_from_file(filename):

    tokenizer = Tokenizer()

    with open(filename, 'r') as infile:
        tokenizer_data = json.load(infile)

    tokenizer.word_counts = OrderedDict(tokenizer_data['word_counts'])
    tokenizer.word_docs = tokenizer_data['word_docs']
    tokenizer.word_index = tokenizer_data['word_index']
    tokenizer.document_count = tokenizer_data['document_count']
    tokenizer.index_docs = tokenizer_data['index_docs']

    return tokenizer
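
The loader above expects a JSON file holding five tokenizer attributes; a matching save helper might look like the sketch below (hypothetical, not part of the source), reusing the `json` import the loader already relies on.

def save_tokenizer_to_file(tokenizer, filename):
    # Hypothetical counterpart to load_tokenizer_from_file: dump the same
    # five attributes the loader reads back.
    tokenizer_data = {
        'word_counts': list(tokenizer.word_counts.items()),  # keeps ordering explicit
        'word_docs': dict(tokenizer.word_docs),
        'word_index': tokenizer.word_index,
        'document_count': tokenizer.document_count,
        'index_docs': dict(tokenizer.index_docs),
    }
    # Note: integer keys in index_docs become strings after a JSON round trip,
    # matching what the loader assigns back without conversion.
    with open(filename, 'w') as outfile:
        json.dump(tokenizer_data, outfile)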
Example #30
def structure_data(path='agnews_data'):

    texts, labels = create_dataset(path)
    tok = Tokenizer(char_level=True, split='')
    tok.fit_on_texts(texts)
    tok.word_index = char_index
    sequences = tok.texts_to_sequences(texts)
    padding = pad_sequences(sequences, maxlen=1014, padding='post')
    padding = np.array(padding)
    labels = to_categorical(labels)

    print('Annotations done and data is ready to be fed to the network')

    return padding, labels
Example #31
def generate_sequences_from_texts(texts, indices_list,
                                  textgenrnn, context_labels,
                                  batch_size=128):
    is_words = textgenrnn.config['word_level']
    is_single = textgenrnn.config['single_text']
    max_length = textgenrnn.config['max_length']
    meta_token = textgenrnn.META_TOKEN

    if is_words:
        new_tokenizer = Tokenizer(filters='', char_level=True)
        new_tokenizer.word_index = textgenrnn.vocab
    else:
        new_tokenizer = textgenrnn.tokenizer

    while True:
        np.random.shuffle(indices_list)

        X_batch = []
        Y_batch = []
        context_batch = []
        count_batch = 0

        for row in range(indices_list.shape[0]):
            text_index = indices_list[row, 0]
            end_index = indices_list[row, 1]

            text = texts[text_index]

            if not is_single:
                text = [meta_token] + list(text) + [meta_token]

            if end_index > max_length:
                x = text[end_index - max_length: end_index + 1]
            else:
                x = text[0: end_index + 1]
            y = text[end_index + 1]

            if y in textgenrnn.vocab:
                x = process_sequence([x], textgenrnn, new_tokenizer)
                y = textgenrnn_encode_cat([y], textgenrnn.vocab)

                X_batch.append(x)
                Y_batch.append(y)

                if context_labels is not None:
                    context_batch.append(context_labels[text_index])

                count_batch += 1

                if count_batch % batch_size == 0:
                    X_batch = np.squeeze(np.array(X_batch))
                    Y_batch = np.squeeze(np.array(Y_batch))
                    context_batch = np.squeeze(np.array(context_batch))

                    # print(X_batch.shape)

                    if context_labels is not None:
                        yield ([X_batch, context_batch], [Y_batch, Y_batch])
                    else:
                        yield (X_batch, Y_batch)
                    X_batch = []
                    Y_batch = []
                    context_batch = []
                    count_batch = 0