import numpy as np
from keras_preprocessing import text


def test_one_hot():
    sample_text = 'The cat sat on the mat.'
    encoded = text.one_hot(sample_text, 5)
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 0

    # The same text encoded through a custom analyzer (replacing the
    # default filter/lower/split pipeline) must yield the same indexes.
    sample_text = 'The-cat-sat-on-the-mat'
    encoded2 = text.one_hot(sample_text, 5,
                            analyzer=lambda t: t.lower().split('-'))
    assert encoded == encoded2
    assert len(encoded2) == 6
    assert np.max(encoded2) <= 4
    assert np.min(encoded2) >= 0
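# Hedged side sketch of the analyzer hook exercised by the test above: any
# callable that maps a string to a token list can replace the default
# filter/lower/split pipeline, e.g. splitting comma-separated fields.
# (Reuses the `text` import from the test module above.)
csv_row = 'cat,dog,bird'
print(text.one_hot(csv_row, 10, analyzer=lambda t: t.split(',')))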
from keras.preprocessing.text import one_hot
from sklearn.feature_extraction.text import CountVectorizer


def fetch():
    # Pull the first five reviews out of the preprocessed database
    # (create_connection and create_dictionary_database are this
    # module's own sqlite helpers).
    i = 0
    content = ''
    list_of_reviews = []
    conn = create_connection("preprocessed_data")
    c = conn.cursor()
    data = c.execute("""SELECT review FROM movie_reviews LIMIT 5""")
    for review in data:
        content = content + ' ' + review[0]
        list_of_reviews.append(review[0])
        i += 1
        if i % 250 == 0:
            print("%s reviews have been fetched." % i)

    # Hash every word of the concatenated reviews into a vocabulary of 100000.
    tk = one_hot(content, split=" ", n=100000)
    print(tk)
    print("####################")

    # Contrast the hashed encoding with a count-based bag of words.
    vectorizer = CountVectorizer(min_df=0)
    vectorizer.fit(list_of_reviews)
    print(vectorizer.vocabulary_)
    print(vectorizer.transform(list_of_reviews).toarray())

    # Collect the unique words and persist them as a dictionary table.
    content = content.split()
    dictionary = set(content)
    conn.close()

    create_dictionary_database()
    connection = create_connection("dictionary")
    cur = connection.cursor()
    i = 0
    for word in dictionary:
        cur.execute("""INSERT INTO dictionary (word) VALUES (?)""", (word,))
        i += 1
        if i % 500 == 0:
            print("%s words are in dictionary." % i)
    connection.commit()
    connection.close()
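# Hedged mini-example of the two encodings contrasted in fetch(): one_hot
# gives one hashed index per token (order preserved, values vary per run),
# while CountVectorizer gives one count vector per document (order discarded).
from keras.preprocessing.text import one_hot
from sklearn.feature_extraction.text import CountVectorizer

reviews = ["good movie", "bad movie"]
print([one_hot(r, 50) for r in reviews])    # e.g. [[13, 7], [42, 7]]
cv = CountVectorizer()
print(cv.fit_transform(reviews).toarray())  # [[0 1 1]
                                            #  [1 0 1]]
print(cv.vocabulary_)                       # {'bad': 0, 'good': 1, 'movie': 2}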
from keras_preprocessing import text


def one_hot(input_text, n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True, split=' '):
    r"""One-hot encodes a text into a list of word indexes of size `n`.

    This function receives as input a string of text and returns a list
    of encoded integers, each corresponding to a word (or token) in the
    given input string.

    Arguments:
        input_text: Input text (string).
        n: int. Size of vocabulary.
        filters: list (or concatenation) of characters to filter out,
            such as punctuation.
            Default: ``'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'``,
            which includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.

    Returns:
        List of integers in `[1, n]`. Each integer encodes a word
        (unicity non-guaranteed).
    """
    return text.one_hot(input_text, n, filters=filters, lower=lower,
                        split=split)
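# Hedged usage sketch of the wrapper above: indexes come from Python's
# built-in hash() modulo the vocabulary size, so repeated words share an
# index within one run, distinct words may collide, and the exact values
# change between interpreter runs unless PYTHONHASHSEED is fixed.
print(one_hot('The cat sat on the mat.', 5))
# e.g. [2, 3, 1, 1, 2, 4] -- six indexes, one per word; both occurrences
# of 'the' map to the same integer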
from keras.preprocessing.text import (hashing_trick, one_hot,
                                      text_to_word_sequence)


def prep_1(text="The quick brown fox jumped over the lazy dog."):
    # Build the vocabulary from the unique words of the document.
    list_unique_words = list(set(text_to_word_sequence(text)))
    print(f"docs: {list_unique_words[:100]}")
    vocab_size = len(list_unique_words)
    print(f"vocab_size: {vocab_size}")
    # Over-provision the hash space by ~30% to reduce collisions.
    oh_encoding = one_hot(text, n=round(vocab_size * 1.3))
    print(f"oh_encoding: {oh_encoding}")
    hashed_doc = hashing_trick(text, n=round(vocab_size * 1.3),
                               hash_function='md5')
    print(f"hashed_doc: {hashed_doc}")
    return oh_encoding
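# Side note grounded in the keras_preprocessing source: one_hot() is simply
# hashing_trick() with Python's built-in hash(), so its output varies
# between interpreter runs, whereas hash_function='md5' (as used in
# prep_1 above) is stable across runs.
from keras.preprocessing.text import hashing_trick

doc = "the quick brown fox"
print(hashing_trick(doc, n=10, hash_function='md5'))  # same list every run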
import numpy as np
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten


def prueba_1():
    docs = ['Well done!', 'Good work', 'Great effort', 'nice work',
            'Excellent!', 'Weak', 'Poor effort!', 'not good',
            'poor work', 'Could have done better.']
    # define class labels
    labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    # integer encode the documents
    vocab_size = 50
    encoded_docs = [one_hot(d, vocab_size) for d in docs]
    print(encoded_docs)
    # pad documents to a max length of 4 words
    max_length = 4
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length,
                                padding='post')
    print(padded_docs)
    # define the model
    model = Sequential()
    model.add(Embedding(vocab_size, 8, input_length=max_length))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    # summarize the model
    print(model.summary())
    # fit the model
    model.fit(padded_docs, labels, epochs=50, verbose=0)
    # evaluate the model
    loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
    print('Accuracy: %f' % (accuracy * 100))
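# Hedged follow-up sketch: scoring an unseen document with the model from
# prueba_1() would require the exact same one_hot/pad pipeline and the
# same vocab_size and max_length; prueba_1() would need to return `model`
# for the last line to run, so it is shown commented out.
new_doc = 'great work'
encoded = [one_hot(new_doc, 50)]
padded = pad_sequences(encoded, maxlen=4, padding='post')
# model.predict(padded)  # probability of the positive class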
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Embedding

# Truncate the data set to `siz` samples (url, siz, and label1 are
# defined earlier in the script).
url = url[0:siz]


def label_encode(label):
    # Map string labels to integers: "bad" -> 0, anything else -> 1.
    enclabel = []
    for i in label:
        if i == "bad":
            enclabel.append(0)
        else:
            enclabel.append(1)
    return enclabel


# One-hot (hash) encode each URL string.
vocab_size = 20 * len(url)
encoded_docs = [one_hot(d, vocab_size) for d in url]
leng = [len(i) for i in encoded_docs]
print(max(leng))
padded_docs = pad_sequences(encoded_docs, maxlen=max(leng), padding='post')

label = label_encode(label1)
label = label[0:siz]
la = label
label = np_utils.to_categorical(label, 2)

model = Sequential()
# The Embedding input dimension must cover the one_hot index range, so it
# uses the same vocab_size that was passed to one_hot above (the original
# fragment passed `siz`, the sample count, which can be too small).
model.add(Embedding(vocab_size, 32, input_length=max(leng)))
input_array = padded_docs
model.compile('rmsprop', 'mse')
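# Quick illustration of the label pipeline above: np_utils.to_categorical
# turns the 0/1 integer labels into two-column one-hot rows.
from keras.utils import np_utils

print(np_utils.to_categorical([0, 1, 1], 2))
# [[1. 0.]
#  [0. 1.]
#  [0. 1.]]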
from gensim.parsing.preprocessing import (preprocess_string,
                                          remove_stopwords,
                                          stem_text,
                                          strip_multiple_whitespaces,
                                          strip_punctuation)
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Embedding, Flatten

CUSTOM_FILTERS = [
    lambda x: x.lower(),
    strip_multiple_whitespaces,
    strip_punctuation,
    remove_stopwords,
    stem_text,
]

# Clean, de-stopword, and stem each raw document (_docs is defined
# earlier in the script).
ppdocs = list()
for doc in _docs:
    word_list = preprocess_string(doc, CUSTOM_FILTERS)
    ppdocs.append(' '.join(word_list))

vocab_size = 4000
max_length = 30
encoded_docs = [one_hot(d, vocab_size) for d in ppdocs]
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# now develop the model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Dropout(rate=0.5))
model.add(Flatten())
# model.add(LSTM(128))  # alternative recurrent head kept for reference
model.add(Activation("relu"))
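# Hedged continuation sketch (not part of the original fragment): a model
# like the one above still needs an output head and a compile step before
# fitting; the single sigmoid unit and binary loss are assumptions about
# the target.
from keras.layers import Dense

model.add(Dense(1, activation='sigmoid'))  # assumed binary target
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])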
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences


def get_padded_sentences(data_set_to_pad, vocab_size, longest_sentence):
    # Re-join token lists into sentence strings (join_tokens is a project
    # helper), hash-encode each word, and pad every sequence to the
    # length of the longest sentence.
    sentences = join_tokens(data_set_to_pad)
    encoded_sentences = [one_hot(sentence, vocab_size)
                         for sentence in sentences]
    padded_sentences = pad_sequences(encoded_sentences,
                                     maxlen=longest_sentence,
                                     padding='post')
    return padded_sentences
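# Hedged usage sketch: join_tokens() is defined elsewhere in this project;
# it is assumed here to turn token lists into whitespace-joined strings.
tokenised = [['the', 'cat', 'sat'], ['on', 'the', 'mat']]
padded = get_padded_sentences(tokenised, vocab_size=100, longest_sentence=5)
print(padded.shape)  # (2, 5): two sentences, each padded to five indexes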
from keras_preprocessing.text import text_to_word_sequence
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.text import one_hot

text = "Hei, dette er noe testtext"  # Norwegian: "Hi, this is some test text"

# Read the raw corpus.
tronder_file = open("TextInput/rawText.txt", "r", encoding="utf-8")
tronder_text = tronder_file.read()
tronder_file.close()

# Hash-encode the corpus; the character count serves as a generous upper
# bound for the vocabulary size.
one_hot_result = one_hot(tronder_text, len(tronder_text))
ttws_result = text_to_word_sequence(tronder_text)
print(ttws_result)
print(one_hot_result)
print(len(ttws_result))     # both calls tokenize the text the same way,
print(len(one_hot_result))  # so the two lengths match
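# Side sketch motivated by the (unused) Tokenizer import above: unlike the
# hashing-based one_hot, Tokenizer builds an explicit, collision-free word
# index from the corpus it is fitted on. Reuses tronder_text from above.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([tronder_text])
print(tokenizer.texts_to_sequences([tronder_text]))
print(len(tokenizer.word_index))  # exact vocabulary size, no hashing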