def classify_string(self, string):
    num_features = 1000  # Why am I setting this?
    max_len = 50  # Why am I setting this?
    z = np.array(one_hot(string_explode(string), num_features, filters=''))
    z.shape = (1, z.shape[0])
    z = sequence.pad_sequences(z, max_len)
    return self.levs[self.model.predict(z, batch_size=1).argmax()]
def transform(self, sentence):
    x = text.one_hot(sentence, self.max_features, lower=True, filters=" ")
    x_new = [[0 if t != i else 1 for i in range(self.max_features)] for t in x]
    null_vector = [0] * self.max_features
    pad_size = self.maxlen - len(x_new)
    for i in range(pad_size):
        x_new.append(null_vector)
    return x_new
def _format_x(self, z, words):
    return sequence.pad_sequences(
        [one_hot(string_explode(x, words=words), self.num_features, filters='') for x in z],
        maxlen=self.max_len,
        # truncating='post'
    )
def input_data():
    train_file = "3.25-data.txt"
    test_file = "test.txt"
    train_words = []
    train_tags = []
    X = []
    Y = []
    test_words = []
    test_tags = []
    with open(train_file, 'r') as f1:
        for line in f1:
            tks = line.split('\t', 1)
            word = tks[0]
            # word = jieba.cut(word, cut_all=True)
            words = ""
            for i in word:
                words += i + " "
            words = words[:len(words) - 1].encode('utf8')
            x = one_hot(n=10000, text=words)
            if len(x) > 300:
                print(len(x))
            try:
                tag = tks[1]
                if tag == "预警\n":
                    tag = [1, 0]
                else:
                    tag = [0, 1]
                train_words.append(x)
                train_tags.append(tag)
            except:
                pass
    # print(train_words[0])
    index = [i for i in range(len(train_words))]
    train_words = pad_sentences(train_words)
    train_tags = np.concatenate([train_tags], 0)
    random.shuffle(index)
    for i, j in enumerate(train_words):
        if i < 0.1 * len(train_words):
            test_words.append(train_words[index[i]])
            test_tags.append(train_tags[index[i]])
        else:
            X.append(train_words[index[i]])
            Y.append(train_tags[index[i]])
    # with open(test_file, 'r') as f1:
    #     for line in f1:
    #         tks = line.split('\t', 1)
    #         word = tks[0]
    #         tag = tks[1]
    #         test_words.append(word)
    #         test_tags.append(tag)
    return X, Y, test_words, test_tags
def preprocess(tweet):
    tweet = re.sub(r'@\w+', ' ', tweet)
    tweet = re.sub('[^A-Za-z1-9!? ]', ' ', tweet)
    tweet = tweet.lower()
    # stop_words = set(stopwords.words('english'))
    # tweet = word_tokenize(tweet)
    # tweet = [w for w in tweet if not w in stop_words]
    # tweet = ' '.join(tweet)
    tweet = one_hot(tweet, 3000, lower=False)
    processed = pad_sequences([tweet], 35, padding='post', truncating='post')
    return processed
def transform_keywords(self, file_name):
    inf_file = open(file_name)
    data = list()
    for one_news in inf_file.readlines():
        single = one_news.strip().split(',')
        mapping = list()
        for one_keyword in single:
            mapping.append(one_hot(one_keyword, 7000)[0])
        data.append(mapping)
    # print(data)
    return data
def transform_titles(self, file_name):
    inf_file = open(file_name)
    data = list()
    for one_news in inf_file.readlines():
        single = nltk.word_tokenize(self.clean_sentence(one_news))
        mapping = list()
        for one_keyword in single:
            mapping.append(one_hot(one_keyword, 7000)[0])
        data.append(mapping)
    # print(data)
    return data
def one_encoding(data):
    words = set(text_to_word_sequence(str(data), filters="!”#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t'\n"))
    size = len(words)
    result = one_hot(str(data), round(size * 1.3))
    label = []
    train_data = []
    test_data = []
    get_id = {}
    for i in words:
        a = one_hot(str(i), round(size * 1.3))
        b = i
        label.extend([a])
        get_id[b] = str(a)
    for i in range(round(len(label) * 0.75)):
        train_data.append(i)
    for i in range(round(len(label) * 0.25)):
        test_data.append(i)
    return train_data, test_data
def transform_titles(text):
    data = list()
    for one_news in text:
        single = nltk.word_tokenize(clean_sentence(one_news))
        mapping = list()
        for one_keyword in single:
            mapping.append(one_hot(one_keyword, 7000)[0])
        data.append(mapping)
    # print(data)
    return data
def discriminateur_text(model_disc, text_seq_length, seed_text):
    encoded = [one_hot(d, vocab_size) for d in seed_text]
    print(encoded)
    encoded = pad_sequences(encoded, maxlen=text_seq_length, truncating='pre')
    y = model_disc.predict([encoded])
    print(y)
    for i in y:
        if i < 0.6:
            print("it's fake")
        else:
            print("it's real")
def encoding_question(text):
    text = clean_text(text)
    ques_id = np.load('ques_id.npy').item()
    encoded_ques = [one_hot(text, 1000)]
    encoded_ques = pad_sequences(encoded_ques, maxlen=55, padding='post')
    encoded_ques = np.array(encoded_ques)
    encoded_ques = np.reshape(encoded_ques, [1, 55])
    return encoded_ques
def encode_artist(self, X_artist):
    # Integer encode artist names.
    # We estimate the vocabulary size as (unique_artists * 1000), which is much larger than needed,
    # to reduce the probability of collisions from the hash function.
    if self.vocab_size is None:
        self.vocab_size = len(X_artist['artist'].unique()) * 1000
    for idx, row in X_artist.iterrows():
        X_artist.at[idx, 'artist'] = one_hot(row['artist'], self.vocab_size)
    return X_artist
def one_hot_vec(train_data1, test_data1):
    train_data1 = np.asarray(train_data1)
    test_data1 = np.asarray(test_data1)
    X = np.concatenate((train_data1, test_data1), axis=0)
    vocabulary = np.unique(np.hstack(X))
    print('---length---: ', len(vocabulary))
    X_train = []
    X_test = []
    length = 0
    print('--- one hot encoding ---')
    for i in train_data1:
        temp = text.one_hot(' '.join(i), len(vocabulary))
        length = max(length, len(temp))
        X_train.append(temp)
    for i in test_data1:
        temp = text.one_hot(' '.join(i), len(vocabulary))
        length = max(length, len(temp))
        X_test.append(temp)
    X_train = sequence.pad_sequences(X_train, maxlen=500)
    X_test = sequence.pad_sequences(X_test, maxlen=500)
    return np.asarray(X_train), np.asarray(X_test), len(vocabulary)
def predict(self, text):
    ts = self.__cws.text_to_sequence(text)
    t0 = ' '.join(ts)
    t1 = one_hot(t0, self.__vocab_size)
    t2 = [t1]
    t3 = pad_sequences(t2, maxlen=self.__docs_max_length, padding='post')
    out = self.model.predict(t3)
    result = out[0][0]
    return result
def getFeatureMatrix(self, df):
    if cfg.input_type == "text":
        from keras.preprocessing.text import one_hot
        from keras.preprocessing.sequence import pad_sequences
        textconverter = lambda x: x
        if sys.version_info[0] == 2:
            textconverter = lambda x: x.encode("utf-8")
        X = pad_sequences(
            df.apply(lambda row: one_hot(textconverter(row[self.text_field]), self.vocabulary_size), axis=1),
            self.word_limit)
        self.fields = [cfg.text_field]
        self.input_shape = (self.word_limit, )
    elif self.objective == "time_series":
        num_series = 1 + len(self.fields)
        data = [df[self.target].tolist()]
        num_rows = len(data[0])
        for field in self.fields:
            data.append(df[field].tolist())
        instances = []
        target_instances = []
        for index in range(num_rows - (self.window_size + 1)):
            windows = []
            for windex in range(self.window_size):
                series = []
                for sindex in range(num_series):
                    series.append(data[sindex][index + windex])
                windows.append(series)
            target_window = []
            for sindex in range(num_series):
                target_window.append(data[sindex][index + self.window_size])
            instances.append(windows)
            target_instances.append(target_window)
        X = np.array(instances)
        self.seqtargets = np.array(target_instances)
        X = np.reshape(X, (X.shape[0], self.window_size, num_series))
        print(X.shape)
        self.input_shape = (self.window_size, num_series)
    else:
        X = df.as_matrix(self.fields)
        self.input_shape = (len(self.fields), )
    self.model_metadata["predictors"] = self.fields
    return X
def pre_process(self):
    print("Loading...")
    pos_x, neg_x = self.util.get_data()
    pos_y = [[1, 0] for i in pos_x]
    neg_y = [[0, 1] for j in neg_x]
    print("Splitting...")
    pos_x_train, pos_x_test, pos_y_train, pos_y_test = train_test_split(
        pos_x, pos_y, test_size=0.30, random_state=42)
    neg_x_train, neg_x_test, neg_y_train, neg_y_test = train_test_split(
        neg_x, neg_y, test_size=0.30, random_state=42)
    X_train = np.concatenate((pos_x_train, neg_x_train), axis=0)
    Y_train = np.concatenate((pos_y_train, neg_y_train), axis=0)
    X_test = np.concatenate((pos_x_test, neg_x_test), axis=0)
    Y_test = np.concatenate((pos_y_test, neg_y_test), axis=0)
    X_train_encode = [one_hot(d, 2000) for d in X_train]
    X_test_encode = [one_hot(d, 2000) for d in X_test]
    X_train = pad_sequences(X_train_encode, maxlen=200, padding='post')
    X_test = pad_sequences(X_test_encode, maxlen=200, padding='post')
    Y_train = np.array(Y_train)
    X_test = np.array(X_test)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=10)
    X_test, Y_test = shuffle(X_test, Y_test, random_state=10)
    return X_train, Y_train, X_test, Y_test
def predict(self, text):
    ts = self.__cws.text_to_sequence(text)
    t0 = ' '.join(ts)
    print(text, t0)
    t1 = one_hot(t0, self.__vocab_size)
    t2 = [t1]
    print(t2)
    t3 = pad_sequences(t2, maxlen=self.__docs_max_length, padding='post')
    print(t3)
    out = self.model.predict(t3)
    print(out)
def one_hot_encode(docs, vocab_size, max_length_factor):
    '''First converts a text to a sequence of words (or tokens).
    Then, with keras.preprocessing.text.one_hot(), which is a wrapper for the
    hashing_trick() function, returns an integer-encoded version of the document.
    The use of a hash function means that there may be collisions and not all
    words will be assigned unique integer values.
    Finally, pads the sequences to the same length.'''
    wordsequence = [text_to_word_sequence(str(d)) for d in docs]
    encoded_docs = [one_hot(str(d), vocab_size) for d in wordsequence]
    padded_docs = pad_sequences(encoded_docs,
                                maxlen=(len(max(encoded_docs, key=len)) * max_length_factor),
                                padding='post')
    return padded_docs
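# A minimal sketch illustrating the note in the docstring above: one_hot() wraps
# hashing_trick() with Python's built-in hash(), so collisions are possible and the
# codes can change between interpreter runs (Python 3 hash randomization). Passing
# hash_function='md5' to hashing_trick() gives a run-stable encoding instead.
# The sample sentence and the vocab size of 50 here are arbitrary assumptions.
from keras.preprocessing.text import one_hot, hashing_trick

sample = 'the quick brown fox jumps over the lazy dog'
print(one_hot(sample, 50))                             # hash()-based codes; may differ between runs
print(hashing_trick(sample, 50, hash_function='md5'))  # md5-based codes; deterministic across runs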
def test_diff(self, data, labels):
    self.labels = labels
    D = data
    vocab_size = 300
    max_length = 200
    embedding_vector_length = 32
    emb_turn1 = sequence.pad_sequences(
        [one_hot(d, vocab_size) for d in D["turn1"]], maxlen=max_length)
    emb_turn2 = sequence.pad_sequences(
        [one_hot(d, vocab_size) for d in D["turn2"]], maxlen=max_length)
    emb_turn3 = sequence.pad_sequences(
        [one_hot(d, vocab_size) for d in D["turn3"]], maxlen=max_length)
    D = D.drop(['turn1', 'turn2', 'turn3'], axis=1).values
    results = self.model.evaluate([D, emb_turn1, emb_turn2, emb_turn3],
                                  self.labels, batch_size=32)
    print(results)
    print("Done testing")
    return results
def preprocess(self):
    global review_int
    # Stemming & stopword removal
    data2 = data
    wn = nltk.wordnet.WordNetLemmatizer()
    lc = nltk.stem.SnowballStemmer('english')
    sw = set(stopwords.words('english'))
    hasStop = data2['text'].tolist()
    noStop = []
    for item in hasStop:
        filtered = []
        wt = word_tokenize(item)
        for wo in wt:
            if wo == "not":
                filtered.append(wo)
            elif wo not in sw:
                filtered.append(wo)
        filtered = [wn.lemmatize(w) for w in filtered]
        filtered = [lc.stem(w) for w in filtered]
        noStop.append(' '.join(filtered))
    temp = pd.Series(noStop)
    data2['text'] = temp.values
    # Word embedding
    with open('vocab.json', 'r') as json_data:
        voc = json.load(json_data)
    from keras.preprocessing.text import one_hot
    import random
    dataList = data2['text'].tolist()
    vocab_int = voc
    data3 = []
    vocab_size = 200
    for item in dataList:
        notDone = True
        temp1 = one_hot(item, vocab_size)
        temp2 = item.split()
        for i in range(len(temp2)):
            if temp2[i] in vocab_int:
                continue
            else:
                while notDone:
                    if temp1[i] in vocab_int.values():
                        temp1[i] = random.randrange(1, vocab_size)
                    else:
                        notDone = False
                vocab_int[temp2[i]] = temp1[i]
        data3.append(temp1)
    review_int = data3
def prediction(self, user_text):
    # Encode the text
    encoded_docs = [one_hot(user_text, conf_keras_first_go.vocab_size)]
    # Pad documents to a max length
    padded_text = pad_sequences(encoded_docs, maxlen=conf_keras_first_go.max_length, padding='post')
    # Prediction based on the model
    prediction = self.model.predict(padded_text)
    # Decode the prediction
    encoder = LabelBinarizer()
    encoder.fit(self.test_labels)
    result = encoder.inverse_transform(prediction)
    return result[0]
def test(self, D):
    self.labels = pd.get_dummies(D[output_emocontext])
    D = D.drop(output_emocontext, axis=1)
    vocab_size = 300
    max_length = 200
    embedding_vector_length = 32
    emb_turn1 = sequence.pad_sequences(
        [one_hot(d, vocab_size) for d in D["turn1"]], maxlen=max_length)
    emb_turn2 = sequence.pad_sequences(
        [one_hot(d, vocab_size) for d in D["turn2"]], maxlen=max_length)
    emb_turn3 = sequence.pad_sequences(
        [one_hot(d, vocab_size) for d in D["turn3"]], maxlen=max_length)
    D = D.drop(['turn1', 'turn2', 'turn3'], axis=1).values
    results = self.model.evaluate([D, emb_turn1, emb_turn2, emb_turn3],
                                  self.labels, batch_size=32)
    print(results)
    print("Done testing")
    return results
def load_data(self):
    # Load train data
    x_train = [one_hot(q, self._vocab_size) for q in self._questions]
    x_train = pad_sequences(x_train, maxlen=self._max_length, padding='post')
    x_train = np.array(x_train)
    top_answers, answers_info = self.get_top_answers()
    print(answers_info)
    y_train = []
    for i in range(len(x_train)):
        rand = random.randint(0, 999)
        y_train.append(answers_info[top_answers[rand][0]])
    # Store the unique IDs of answers
    unique_answers = {key: 1 for key in y_train}
    self.targets_size = len(unique_answers)
    # Load validation data
    x_val = [one_hot(q, self._vocab_size) for q in self._questions_val]
    x_val = pad_sequences(x_val, maxlen=self._max_length, padding='post')
    x_val = np.array(x_val)
    y_val = []
    for i in range(len(x_val)):
        rand = random.randint(0, 999)
        y_val.append(answers_info[top_answers[rand][0]])
    # Encode the output data
    y_train, y_val = np.array(y_train), np.array(y_val)
    encoder = LabelEncoder()
    encoder.fit(y_train)
    encoder.fit(y_val)
    encoded_y_train, encoded_y_val = encoder.transform(y_train), encoder.transform(y_val)
    y_train, y_val = np_utils.to_categorical(encoded_y_train), np_utils.to_categorical(encoded_y_val)
    return x_train, y_train, x_val, y_val
def load_data(vocab_size, num_classes):
    with open('/Users/nadeau/Documents/Metagenome_Classification/train_test_set/X_test150.pickle', 'rb') as f:
        x_test = pickle.load(f)
        # Make sequences into "sentences" of words
        # https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
        x_test = [[letter for letter in word] for word in x_test]
        x_test = [" ".join(letters) for letters in x_test]
    with open('/Users/nadeau/Documents/Metagenome_Classification/train_test_set/X_train150.pickle', 'rb') as f:
        x_train = pickle.load(f)
        x_train = [[letter for letter in word] for word in x_train]
        x_train = [" ".join(letters) for letters in x_train]
    with open('/Users/nadeau/Documents/Metagenome_Classification/train_test_set/y_test150.pickle', 'rb') as f:
        y_test_str = pickle.load(f)
        y_test = enumerate_y_labels(y_test_str)
    with open('/Users/nadeau/Documents/Metagenome_Classification/train_test_set/y_train150.pickle', 'rb') as f:
        y_train_str = pickle.load(f)
        y_train = enumerate_y_labels(y_train_str)
    # Integer encode the "words" in the sequences
    x_test = [[one_hot(s, vocab_size)] for s in x_test]
    x_test = np.array(x_test)
    x_train = [[one_hot(s, vocab_size)] for s in x_train]
    x_train = np.array(x_train)
    print('x train shape: {}'.format(x_train.shape))
    # Convert integer y label vectors to one-hot matrices
    y_test_1D = y_test
    y_train_1D = y_train
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print('y train shape: {}'.format(y_train.shape))
    return y_train_1D, y_test_1D, x_train, y_train, x_test, y_test
def __split_and_one_hot_and_padded_docs(self, docs):
    enc_docs = []
    for doc in docs:
        ts = self.__cws.text_to_sequence(doc)
        t0 = ' '.join(ts)
        x = one_hot(t0, self.__get_vocab_size())
        enc_docs.append(x)
    pad_docs = pad_sequences(enc_docs,
                             maxlen=self.__get_docs_max_length(), padding='post')
    return pad_docs
def generate(skip=False):
    with open('database/banki_ru_train.csv') as f:
        reader = csv.reader(f)
        first = True
        if skip:
            m = random.randint(1, 40000)
        for row in reader:
            if first:
                first = False
                continue
            x_train = text.one_hot(row[2], vocab_size)
            y_train = np.array(percent(row[2])).reshape(-1, 1)
            x_train = sequence.pad_sequences([x_train], maxlen=max_len)
            res = x_train, y_train
            yield res
def preprocess_data(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)                 # Remove special characters (punctuation); keep letters and digits
    text = text.lower()                                      # Convert to lower case
    text = text.split()                                      # Tokenization
    text = [word for word in text if word not in sw]         # Remove stopwords
    text = [lemma.lemmatize(word=w, pos='v') for w in text]  # Lemmatization
    text = [k for k in text if len(k) > 2]                   # Remove words with length <= 2
    text = ' '.join(text)
    ohe = [one_hot(word, vocab_size) for word in text]
    padded = pad_sequences(ohe, padding=padding_type, truncating=trunc_type)
    fd = (pd.DataFrame(padded)).transpose()
    return fd
def preprocess_data(stored_contents):
    from keras.preprocessing.text import text_to_word_sequence, one_hot
    from keras.preprocessing.sequence import pad_sequences
    # See: https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
    # Tokenize the document
    word_sequence = text_to_word_sequence(filter_data(stored_contents))
    words = set(word_sequence)  # set() groups the tokens, filtering out duplicated ones
    vocab_size = len(words)  # vocabulary size; this will be the input
    tokenized_array = one_hot(stored_contents, round(vocab_size))  # hash-encode the input data
    # data_to_predict = pad_sequences(tokenized_array, maxlen=9000)
    # return data_to_predict
    return tokenized_array
def word_vectorizing_keras(csv_file, max_features, max_len):
    """
    Text vectorization using the built-in Keras utilities.
    :param csv_file: dataset
    :param max_features: vocabulary (alphabet) size
    :param max_len: maximum length of a text vector
    :return:
    """
    labels = np.asarray(list(csv_file['1'])).astype('float32')
    texts = list(csv_file['0'])
    X = [one_hot(text, max_features) for text in texts]
    X = pad_sequences(X, maxlen=max_len)
    return X, labels
def embedded(data, v_size):
    from keras.preprocessing.text import one_hot
    dataList = data['text'].tolist()
    vocab_int = {}
    encoded = []
    vocab_size = v_size
    for item in dataList:
        temp1 = one_hot(item, vocab_size)
        temp2 = item.split()
        for i in range(len(temp2)):
            if temp2[i] in vocab_int:
                continue
            else:
                vocab_int[temp2[i]] = temp1[i]
        encoded.append(temp1)
    return encoded, vocab_int
def generate(skip=False):
    with open('database/banki_ru_train.csv') as f:
        reader = csv.reader(f)
        first = True
        if skip:
            m = random.randint(1, 40000)
        for row in reader:
            if first:
                first = False
                continue
            x_train = text.one_hot(row[2], vocab_size)
            y_train = int(row[4])
            x_train = sequence.pad_sequences([x_train], maxlen=max_len)
            y_train = np_utils.to_categorical([y_train], 6)
            res = x_train, y_train
            yield res
def format_testcase(self, string, type, max_len):
    single = list()
    if type == 0:  # titles
        single = nltk.word_tokenize(self.clean_sentence(string))
    else:  # keywords
        single = string
    mapping = list()
    for one_keyword in single:
        mapping.append(one_hot(one_keyword, 7000)[0])
    while len(mapping) < max_len:
        mapping.append(0)
    data = list()
    data.append(mapping)
    print(data)
    return mapping
def input_data_gen():
    train_file = "total-data.txt"
    train_words = []
    train_tags = []
    X = []
    Y = []
    test_words = []
    test_tags = []
    with open(train_file, 'r') as f1:
        for line in f1:
            # line = line.decode('utf-8')
            tks = line.split('-0-')
            # print(tks)
            word = tks[0]
            x = one_hot(n=10000, text=word)
            # try:
            tag = tks[1]
            if tag == "+":
                tag = [1, 0, 0]
            elif tag == "-":
                tag = [0, 1, 0]
            else:
                tag = [0, 0, 1]
            train_words.append(x)
            train_tags.append(tag)
            # except Exception as e:
            #     print(e.message)
    # print(train_words[0])
    index = [i for i in range(len(train_words))]
    train_words = pad_sentences(train_words)
    train_tags = np.concatenate([train_tags], 0)
    random.shuffle(index)
    for i, j in enumerate(train_words):
        if i < 0.1 * len(train_words):
            test_words.append(train_words[index[i]])
            test_tags.append(train_tags[index[i]])
        else:
            X.append(train_words[index[i]])
            Y.append(train_tags[index[i]])
    return X, Y, test_words, test_tags
def genData():
    # X, Y arrays for all data
    X = []
    Y = []
    # generate roughly 20% as the test set
    train_count = 0
    validate_count = 0
    test_count = 0
    line_count = 0
    with open('./hs.csv', encoding='latin-1') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='\t')
        for row in csv_reader:
            X.append(row[0])
            Y.append(row[1])
            line_count += 1
    print("actual vocab size", len(set(X)))
    output_size = len(set(Y))
    # Encode the vocab
    encoded_X = [one_hot(d, vocab_size) for d in X]
    padded_X = pad_sequences(encoded_X, maxlen=max_len, padding="post")
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_Y = encoder.transform(Y)
    Y_one_hot = np_utils.to_categorical(encoded_Y)
    # Training and test sets
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    # Generate sets for train, test and validate
    np.random.seed(1)
    for i in range(line_count):
        if round(np.random.rand() * 100) < 81:
            X_train.append(padded_X[i])
            Y_train.append(Y_one_hot[i])
            train_count = train_count + 1
        else:
            X_test.append(padded_X[i])
            Y_test.append(Y_one_hot[i])
            test_count = test_count + 1
    return X_train, Y_train, X_test, Y_test, output_size
def get_iemocap_data(self):
    X, Y = self.iemocap_util.read_iemocap_data()
    X, Y = shuffle(X, Y, random_state=42)
    Y = [self.encode_class(y, ["Positive", "Neutral", "Negative"]) for y in Y]
    X = [one_hot(d, 2000) for d in X]
    X = pad_sequences(X, maxlen=50, padding='post')
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)
    X_train = np.array(x_train)
    Y_train = np.array(y_train)
    X_test = np.array(x_test)
    Y_test = np.array(y_test)
    return X_train, Y_train, X_test, Y_test
# text = text.lower()
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
doc = tokenizer.tokenize(text)
# Stopword removal
# DONE: Add your code here. Store results to a list with name 'doc'
# doc = [word for word in doc if word not in stopwords]
# Stemming
# DONE: Add your code here. Store results to a list with name 'doc'
# stemmer = PorterStemmer()
doc = [stemmer.stem(word) for word in doc]
# Convert the list of words to one string
doc = ' '.join(w for w in doc).encode('ascii')
doc = one_hot(doc, vocab_size, split=' ')
data[doc_id] = doc  # the list `data` contains the preprocessed documents

data_train, data_test, labels_train, labels_test = cross_validation.train_test_split(
    data, labels, test_size=0.4, random_state=1033)

# Model learning and prediction
# TODO: test different learning algorithms
y_train = np.array(labels_train)
y_test = np.array(labels_test)
y_train = (y_train == 1).astype('float32')
y_test = (y_test == 1).astype('float32')
print("Pad sequences (samples x time)")
line = line.strip().decode("ascii", "ignore").encode("utf-8")
if len(line) == 0:
    continue
lines.append(line)
fin.close()

sents = nltk.sent_tokenize(" ".join(lines))
tokenizer = Tokenizer(5000)  # use the top 5000 words only
tokens = tokenizer.fit_on_texts(sents)
vocab_size = len(tokenizer.word_counts) + 1

xs = []
ys = []
for sent in sents:
    embedding = one_hot(sent, vocab_size)
    triples = list(nltk.trigrams(embedding))
    w_lefts = [x[0] for x in triples]
    w_centers = [x[1] for x in triples]
    w_rights = [x[2] for x in triples]
    xs.extend(w_centers)
    ys.extend(w_lefts)
    xs.extend(w_centers)
    ys.extend(w_rights)

ohe = OneHotEncoder(n_values=vocab_size)
X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense()
Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
if mode == 'train':
    # Save all checkpoints
    checkpointer = ModelCheckpoint(filepath=fdir + "/weights.hdf5", verbose=1, save_best_only=False)
    history = LossHistory()
    sample = Sample()
    print("Training...")
    for e in range(nb_epoch):
        print("epoch %d" % e)
        # for X_batch, Y_batch in zip(batches_X, batches_Y):
        for i, batch in enumerate(batches_X):
            X_batch = batches_X[i]
            Y_batch = one_hot(batches_Y[i], max_features_Y)
            model.fit(X_batch, Y_batch, batch_size=batch_size, nb_epoch=1,
                      validation_split=0.1, callbacks=[checkpointer, history, sample])
    f = open(fdir + '/losses.pkl', 'wb')
    pkl.dump(history.losses, f, protocol=pkl.HIGHEST_PROTOCOL)
    f.close()
else:
    preds = model.predict_classes(X_test, batch_size=1, verbose=1)
    print(preds[0])
    get_activations = theano.function([model.layers[3].input],
                                      model.layers[4].output(train=False),
                                      allow_input_downcast=True)
    activations = get_activations(X_test)
    print(activations.shape)
# a = ["a d d", "d a"]
# a = ["我是一个爱生活的人", "他也是一个爱生活的人"]
# one_h = one_hot(filters=base_filter(), n=30, text=a)
# # o.fit_on_texts(a)
# # b = one_h(a)
# print one_hot(filters=base_filter(), n=30, text=a)
# print one_hot(filters=base_filter(), n=30, text=a)
# a = ['hello world', 'foo bar']
# tokenizer = Tokenizer()
# train_tokens = tokenizer.fit_transform(a)
# print train_tokens
# comma_tokenizer = lambda x: jieba.cut(x, cut_all=True)
# from sklearn.feature_extraction.text import HashingVectorizer
# v = HashingVectorizer(tokenizer=comma_tokenizer, n_features=30000, non_negative=True)
# train_data = v.fit_transform(a)
# print train_data
# import jieba
a = "我是一个男孩"
c = jieba.cut(a, cut_all=False)
w = ""
# print(", ".join(c))
for i in c:
    w += i + " "
    # print i
w = w[:len(w) - 1].encode('utf8')
# w = "我 是 一个男孩"
print(one_hot(filters=base_filter(), n=30000, text=w))
# print w
# # print c.next()
def test_one_hot():
    text = 'The cat sat on the mat.'
    encoded = one_hot(text, 5)
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 0
def one_hot(word_model, n):
    return text.one_hot(word_model, n, filters=text_filter(), lower=False, split=" ")
#!/usr/bin/python3
# coding: utf-8
# https://github.com/EliasCai/sentiment/blob/master/sentiment_words.py#L78
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import hashing_trick
##################################################################
## 1. text_to_word_sequence, one_hot, hashing_trick
texts = ['some thing to eat', 'some thing to drink']
print(text_to_word_sequence(texts[0]))  # ['some', 'thing', 'to', 'eat']; simply splits on whitespace
print(one_hot(texts[0], 10))  # [5, 7, 5, 7]; (10 means the integer codes stay within 10)
print(one_hot(texts[1], 10))  # [5, 7, 5, 5]; hashing is used internally, so for a fixed (text, n) each str always gets the same value
# This is a wrapper of the `hashing_trick` function using `hash` as the hashing function; unicity of the word-to-index mapping is not guaranteed.
##################################################################
## 2. Tokenizer: indices are assigned by word frequency, with ties broken by order of appearance
# keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n', lower=True, split=" ", char_level=False)
# Tokenizer is a class for vectorizing texts, or turning texts into sequences (lists of word indices in the dictionary, starting from 1).
# num_words: None or int, the maximum number of words to keep. If set to an integer, the tokenizer is restricted to the num_words most frequent words in the dataset.
# char_level: if True, every character is treated as a token.
texts = ['some thing to eat', 'some thing to drink']
tmp_tokenizer = Tokenizer(num_words=None)  # num_words: None or int, the maximum number of words to keep; less frequent words are dropped
tmp_tokenizer.fit_on_texts(texts)
# tmp_tokenizer.fit_on_texts(texts[0]); tmp_tokenizer.fit_on_texts(texts[1])  # don't do this: a plain string is counted character by character
# Attributes
print(tmp_tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]); how often each word appeared during fitting
print(tmp_tokenizer.word_docs)  # {'thing': 2, 'eat': 1, 'to': 2, 'some': 2, 'drink': 1}; in how many documents/texts each word appeared
print(tmp_tokenizer.word_index)  # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}; rank or index of each word
print(len(tmp_tokenizer.word_index))  # 5; dictionary size
print(tmp_tokenizer.index_docs)  # {2: 2, 4: 1, 3: 2, 1: 2, 5: 1}; word_index and word_docs combined
print(tmp_tokenizer.document_count)  # 2; number of documents used for fitting
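##################################################################
## 3. A minimal follow-up sketch, assuming the tmp_tokenizer fitted above: texts_to_sequences() maps each
##    text to its list of word indices, and texts_to_matrix() builds a fixed-size document-term matrix.
print(tmp_tokenizer.texts_to_sequences(texts))  # [[1, 2, 3, 4], [1, 2, 3, 5]]; word_index lookups per text
print(tmp_tokenizer.texts_to_matrix(texts, mode='count'))  # shape (2, 6); column 0 is reserved, columns 1..5 follow word_index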
def get_input_data(train_file="rm_result.txt", test_file=None, split=0.1, label_func=get_label_rm):
    X = []
    Y = []
    train_words = []
    train_tags = []
    test_len = 0
    if test_file is not None:
        with open(test_file, 'r') as f1:
            for line in f1:
                line = line.replace("\n", "")
                tks = line.split('-0-')
                word = tks[0]
                x = one_hot(n=10000, text=word)
                if len(x) > 500:
                    continue
                try:
                    tag = label_func(tks[1])
                    train_words.append(x)
                    train_tags.append(tag)
                except:
                    pass
        test_len = len(train_words)
    with open(train_file, 'r') as f1:
        for line in f1:
            line = line.replace("\n", "")
            tks = line.split('-0-')
            word = tks[0]
            x = one_hot(n=10000, text=word)
            if len(x) > 500:
                continue
            try:
                tag = label_func(tks[1])
                train_words.append(x)
                train_tags.append(tag)
            except:
                pass
    # print(train_words[0])
    index = [i for i in range(len(train_words))]
    print("padding")
    train_words = pad_sentences(train_words)
    train_tags = np.concatenate([train_tags], 0)
    print("end padding")
    if test_file is None:
        random.shuffle(index)
        test_len = int(split * len(train_words))
    test_words = []
    test_tags = []
    for i, j in enumerate(train_words):
        if i < test_len:
            test_words.append(train_words[index[i]])
            test_tags.append(train_tags[index[i]])
        else:
            X.append(train_words[index[i]])
            Y.append(train_tags[index[i]])
    return X, Y, test_words, test_tags
__author__ = 'bohaohan'
# from keras.datasets import imdb
# from nltk.stem import WordNetLemmatizer
# (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=20, test_split=0.2)
# for i in X_test:
#     print i
# print WordNetLemmatizer().lemmatize("lives")
# import nltk
# nltk.download()
from keras.datasets import imdb, reuters
from get_data import input_data

max_features = 20000
maxlen = 100  # cut texts after this number of words (among the top max_features most common words)
batch_size = 32
print('Loading data...')
# (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
# (X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=1000, test_split=0.2)
# X_train, y_train, X_test, y_test = input_data()
# print(len(X_train), 'train sequences')
# print(len(X_test), 'test sequences')
# print(X_train[0], 'train sequences')
# tokenizer = Tokenizer(nb_words=1000)
# X_train = sequence.pad_sequences(X_train, maxlen=100)
# print(X_train[0], 'train sequences')
from keras.preprocessing.text import one_hot
x = "你 我 他"
print(one_hot(n=10000, text=x))
#!/usr/bin/python3
# coding: utf-8
# Reference: [use-word-embedding-layers-deep-learning-keras](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/)
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding
##################################################################
## Part 1: Introduction to Embedding
# We will define a small problem where we have 10 text documents, each with a comment about a piece of work a student submitted.
# Each text document is classified as positive "1" or negative "0". This is a simple sentiment analysis problem.
docs = ['Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!',
        'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.']
labels = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]  # define class labels
vocab_size = 50  # integer encode the documents; the vocabulary size, kept generous at 50 to reduce hash collisions
encoded_docs = [one_hot(d, vocab_size) for d in docs]  # one_hot assigns each word an integer code; codes may collide, so vocab_size should be large
print(encoded_docs)  # [[18, 44], [37, 9], [34, 24], [39, 9], [44], [39], [36, 24], [9, 37], [36, 9], [29, 39, 44, 49]]
# one_hot() is just one approach, similar in spirit to tf-idf or a bag-of-words model
# Keras prefers inputs to be vectorized and all inputs to have the same length
max_length = 4  # pad documents to a max length of 4 words
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)  # [[18 44 0 0] [37 9 0 0] [34 24 0 0] [39 9 0 0] [44 0 0 0] [39 0 0 0] [36 24 0 0] [ 9 37 0 0] [36 9 0 0] [29 39 44 49]]
# At this point every document is represented by a 4-dimensional vector. We are now ready to define our Embedding layer as part of our neural network model.
# The Embedding has a vocabulary of 50 and an input length of 4. We will choose a small embedding space of 8 dimensions.
##################################################################
## Embedding(input_dim, output_dim, input_length) defines the model: (dictionary size, word-vector size, number of words per document)
# Embedding requires that the input data be integer encoded, so that each word is represented by a unique integer.
# This data preparation step can be performed using the Tokenizer API also provided with Keras.
# input_dim: This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10,
#     then the size of the vocabulary would be 11 words.
# output_dim: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors
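##################################################################
## A minimal sketch of the Embedding model the comments above describe (following the linked
## Machine Learning Mastery tutorial); the optimizer and epoch count here are assumptions, not taken from this file.
import numpy as np
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))  # 50-word vocabulary, 8-dim vectors, 4 words per document
model.add(Flatten())                                          # (4, 8) -> (32,) so the Dense layer can consume it
model.add(Dense(1, activation='sigmoid'))                     # binary positive/negative prediction
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded_docs, np.array(labels), epochs=50, verbose=0)
loss, accuracy = model.evaluate(padded_docs, np.array(labels), verbose=0)
print('Accuracy: %f' % (accuracy * 100))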