def ch_to_index(self, text, tok=None):
    """Converts tokenised sentences into per-word character-index sequences
    using a char-level Keras Tokenizer."""
    sequences = []
    if tok is None:
        tokenizer = Tokenizer(lower=False, char_level=True)
        all_of_them = [' '.join(z) for z in text]
        tokenizer.fit_on_texts(all_of_them)
    else:
        tokenizer = tok
    for words in text:
        characters = []
        # the generator yields one character-index sequence per word
        for ch in tokenizer.texts_to_sequences_generator(words):
            characters.append(ch)
        sequences.append(characters)
    return sequences, tokenizer
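# --- Hedged usage sketch (not from the original source) ---
# A standalone illustration of the char-level behaviour the helper above relies on:
# with char_level=True, texts_to_sequences_generator yields one character-index
# sequence per input string. The toy corpus below is made up.
from keras.preprocessing.text import Tokenizer

text = [['hello', 'world'], ['hi', 'there']]
char_tok = Tokenizer(lower=False, char_level=True)
char_tok.fit_on_texts([' '.join(words) for words in text])

for word, char_ids in zip(text[0], char_tok.texts_to_sequences_generator(text[0])):
    print(word, char_ids)  # e.g. 'hello' -> one index per character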
def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
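# --- Note (not part of the original test) ---
# texts_to_sequences_generator is the lazy counterpart of texts_to_sequences:
# it yields one index sequence per text, which matters when the corpus is streamed.
# Minimal sketch:
from keras.preprocessing.text import Tokenizer

texts = ['The cat sat on the mat.', 'The dog sat on the log.']
tok = Tokenizer(num_words=10)
tok.fit_on_texts(texts)

eager = tok.texts_to_sequences(texts)
lazy = [seq for seq in tok.texts_to_sequences_generator(texts)]
assert eager == lazy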
print "Tokenizing sentences..." for i, review in enumerate(reviews_texts): print '{} of {}'.format(i, len(reviews_texts)) sentences.append([x.lower_.encode('ascii',errors='ignore') for x in nlp(review)]) from keras.preprocessing.text import Tokenizer tk = Tokenizer() tk.fit_on_texts((t.encode('ascii',errors='ignore') for t in reviews_texts)) tk.fit_on_texts((' '.join(t) for t in sentences)) seq_data = [_ for _ in tk.texts_to_sequences_generator((t.encode('ascii',errors='ignore') for t in reviews_texts))] seq_data = [_ for _ in tk.texts_to_sequences_generator((' '.join(t) for t in sentences))] cPickle.dump({'funny' : funny_votes, 'useful' : useful_votes, 'stars' : review_stars, 'partition_range' : 'range(1, 20)', 'sequenced_data' : seq_data, 'meta' : 'Yelp data over the partitions 1 thru 19. sequenced_data is an embedding from the Keras Tokenizer'}, open('data-dump-1-19.pkl', 'wb'), cPickle.HIGHEST_PROTOCOL)
# LSTM
lstm_output_size = 70

##################### set up tokenizer #####################

# generator for tokenizer
def generator_review_parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        review_dict = eval(l)
        yield review_dict['reviewText']

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
path = './reviews_Electronics_5.json.gz'
tokenizer.fit_on_texts(generator_review_parse(path))
sequences = tokenizer.texts_to_sequences_generator(generator_review_parse(path))

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

################## generator for training neural network #################
def generator_modelData(path, batch_size=1, token_model=tokenizer):
    g = gzip.open(path, 'r')
    # here's the proportion of the ratings (10k samples of electronics)
    cat_props = [0.05293333, 0.0406, 0.08066667, 0.20826667, 0.61753333]
    count = 0
    for l in g:
        if count == 0:
            reviews, scores, sample_weight = [], [], []
        review_dict = eval(l)
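# --- Hedged sketch (not from the original script) ---
# fit_on_texts only needs an iterable of strings, so a generator over the gzipped
# reviews streams the corpus without holding it in memory. Same idea, self-contained;
# the file name is a placeholder, json.loads replaces eval, and num_words is the
# newer name for nb_words.
import gzip
import json
from keras.preprocessing.text import Tokenizer

def stream_reviews(path):
    # Yield one review text per line of a gzipped JSON-lines file.
    with gzip.open(path, 'rt') as g:
        for line in g:
            yield json.loads(line)['reviewText']

tok = Tokenizer(num_words=20000)
tok.fit_on_texts(stream_reviews('reviews_sample.json.gz'))
# The sequence generator is lazy too; pass a fresh generator for a second pass.
seqs = tok.texts_to_sequences_generator(stream_reviews('reviews_sample.json.gz'))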
class Data(object): def __init__(self): # DataFrames self._df_train = pd.DataFrame() self._df_test = pd.DataFrame() # the actual data self._x_train = np.array([]) self._x_dev = None self._x_test = np.array([]) self._y_train = np.array([]) self._y_dev = None self._train_ids = [] self._dev_ids = None self._test_ids = [] # parameters influencing the creation of the data self._dev_frac = None self._max_len = None self._shuffle = None self._label_smooth_frac = None # Tokenizer and LabelEncoder instances self._tokenizer = Tokenizer() self._label_encoder = LabelEncoder() def prepare_data( self, train_tsv, test_tsv, dev_frac=None, max_len=200, shuffle=True, label_smooth_frac=None, tokenizer=None, ): """Prepares training and test data by converting sentences into sequences of numbers and labels into categorical representations. Args: train_tsv: path to a tsv file containing training data test_tsv: path to a tsv file containing test data dev_frac: fraction of the training data split off and used as the development set max_len: maximal sentence lenght all sentences are padded to shuffle: whether or not to shuffle the training data label_smooth_frac: fraction of probability space distributed from the true example to the others tokenizer: an instance of keras.preprocessing.text.Tokenizer """ self._dev_frac = dev_frac self._max_len = max_len self._shuffle = shuffle self._label_smooth_frac = label_smooth_frac if tokenizer: self._tokenizer = tokenizer self._df_train = self._read_train_tsv(train_tsv, shuffle=self._shuffle) x_train, train_ids = self._prepare_data_x_train(self._df_train) y_train = self._prepare_data_y(self._df_train) self._split_save_train_dev(x_train, y_train, train_ids) self._df_test = self._read_test_tsv(test_tsv) x_test, test_ids = self._prepare_data_x_test(self._df_test) self._save_test(x_test, test_ids) return True def _read_train_tsv(self, train_tsv, shuffle=True): """reads the tsv containing the training data into a DataFrame""" df = pd.read_csv( train_tsv, delimiter="\t", names=["id", "sentence", "label"], converters={"id": lambda x: str(x)}, ) if shuffle: df = df.sample(frac=1) return df def _read_test_tsv(self, test_tsv): """reads the tsv containing the test data into a DataFrame""" df = pd.read_csv(test_tsv, delimiter="\t", names=["id", "sentence"], converters={"id": lambda x: str(x)}) return df def _prepare_data_x_train(self, df): """fits the Tokenizer, converts sentences into sequences of equal lenght and returns them as well as their ids""" ids = [row["id"] for _, row in df.iterrows()] x_sents = [row["sentence"] for _, row in df.iterrows()] self._tokenizer.fit_on_texts(x_sents) x_sents = [ sequence for sequence in self._tokenizer.texts_to_sequences_generator(x_sents) ] return pad_sequences(x_sents, maxlen=self._max_len), ids def _prepare_data_x_test(self, df): """converts sentences into sequences of equal lenght and returns them as well as their ids""" ids = [row["id"] for _, row in df.iterrows()] x_sents = [row["sentence"] for _, row in df.iterrows()] x_sents = [ sequence for sequence in self._tokenizer.texts_to_sequences_generator(x_sents) ] return pad_sequences(x_sents, maxlen=self._max_len), ids def _prepare_data_y(self, df): """fits the LabelEncoder and converts the labels into categorical representations adding label smoothing""" y_labels = [row["label"] for _, row in df.iterrows()] self._label_encoder.fit(y_labels) y_labels = self._label_encoder.transform(y_labels) return self._label_smoothing(to_categorical(y_labels), self._label_smooth_frac) def _label_smoothing(self, 
array, label_smooth_frac): """smoothes the numbers in a categorical numpy array row-wise""" if label_smooth_frac: return array * (1 - label_smooth_frac) + ( 1 - array) * label_smooth_frac / (array.shape[1] - 1) else: return array def _split_save_train_dev(self, x_train, y_train, train_ids): """splits the train data into a train and dev set and saves it""" if self._dev_frac: self._x_train = x_train[int(len(x_train) * self._dev_frac):] self._x_dev = x_train[:int(len(x_train) * self._dev_frac)] self._y_train = y_train[int(len(y_train) * self._dev_frac):] self._y_dev = y_train[:int(len(y_train) * self._dev_frac)] self._train_ids = train_ids[int(len(train_ids) * self._dev_frac):] self._dev_ids = train_ids[:int(len(train_ids) * self._dev_frac)] else: self._x_train = x_train self._y_train = y_train self._train_ids = train_ids return True def _save_test(self, x_test, test_ids): """saves the test set""" self._x_test = x_test self._test_ids = test_ids return True def get_df_train(self): """returns the DataFrame read from the train tsv""" return self._df_train def get_df_test(self): """returns the DataFrame read from the test tsv""" return self._df_test def get_train_data(self): """returns the training and development data""" return ( (self._x_train, self._y_train, self._train_ids), (self._x_dev, self._y_dev, self._dev_ids), ) def get_test_data(self): """returns the test data""" return (self._x_test, self._test_ids) def get_positions_e1(self): """returns an array with the position of every token relative to entity one. 0 is reserved for padding tokens, 1 is reserved for entity one. """ pos_e1_train = [] pos_e1_dev = [] pos_e1_test = [] for sent in self._x_train: sent = list(sent) beg_index = sent.index(4) end_index = sent.index(5) new = [] for i in range(len(sent)): if sent[i] == 0: new.append(sent[i]) elif beg_index <= i <= end_index: new.append(1) elif i < beg_index: new.append(i - beg_index) elif i > end_index: new.append(i - end_index + 1) pos_e1_train.append(new) for sent in self._x_test: sent = list(sent) beg_index = sent.index(4) end_index = sent.index(5) new = [] for i in range(len(sent)): if sent[i] == 0: new.append(sent[i]) elif beg_index <= i <= end_index: new.append(1) elif i < beg_index: new.append(i - beg_index) elif i > end_index: new.append(i - end_index + 1) pos_e1_test.append(new) if not self._dev_frac: return ( pad_sequences(pos_e1_train, maxlen=self._max_len), None, pad_sequences(pos_e1_test, maxlen=self._max_len), ) for sent in self._x_dev: sent = list(sent) beg_index = sent.index(4) end_index = sent.index(5) new = [] for i in range(len(sent)): if sent[i] == 0: new.append(sent[i]) elif beg_index <= i <= end_index: new.append(1) elif i < beg_index: new.append(i - beg_index) elif i > end_index: new.append(i - end_index + 1) pos_e1_dev.append(new) return ( pad_sequences(pos_e1_train, maxlen=self._max_len), pad_sequences(pos_e1_dev, maxlen=self._max_len), pad_sequences(pos_e1_test, maxlen=self._max_len), ) def get_positions_e2(self): """returns an array with the position of every token relative to entity two. 0 is reserved for padding tokens, 1 is reserved for entity two. 
""" pos_e2_train = [] pos_e2_dev = [] pos_e2_test = [] for sent in self._x_train: sent = list(sent) beg_index = sent.index(6) end_index = sent.index(7) new = [] for i in range(len(sent)): if sent[i] == 0: new.append(sent[i]) elif beg_index <= i <= end_index: new.append(1) elif i < beg_index: new.append(i - beg_index) elif i > end_index: new.append(i - end_index + 1) pos_e2_train.append(new) for sent in self._x_test: sent = list(sent) beg_index = sent.index(6) end_index = sent.index(7) new = [] for i in range(len(sent)): if sent[i] == 0: new.append(sent[i]) elif beg_index <= i <= end_index: new.append(1) elif i < beg_index: new.append(i - beg_index) elif i > end_index: new.append(i - end_index + 1) pos_e2_test.append(new) if not self._dev_frac: return ( pad_sequences(pos_e2_train, maxlen=self._max_len), None, pad_sequences(pos_e2_test, maxlen=self._max_len), ) for sent in self._x_dev: sent = list(sent) beg_index = sent.index(6) end_index = sent.index(7) new = [] for i in range(len(sent)): if sent[i] == 0: new.append(sent[i]) elif beg_index <= i <= end_index: new.append(1) elif i < beg_index: new.append(i - beg_index) elif i > end_index: new.append(i - end_index + 1) pos_e2_dev.append(new) return ( pad_sequences(pos_e2_train, maxlen=self._max_len), pad_sequences(pos_e2_dev, maxlen=self._max_len), pad_sequences(pos_e2_test, maxlen=self._max_len), ) def get_entities_and_context(self): """returns an array with only the entities and the context in between them. entity markers were removed. """ context_train = [] context_dev = [] context_test = [] for sent in self._x_train: sent = list(sent) beg_index = sent.index(4) end_index = sent.index(7) new = [] for i in range(len(sent)): if beg_index < i < end_index and sent[i] not in [5, 6]: new.append(sent[i]) context_train.append(new) for sent in self._x_test: sent = list(sent) beg_index = sent.index(4) end_index = sent.index(7) new = [] for i in range(len(sent)): if beg_index < i < end_index and sent[i] not in [5, 6]: new.append(sent[i]) context_test.append(new) if not self._dev_frac: return ( pad_sequences(context_train, maxlen=self._max_len), None, pad_sequences(context_test, maxlen=self._max_len), ) for sent in self._x_dev: sent = list(sent) beg_index = sent.index(4) end_index = sent.index(7) new = [] for i in range(len(sent)): if beg_index < i < end_index and sent[i] not in [5, 6]: new.append(sent[i]) context_dev.append(new) return ( pad_sequences(context_train, maxlen=self._max_len), pad_sequences(context_dev, maxlen=self._max_len), pad_sequences(context_test, maxlen=self._max_len), ) def get_dev_frac(self): """returns the fraction of the training data split off for the development data""" return self._dev_frac def get_max_len(self): """returns the maximal sequence length""" return self._max_len def get_shuffle(self): """returns if the training data have been shuffled""" return self._shuffle def get_label_smooth_frac(self): """returns the fraction of the probability space distributed from the true example to the others""" return self._label_smooth_frac def get_tokenizer(self): """returns the Tokenizer""" return self._tokenizer def get_label_encoder(self): """returns the LabelEncoder""" return self._label_encoder
class Preprocessor: def __init__(self, max_features, max_sent_len, embedding_dims=200, wvs=None, max_doc_len=500, stopword=True): ''' max_features: the upper bound to be placed on the vocabulary size. max_sent_len: the maximum length (in terms of tokens) of the instances/texts. embedding_dims: size of the token embeddings; over-ridden if pre-trained vectors is provided (if wvs is not None). ''' self.max_features = max_features self.tokenizer = Tokenizer(num_words=self.max_features)#num_words=self.max_features) self.max_sent_len = max_sent_len # the max sentence length! @TODO rename; this is confusing. self.max_doc_len = max_doc_len # w.r.t. number of sentences! self.use_pretrained_embeddings = False self.init_vectors = None if wvs is None: self.embedding_dims = embedding_dims else: # note that these are only for initialization; # they will be tuned! self.use_pretrained_embeddings = True # for new gensim format self.embedding_dims = wvs.syn0.shape[1] #wvs.vector_size self.word_embeddings = wvs self.stopword = stopword # lifted directly from spacy's EN list #self.stopwords = [u'all', u'six', u'just', u'less', u'being', u'indeed', u'over', u'move', u'anyway', u'four', u'not', u'own', u'through', u'using', u'fify', u'where', u'mill', u'only', u'find', u'before', u'one', u'whose', u'system', u'how', u'somewhere', u'much', u'thick', u'show', u'had', u'enough', u'should', u'to', u'must', u'whom', u'seeming', u'yourselves', u'under', u'ours', u'two', u'has', u'might', u'thereafter', u'latterly', u'do', u'them', u'his', u'around', u'than', u'get', u'very', u'de', u'none', u'cannot', u'every', u'un', u'they', u'front', u'during', u'thus', u'now', u'him', u'nor', u'name', u'regarding', u'several', u'hereafter', u'did', u'always', u'who', u'didn', u'whither', u'this', u'someone', u'either', u'each', u'become', u'thereupon', u'sometime', u'side', u'towards', u'therein', u'twelve', u'because', u'often', u'ten', u'our', u'doing', u'km', u'eg', u'some', u'back', u'used', u'up', u'go', u'namely', u'computer', u'are', u'further', u'beyond', u'ourselves', u'yet', u'out', u'even', u'will', u'what', u'still', u'for', u'bottom', u'mine', u'since', u'please', u'forty', u'per', u'its', u'everything', u'behind', u'does', u'various', u'above', u'between', u'it', u'neither', u'seemed', u'ever', u'across', u'she', u'somehow', u'be', u'we', u'full', u'never', u'sixty', u'however', u'here', u'otherwise', u'were', u'whereupon', u'nowhere', u'although', u'found', u'alone', u're', u'along', u'quite', u'fifteen', u'by', u'both', u'about', u'last', u'would', u'anything', u'via', u'many', u'could', u'thence', u'put', u'against', u'keep', u'etc', u'amount', u'became', u'ltd', u'hence', u'onto', u'or', u'con', u'among', u'already', u'co', u'afterwards', u'formerly', u'within', u'seems', u'into', u'others', u'while', u'whatever', u'except', u'down', u'hers', u'everyone', u'done', u'least', u'another', u'whoever', u'moreover', u'couldnt', u'throughout', u'anyhow', u'yourself', u'three', u'from', u'her', u'few', u'together', u'top', u'there', u'due', u'been', u'next', u'anyone', u'eleven', u'cry', u'call', u'therefore', u'interest', u'then', u'thru', u'themselves', u'hundred', u'really', u'sincere', u'empty', u'more', u'himself', u'elsewhere', u'mostly', u'on', u'fire', u'am', u'becoming', u'hereby', u'amongst', u'else', u'part', u'everywhere', u'too', u'kg', u'herself', u'former', u'those', u'he', u'me', u'myself', u'made', u'twenty', u'these', u'was', u'bill', u'cant', u'us', u'until', u'besides', u'nevertheless', u'below', 
u'anywhere', u'nine', u'can', u'whether', u'of', u'your', u'toward', u'my', u'say', u'something', u'and', u'whereafter', u'whenever', u'give', u'almost', u'wherever', u'is', u'describe', u'beforehand', u'herein', u'doesn', u'an', u'as', u'itself', u'at', u'have', u'in', u'seem', u'whence', u'ie', u'any', u'fill', u'again', u'hasnt', u'inc', u'thereby', u'thin', u'no', u'perhaps', u'latter', u'meanwhile', u'when', u'detail', u'same', u'wherein', u'beside', u'also', u'that', u'other', u'take', u'which', u'becomes', u'you', u'if', u'nobody', u'unless', u'whereas', u'see', u'though', u'may', u'after', u'upon', u'most', u'hereupon', u'eight', u'but', u'serious', u'nothing', u'such', u'why', u'off', u'a', u'don', u'whereby', u'third', u'i', u'whole', u'noone', u'sometimes', u'well', u'amoungst', u'yours', u'their', u'rather', u'without', u'so', u'five', u'the', u'first', u'with', u'make', u'once'] self.stopwords = ["a", "about", "again", "all", "almost", "also", "although", "always", "among", "an", "and", "another", "any", "are", "as", "at", "b", "be", "because", "been", "before", "being", "between", "both", "but", "by", "c", "can", "could", "did", "do", "d", "does", "each", "either", "enough", "etc", "f", "for", "from", "had", "has", "have", "here", "how", "h", "i", "if", "in", "into", "is", "it", "its", "j", "just", "k", "made", "make", "may", "must", "n", "o", "of", "often", "on", "p", "q", "r", "s", "so", "that", "the", "them", "then", "their", "those", "thus", "to", "t", "u", "use", "used", "v", "w", "x", "y", "z", "we", "was"] def remove_stopwords(self, texts): stopworded_texts = [] for text in texts: # note the naive segmentation; although this is same as the # keras module does. #stopworded_text = " ".join([t for t in text.split(" ") if not t.lower() in self.stopwords]) stopworded_text = [] for t in text.split(" "): if not t in self.stopwords: if t.isdigit(): t = "numbernumbernumber" stopworded_text.append(t) #stopworded_text = " ".join([t for t in text.split(" ") if not t in self.stopwords]) stopworded_text = " ".join(stopworded_text) stopworded_texts.append(stopworded_text) return stopworded_texts def preprocess(self, all_docs): ''' This fits tokenizer and builds up input vectors (X) from the list of texts in all_texts. Needs to be called before train! ''' self.raw_texts = all_docs if self.stopword: #for text in self.raw_texts: self.processed_texts = self.remove_stopwords(self.raw_texts) else: self.processed_texts = self.raw_texts self.fit_tokenizer() if self.use_pretrained_embeddings: self.init_word_vectors() def fit_tokenizer(self): ''' Fits tokenizer to all raw texts; remembers indices->words mappings. ''' self.tokenizer.fit_on_texts(self.processed_texts) self.word_indices_to_words = {} for token, idx in self.tokenizer.word_index.items(): self.word_indices_to_words[idx] = token def decode(self, x): ''' For convenience; map from word index vector to words''' words = [] for t_idx in x: if t_idx == 0: words.append("pad") else: words.append(self.word_indices_to_words[t_idx]) return " ".join(words) def build_sequences(self, texts, pad_documents=False): processed_texts = texts if self.stopword: processed_texts = self.remove_stopwords(texts) X = list(self.tokenizer.texts_to_sequences_generator(processed_texts)) # need to pad the number of sentences, too. X = np.array(pad_sequences(X, maxlen=self.max_sent_len)) return X def init_word_vectors(self): ''' Initialize word vectors. 
''' self.init_vectors = [] unknown_words_to_vecs = {} for t, token_idx in self.tokenizer.word_index.items(): if token_idx <= self.max_features: try: self.init_vectors.append(self.word_embeddings[t]) except: if t not in unknown_words_to_vecs: # randomly initialize unknown_words_to_vecs[t] = np.random.random( self.embedding_dims)*-2 + 1 self.init_vectors.append(unknown_words_to_vecs[t]) # init padding token! self.init_vectors.append(np.zeros(self.embedding_dims)) # note that we make this a singleton list because that's # what Keras wants. self.init_vectors = [np.vstack(self.init_vectors)]
class Preprocessor:
    # @TODO setting max_CUI_size to something small for now!
    def __init__(self, max_vocab_size=10000, max_CUI_size=5000, max_len=40,
                 max_CUI_len=100, wv_embedding_dims=200, CUI_embedding_dims=200,
                 wvs=None, CUI_vs=None):
        '''
        max_vocab_size: maximum number of words to include in the model
        max_CUI_size: maximum number of CUIs to include in the model
        max_len: the maximum length (in terms of tokens) of the text snippets.
        max_CUI_len: the maximum number of ancestral CUIs to be used for each instance.
        wv_embedding_dims: size of the token embeddings; over-ridden if pre-trained
                           vectors are provided (if wvs is not None).
        CUI_embedding_dims: size of the CUI embeddings; over-ridden if pre-trained
                            vectors are provided.
        wvs: pre-trained embeddings (for embeddings initialization)
        '''
        # inputs
        self.max_vocab_size = max_vocab_size
        self.max_CUI_size = max_CUI_size
        self.max_len = max_len
        self.max_CUI_len = max_CUI_len

        self.tokenizer = Tokenizer(nb_words=self.max_vocab_size)
        # overkill to use a tokenizer, but we'll do it anyway
        self.CUI_tokenizer = Tokenizer(nb_words=self.max_CUI_size)

        self.use_pretrained_embeddings = False
        self.init_vectors = None
        if wvs is None:
            self.wv_embedding_dims = wv_embedding_dims
            self.CUI_embedding_dims = CUI_embedding_dims
        else:
            # note that these are only for initialization;
            # they will be tuned!
            self.use_pretrained_embeddings = True
            self.word_embeddings = wvs
            self.wv_embedding_dims = wvs.vector_size
            self.CUI_embeddings = CUI_vs
            self.CUI_embedding_dims = CUI_vs.vector_size

    def preprocess(self, all_texts, all_CUIs):
        '''
        This fits tokenizer and builds up input vectors (X) from the list
        of texts in all_texts. Needs to be called before train!
        '''
        self.raw_texts = all_texts
        self.CUIs = all_CUIs
        self.fit_tokenizer()
        self.fit_CUI_tokenizer()
        if self.use_pretrained_embeddings:
            print "initializing word vectors.."
            self.init_word_vectors()
            print "done. initializing CUI vectors..."
            self.init_CUI_vectors()
            print "done."

    def fit_tokenizer(self):
        ''' Fits tokenizer to all raw texts; remembers indices->words mappings. '''
        self.tokenizer.fit_on_texts(self.raw_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token

    def fit_CUI_tokenizer(self):
        ''' Fits tokenizer to all CUI strings; remembers indices->CUIs mappings. '''
        self.CUI_tokenizer.fit_on_texts(self.CUIs)
        self.CUI_indices_to_CUIs = {}
        for CUI, idx in self.CUI_tokenizer.word_index.items():
            self.CUI_indices_to_CUIs[idx] = CUI

    def build_text_sequences(self, texts):
        X = list(self.tokenizer.texts_to_sequences_generator(texts))
        X = np.array(pad_sequences(X, maxlen=self.max_len))
        return X

    def build_CUI_sequences(self, CUIs):
        X_CUIs = list(self.CUI_tokenizer.texts_to_sequences_generator(CUIs))
        X_CUIs = np.array(pad_sequences(X_CUIs, maxlen=self.max_CUI_len))
        return X_CUIs

    def init_word_vectors(self):
        ''' Initialize word vectors. '''
        self.init_word_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.max_vocab_size:
                try:
                    self.init_word_vectors.append(self.word_embeddings[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                            self.wv_embedding_dims) * -2 + 1
                    self.init_word_vectors.append(unknown_words_to_vecs[t])
        # note that we make this a singleton list because that's
        # what Keras wants.
        self.init_word_vectors = [np.vstack(self.init_word_vectors)]

    def init_CUI_vectors(self):
        ''' initialize CUI vectors '''
        self.init_CUI_vectors = []
        unknown_CUIs_to_vecs = {}
        for CUI, CUI_idx in self.CUI_tokenizer.word_index.items():
            if CUI_idx <= self.max_CUI_size:
                try:
                    self.init_CUI_vectors.append(self.CUI_embeddings[CUI])
                except:
                    if CUI not in unknown_CUIs_to_vecs:
                        unknown_CUIs_to_vecs[CUI] = np.random.random(
                            self.CUI_embedding_dims) * -2 + 1
                    self.init_CUI_vectors.append(unknown_CUIs_to_vecs[CUI])
        self.init_CUI_vectors = [np.vstack(self.init_CUI_vectors)]
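# --- Note (not part of the original class) ---
# init_word_vectors() and init_CUI_vectors() store their results in attributes named
# exactly like the methods (self.init_word_vectors, self.init_CUI_vectors). After the
# first call the list shadows the bound method on that instance, so calling
# preprocess() a second time on the same object (with pre-trained vectors enabled)
# raises "TypeError: 'list' object is not callable". Renaming either the methods or
# the attributes avoids this.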
class ISummarizer:
    # 100000
    def __init__(self, pairs, nb_words=10000, hidden_size=512,
                 max_input_size=3000, max_output_size=15):
        self.pairs = pairs
        self.nb_words = nb_words + 2  # number of words; +2 for start and stop tokens!
        self.max_input_size = max_input_size
        self.max_output_size = max_output_size + 2  # again +2 for start/stop
        self.hidden_size = hidden_size

        print("loading pre-trained word vectors...")
        self.wv = load_trained_w2v_model()  # here you want to add start and stop
        print("OK!")
        self.word_embedding_size = self.wv.vector_size

        # call to sequences
        # call init_word_vectors
        print("building sequences...")
        self.build_sequences()
        print("initializing word vectors...")
        self.init_word_vectors()
        print("ok!")

    def build_sequences(self):
        self.tokenizer = Tokenizer(nb_words=self.nb_words)
        self.raw_input_texts = [START_STR + " " + " ".join(pair[0]) + " " + STOP_STR
                                for pair in self.pairs]
        self.raw_output_texts = [START_STR + " " + " ".join(pair[1]) + " " + STOP_STR
                                 for pair in self.pairs]

        def _get_max(seqs):
            return max([len(seq) for seq in seqs])

        self.tokenizer.fit_on_texts(self.raw_input_texts + self.raw_output_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token

        self.input_sequences = list(
            self.tokenizer.texts_to_sequences_generator(self.raw_input_texts))
        #self.max_input_len = _get_max(self.input_sequences)
        #X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
        #X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
        self.input_sequences = list(
            pad_sequences(self.input_sequences, maxlen=self.max_input_size))

        self.output_sequences = list(
            self.tokenizer.texts_to_sequences_generator(self.raw_output_texts))
        self.output_sequences = list(
            pad_sequences(self.output_sequences, maxlen=self.max_output_size))

    def init_word_vectors(self):
        self.init_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.nb_words:
                try:
                    self.init_vectors.append(self.wv[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                            self.word_embedding_size) * -2 + 1
                    self.init_vectors.append(unknown_words_to_vecs[t])
        self.init_vectors = np.vstack(self.init_vectors)

    def build_model(self):
        self.model = Sequential()
        self.model.add(Embedding(self.nb_words, self.word_embedding_size,
                                 weights=[self.init_vectors]))
        ###
        # run embeddings through a Gated Recurrent Unit
        self.model.add(GRU(self.hidden_size))
        #self.model.add(Dropout(0.1))
        self.model.add(Dense(self.hidden_size))
        self.model.add(Activation('relu'))
        self.model.add(RepeatVector(self.max_output_size))
        self.model.add(GRU(self.hidden_size, return_sequences=True))
        self.model.add(Dropout(0.1))
        self.model.add(TimeDistributedDense(self.nb_words, activation="softmax"))
        # does cross entropy make sense here?
        self.model.compile(loss="categorical_crossentropy", optimizer='adam')
        return self.model

    def X_y(self):
        self.X = np.array(self.input_sequences)
        # np.zeros((n, self.max_input_size, self.nb_words), dtype=np.bool)
        self.Y = np.zeros((len(self.output_sequences), self.max_output_size, self.nb_words),
                          dtype=np.bool)
        for i in range(self.X.shape[0]):
            #for j, token_idx in enumerate(self.input_sequences[i]):
            #    self.X[i, j, token_idx] = 1
            for j, token_idx in enumerate(self.output_sequences[i]):
                self.Y[i, j, token_idx] = 1
        print "X shape: %s; Y shape: %s" % (self.X.shape, self.Y.shape)

    def decode(self, pred):
        text = []
        for token_preds in pred:
            ### it keeps predicting zeros! zeros are for the padding...
            cur_pred_index = np.argmax(token_preds)  #+ 1 # the tokenizer seems to do 1-indexing!
            if cur_pred_index == 0:
                text.append("<pad>")
            else:
                text.append(self.word_indices_to_words[cur_pred_index])
        return text

    def train(self):
        # @TODO revisit; batchsize, etc
        print "fitting model..."
        self.model.fit(self.X, self.Y)
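# --- Hedged usage sketch (not from the original source) ---
# How ISummarizer appears intended to be driven; `pairs` would be a list of
# (document_tokens, summary_tokens) tuples supplied by the surrounding module.
summarizer = ISummarizer(pairs, nb_words=10000, max_input_size=300, max_output_size=15)
summarizer.X_y()           # build padded input matrix X and one-hot target tensor Y
summarizer.build_model()   # encoder GRU -> RepeatVector -> decoder GRU -> softmax
summarizer.train()
preds = summarizer.model.predict(summarizer.X[:1])
print(summarizer.decode(preds[0]))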
class Preprocessor:
    def __init__(self, max_features, maxlen, embedding_dims=200, wvs=None):
        '''
        max_features: the upper bound to be placed on the vocabulary size.
        maxlen: the maximum length (in terms of tokens) of the instances/texts.
        embedding_dims: size of the token embeddings; over-ridden if pre-trained
                        vectors are provided (if wvs is not None).
        '''
        self.max_features = max_features
        self.tokenizer = Tokenizer(nb_words=self.max_features)
        self.maxlen = maxlen

        self.use_pretrained_embeddings = False
        self.init_vectors = None
        if wvs is None:
            self.embedding_dims = embedding_dims
        else:
            # note that these are only for initialization;
            # they will be tuned!
            self.use_pretrained_embeddings = True
            self.embedding_dims = wvs.vector_size
            self.word_embeddings = wvs

    def preprocess(self, all_texts):
        '''
        This fits tokenizer and builds up input vectors (X) from the list
        of texts in all_texts. Needs to be called before train!
        '''
        self.raw_texts = all_texts
        #self.build_sequences()
        self.fit_tokenizer()
        if self.use_pretrained_embeddings:
            self.init_word_vectors()

    def fit_tokenizer(self):
        ''' Fits tokenizer to all raw texts; remembers indices->words mappings. '''
        self.tokenizer.fit_on_texts(self.raw_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token

    def build_sequences(self, texts):
        X = list(self.tokenizer.texts_to_sequences_generator(texts))
        X = np.array(pad_sequences(X, maxlen=self.maxlen))
        return X

    def init_word_vectors(self):
        ''' Initialize word vectors. '''
        self.init_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.max_features:
                try:
                    self.init_vectors.append(self.word_embeddings[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                            self.embedding_dims) * -2 + 1
                    self.init_vectors.append(unknown_words_to_vecs[t])
        # note that we make this a singleton list because that's
        # what Keras wants.
        self.init_vectors = [np.vstack(self.init_vectors)]
class Preprocessor:
    def __init__(self, max_features, max_sent_len, embedding_dims=200,
                 wvs=None, max_doc_len=500):
        '''
        max_features: the upper bound to be placed on the vocabulary size.
        max_sent_len: the maximum length (in terms of tokens) of the instances/texts.
        embedding_dims: size of the token embeddings; over-ridden if pre-trained
                        vectors are provided (if wvs is not None).
        '''
        self.max_features = max_features
        self.tokenizer = Tokenizer(nb_words=self.max_features)
        self.max_sent_len = max_sent_len  # the max sentence length! @TODO rename; this is confusing.
        self.max_doc_len = max_doc_len    # w.r.t. number of sentences!

        self.use_pretrained_embeddings = False
        self.init_vectors = None
        if wvs is None:
            self.embedding_dims = embedding_dims
        else:
            # note that these are only for initialization;
            # they will be tuned!
            self.use_pretrained_embeddings = True
            self.embedding_dims = wvs.vector_size
            self.word_embeddings = wvs

    def preprocess(self, all_docs):
        '''
        This fits tokenizer and builds up input vectors (X) from the list
        of texts in all_texts. Needs to be called before train!
        '''
        self.raw_texts = all_docs
        #self.build_sequences()
        self.fit_tokenizer()
        if self.use_pretrained_embeddings:
            self.init_word_vectors()

    def fit_tokenizer(self):
        ''' Fits tokenizer to all raw texts; remembers indices->words mappings. '''
        self.tokenizer.fit_on_texts(self.raw_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token

    def build_sequences(self, texts):
        X = list(self.tokenizer.texts_to_sequences_generator(texts))
        X = np.array(pad_sequences(X, maxlen=self.max_sent_len))
        return X

    def init_word_vectors(self):
        ''' Initialize word vectors. '''
        self.init_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.max_features:
                try:
                    self.init_vectors.append(self.word_embeddings[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                            self.embedding_dims) * -2 + 1
                    self.init_vectors.append(unknown_words_to_vecs[t])
        # note that we make this a singleton list because that's
        # what Keras wants.
        self.init_vectors = [np.vstack(self.init_vectors)]
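# --- Hedged usage sketch (not from the original source) ---
# `wvs` would be a gensim KeyedVectors instance and `train_texts` a list of strings;
# both are placeholders here.
p = Preprocessor(max_features=10000, max_sent_len=50, wvs=wvs)
p.preprocess(train_texts)               # fits the tokenizer and builds init_vectors
X_train = p.build_sequences(train_texts)

from keras.models import Sequential
from keras.layers import Embedding

# `weights` expects a list containing one (input_dim x output_dim) matrix, which is
# why init_word_vectors wraps the stacked vectors in a singleton list.
vocab_rows = p.init_vectors[0].shape[0]
model = Sequential()
model.add(Embedding(vocab_rows, p.embedding_dims,
                    input_length=p.max_sent_len, weights=p.init_vectors))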
OUTPUTPATH = "" # path folder for output data (logdata.npy, loglabel.npy files will be created there) # Create word embeddings # read preprocessed log events eventFile = open(EVENT_TEMPLATE, 'r') Lines = eventFile.readlines() eventList = [] for line in Lines: eventList.append(line.strip()) # create vocab, converts words to token tokenizer = Tokenizer(num_words=1000, lower=True) tokenizer.fit_on_texts(eventList) sequences = tokenizer.texts_to_sequences(eventList) tokenizer.texts_to_sequences_generator(sequences) # read pretrained glove word embeddings wordEmbeddings = dict() gloveFile = open(EMBEDDING, encoding="utf8") for line in gloveFile: values = line.split() word = values[0] coefs = np.asarray(values[1:], dtype='float32') wordEmbeddings[word] = coefs # create word - embedding dict tokenEmbeddings = dict() for logWord in tokenizer.word_index:
tgt_val = [' '.join([start_token, unidecode(text), end_token]) for text in tgt_val]
src_test = [' '.join([start_token, unidecode(text), end_token]) for text in src_test]
tgt_test = [' '.join([start_token, unidecode(text), end_token]) for text in tgt_test]

print("tokenizing...")
source_tokenizer = Tokenizer(num_words=max_vocab_size, lower=True, char_level=False)
source_tokenizer.fit_on_texts(src_texts)
target_tokenizer = Tokenizer(num_words=max_vocab_size, lower=True, char_level=False)
target_tokenizer.fit_on_texts(tgt_texts)

source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
print("source vocab size: ", source_vocab_size)
print("target vocab size: ", target_vocab_size)

max_input_length = max(len(seq) for seq in
                       source_tokenizer.texts_to_sequences_generator(src_texts))
# NOTE: the target lengths are computed with source_tokenizer here;
# target_tokenizer is probably what was intended.
max_output_length = max(len(seq) for seq in
                        source_tokenizer.texts_to_sequences_generator(tgt_texts))

target_reverse_word_index = {v: k for k, v in target_tokenizer.word_index.items()}

print("max input length: ", max_input_length)
print("max_output_length: ", max_output_length)

seq2seq_params = {
    'max_input_length': max_input_length,
    'max_output_length': max_output_length,
    'source_vocab_size': source_vocab_size,
    'target_vocab_size': target_vocab_size,
    'embedding_dim': embedding_dim,
    'hidden_dim': hidden_dim
}
# just binary classification,
# so we want the output to be in [0,1],
# and we can use binary crossentropy as our loss
model.add(Activation('sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

###

n_epochs = 60

# used to sample words (indices)
sampling_table = make_sampling_table(vocab_size)

for i in range(n_epochs):
    loss = 0
    for seq in tokenizer.texts_to_sequences_generator(text_generator()):
        # generate skip-gram training examples
        # - `couples` consists of the pivots (i.e. target words) and surrounding contexts
        # - `labels` represent if the context is true or not
        # - `window_size` determines how far to look between words
        # - `negative_samples` specifies the ratio of negative couples
        #   (i.e. couples where the context is false)
        #   to generate with respect to the positive couples;
        #   i.e. `negative_samples=4` means "generate 4 times as many negative samples"
        couples, labels = skipgrams(seq, vocab_size,
                                    window_size=5,
                                    negative_samples=4,
                                    sampling_table=sampling_table)
        if couples:
            pivot, context = zip(*couples)
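# --- Hedged continuation sketch (the loop above is cut off mid-body) ---
# Assuming the model takes the pivot and context index arrays as its two inputs
# (as in the classic Keras skip-gram setup); this picks up inside the
# `if couples:` branch above, hence the indentation.
            pivot = np.array(pivot, dtype='int32')
            context = np.array(context, dtype='int32')
            labels_arr = np.array(labels, dtype='int32')
            loss += model.train_on_batch([pivot, context], labels_arr)
    print('epoch %d, loss %.4f' % (i, loss))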
print "Tokenizing sentences..." for i, review in enumerate(reviews_texts): print '{} of {}'.format(i, len(reviews_texts)) sentences.append( [x.lower_.encode('ascii', errors='ignore') for x in nlp(review)]) from keras.preprocessing.text import Tokenizer tk = Tokenizer() tk.fit_on_texts((t.encode('ascii', errors='ignore') for t in reviews_texts)) tk.fit_on_texts((' '.join(t) for t in sentences)) seq_data = [ _ for _ in tk.texts_to_sequences_generator(( t.encode('ascii', errors='ignore') for t in reviews_texts)) ] seq_data = [ _ for _ in tk.texts_to_sequences_generator((' '.join(t) for t in sentences)) ] cPickle.dump( { 'funny': funny_votes, 'useful': useful_votes, 'stars': review_stars,