Example 1
 def ch_to_index(self, text, tok=None):
     sequences = []
     if tok is None:
         tokenizer = Tokenizer(lower=False, char_level=True)
         all_of_them = [' '.join(z) for z in text]
         tokenizer.fit_on_texts(all_of_them)
     else:
         tokenizer = tok
     for words in text:
         characters = []
         for ch in tokenizer.texts_to_sequences_generator(words):
             characters.append(ch)
         sequences.append(characters)
     return sequences, tokenizer
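A minimal sketch of the same pattern outside of a class, assuming a character-level Keras Tokenizer as above; the sample data and names are placeholders, not part of the original example:

from keras.preprocessing.text import Tokenizer

texts = [["hello", "world"], ["foo"]]    # a list of token lists, like `text` above
tok = Tokenizer(lower=False, char_level=True)
tok.fit_on_texts([' '.join(words) for words in texts])

# one sequence of character indices per word
char_seqs = [list(tok.texts_to_sequences_generator(words)) for words in texts]
print(char_seqs[0][0])  # character indices of "hello"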
Example 2
import numpy as np
from keras.preprocessing.text import Tokenizer


def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
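A hypothetical extra assertion for the loop above: every texts_to_matrix mode returns a document-term matrix of shape (number of texts, num_words), so here each matrix would be 3 x 10.

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
        assert matrix.shape == (3, 10)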
Example 3
print "Tokenizing sentences..."
for i, review in enumerate(reviews_texts):
    print '{} of {}'.format(i, len(reviews_texts))
    sentences.append([x.lower_.encode('ascii',errors='ignore') for x in nlp(review)])


from keras.preprocessing.text import Tokenizer

tk = Tokenizer()

tk.fit_on_texts((t.encode('ascii',errors='ignore') for t in reviews_texts))

tk.fit_on_texts((' '.join(t) for t in sentences))


seq_data = [_ for _ in tk.texts_to_sequences_generator((t.encode('ascii',errors='ignore') for t in reviews_texts))]


seq_data = [_ for _ in tk.texts_to_sequences_generator((' '.join(t) for t in sentences))]




cPickle.dump({'funny' : funny_votes, 
'useful' : useful_votes, 
'stars' : review_stars, 
'partition_range' : 'range(1, 20)', 
'sequenced_data' : seq_data, 
'meta' : 'Yelp data over the partitions 1 thru 19. sequenced_data is an embedding from the Keras Tokenizer'}, 
open('data-dump-1-19.pkl', 'wb'), cPickle.HIGHEST_PROTOCOL)
Example 4
# LSTM
lstm_output_size = 70

##################### set up tokenizer ###################
#generator for tokenizer
def generator_review_parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        review_dict = eval(l)
        yield review_dict['reviewText']

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)

path = './reviews_Electronics_5.json.gz'
tokenizer.fit_on_texts(generator_review_parse(path))
sequences = tokenizer.texts_to_sequences_generator(generator_review_parse(path))
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

################## generator for training neural network #################
def generator_modelData(path,batch_size=1,token_model=tokenizer):
    g = gzip.open(path, 'r')

    #here's the proportion of the ratings (10k samples of electronics)
    cat_props = [0.05293333,0.0406,0.08066667,0.20826667,0.61753333]

    count = 0
    for l in g:
        if count == 0: reviews, scores, sample_weight = [], [], []

        review_dict = eval(l)
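The generator above is cut off here. A memory-friendly way to pair the fitted tokenizer with texts_to_sequences_generator is sketched below; the batch size and maxlen are assumptions, not values taken from the original:

from keras.preprocessing.sequence import pad_sequences

batch = []
for seq in tokenizer.texts_to_sequences_generator(generator_review_parse(path)):
    batch.append(seq)
    if len(batch) == 128:                            # assumed batch size
        padded = pad_sequences(batch, maxlen=1000)   # assumed maxlen
        # ... feed `padded` (plus labels) to the model here ...
        batch = []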
Example 5
class Data(object):
    def __init__(self):
        # DataFrames
        self._df_train = pd.DataFrame()
        self._df_test = pd.DataFrame()

        # the actual data
        self._x_train = np.array([])
        self._x_dev = None
        self._x_test = np.array([])
        self._y_train = np.array([])
        self._y_dev = None
        self._train_ids = []
        self._dev_ids = None
        self._test_ids = []

        # parameters influencing the creation of the data
        self._dev_frac = None
        self._max_len = None
        self._shuffle = None
        self._label_smooth_frac = None

        # Tokenizer and LabelEncoder instances
        self._tokenizer = Tokenizer()
        self._label_encoder = LabelEncoder()

    def prepare_data(
        self,
        train_tsv,
        test_tsv,
        dev_frac=None,
        max_len=200,
        shuffle=True,
        label_smooth_frac=None,
        tokenizer=None,
    ):
        """Prepares training and test data by converting sentences into
        sequences of numbers and labels into categorical representations.

        Args:
            train_tsv:          path to a tsv file containing training
                                    data
            test_tsv:           path to a tsv file containing test data
            dev_frac:           fraction of the training data split off
                                    and used as the development set
            max_len:            maximal sentence length all sentences
                                    are padded to
            shuffle:            whether or not to shuffle the training
                                    data
            label_smooth_frac:  fraction of probability space distributed
                                    from the true example to the others
            tokenizer:          an instance of
                                    keras.preprocessing.text.Tokenizer
        """
        self._dev_frac = dev_frac
        self._max_len = max_len
        self._shuffle = shuffle
        self._label_smooth_frac = label_smooth_frac
        if tokenizer:
            self._tokenizer = tokenizer

        self._df_train = self._read_train_tsv(train_tsv, shuffle=self._shuffle)
        x_train, train_ids = self._prepare_data_x_train(self._df_train)
        y_train = self._prepare_data_y(self._df_train)
        self._split_save_train_dev(x_train, y_train, train_ids)

        self._df_test = self._read_test_tsv(test_tsv)
        x_test, test_ids = self._prepare_data_x_test(self._df_test)
        self._save_test(x_test, test_ids)

        return True

    def _read_train_tsv(self, train_tsv, shuffle=True):
        """reads the tsv containing the training data into a DataFrame"""
        df = pd.read_csv(
            train_tsv,
            delimiter="\t",
            names=["id", "sentence", "label"],
            converters={"id": lambda x: str(x)},
        )
        if shuffle:
            df = df.sample(frac=1)
        return df

    def _read_test_tsv(self, test_tsv):
        """reads the tsv containing the test data into a DataFrame"""
        df = pd.read_csv(test_tsv,
                         delimiter="\t",
                         names=["id", "sentence"],
                         converters={"id": lambda x: str(x)})
        return df

    def _prepare_data_x_train(self, df):
        """fits the Tokenizer, converts sentences into sequences of
        equal length and returns them as well as their ids"""
        ids = [row["id"] for _, row in df.iterrows()]
        x_sents = [row["sentence"] for _, row in df.iterrows()]
        self._tokenizer.fit_on_texts(x_sents)
        x_sents = [
            sequence for sequence in
            self._tokenizer.texts_to_sequences_generator(x_sents)
        ]
        return pad_sequences(x_sents, maxlen=self._max_len), ids

    def _prepare_data_x_test(self, df):
        """converts sentences into sequences of equal lenght and returns
        them as well as their ids"""
        ids = [row["id"] for _, row in df.iterrows()]
        x_sents = [row["sentence"] for _, row in df.iterrows()]
        x_sents = [
            sequence for sequence in
            self._tokenizer.texts_to_sequences_generator(x_sents)
        ]
        return pad_sequences(x_sents, maxlen=self._max_len), ids

    def _prepare_data_y(self, df):
        """fits the LabelEncoder and converts the labels into categorical
        representations adding label smoothing"""
        y_labels = [row["label"] for _, row in df.iterrows()]
        self._label_encoder.fit(y_labels)
        y_labels = self._label_encoder.transform(y_labels)
        return self._label_smoothing(to_categorical(y_labels),
                                     self._label_smooth_frac)

    def _label_smoothing(self, array, label_smooth_frac):
        """smoothes the numbers in a categorical numpy array row-wise"""
        if label_smooth_frac:
            return array * (1 - label_smooth_frac) + (
                1 - array) * label_smooth_frac / (array.shape[1] - 1)
        else:
            return array

    def _split_save_train_dev(self, x_train, y_train, train_ids):
        """splits the train data into a train and dev set and saves it"""
        if self._dev_frac:
            self._x_train = x_train[int(len(x_train) * self._dev_frac):]
            self._x_dev = x_train[:int(len(x_train) * self._dev_frac)]
            self._y_train = y_train[int(len(y_train) * self._dev_frac):]
            self._y_dev = y_train[:int(len(y_train) * self._dev_frac)]
            self._train_ids = train_ids[int(len(train_ids) * self._dev_frac):]
            self._dev_ids = train_ids[:int(len(train_ids) * self._dev_frac)]
        else:
            self._x_train = x_train
            self._y_train = y_train
            self._train_ids = train_ids
        return True

    def _save_test(self, x_test, test_ids):
        """saves the test set"""
        self._x_test = x_test
        self._test_ids = test_ids
        return True

    def get_df_train(self):
        """returns the DataFrame read from the train tsv"""
        return self._df_train

    def get_df_test(self):
        """returns the DataFrame read from the test tsv"""
        return self._df_test

    def get_train_data(self):
        """returns the training and development data"""
        return (
            (self._x_train, self._y_train, self._train_ids),
            (self._x_dev, self._y_dev, self._dev_ids),
        )

    def get_test_data(self):
        """returns the test data"""
        return (self._x_test, self._test_ids)

    def get_positions_e1(self):
        """returns an array with the position of every token relative to
        entity one. 0 is reserved for padding tokens, 1 is reserved for
        entity one.
        """
        pos_e1_train = []
        pos_e1_dev = []
        pos_e1_test = []
        for sent in self._x_train:
            sent = list(sent)
            beg_index = sent.index(4)
            end_index = sent.index(5)
            new = []
            for i in range(len(sent)):
                if sent[i] == 0:
                    new.append(sent[i])
                elif beg_index <= i <= end_index:
                    new.append(1)
                elif i < beg_index:
                    new.append(i - beg_index)
                elif i > end_index:
                    new.append(i - end_index + 1)
            pos_e1_train.append(new)
        for sent in self._x_test:
            sent = list(sent)
            beg_index = sent.index(4)
            end_index = sent.index(5)
            new = []
            for i in range(len(sent)):
                if sent[i] == 0:
                    new.append(sent[i])
                elif beg_index <= i <= end_index:
                    new.append(1)
                elif i < beg_index:
                    new.append(i - beg_index)
                elif i > end_index:
                    new.append(i - end_index + 1)
            pos_e1_test.append(new)
        if not self._dev_frac:
            return (
                pad_sequences(pos_e1_train, maxlen=self._max_len),
                None,
                pad_sequences(pos_e1_test, maxlen=self._max_len),
            )

        for sent in self._x_dev:
            sent = list(sent)
            beg_index = sent.index(4)
            end_index = sent.index(5)
            new = []
            for i in range(len(sent)):
                if sent[i] == 0:
                    new.append(sent[i])
                elif beg_index <= i <= end_index:
                    new.append(1)
                elif i < beg_index:
                    new.append(i - beg_index)
                elif i > end_index:
                    new.append(i - end_index + 1)
            pos_e1_dev.append(new)
        return (
            pad_sequences(pos_e1_train, maxlen=self._max_len),
            pad_sequences(pos_e1_dev, maxlen=self._max_len),
            pad_sequences(pos_e1_test, maxlen=self._max_len),
        )

    def get_positions_e2(self):
        """returns an array with the position of every token relative to
        entity two. 0 is reserved for padding tokens, 1 is reserved for
        entity two.
        """
        pos_e2_train = []
        pos_e2_dev = []
        pos_e2_test = []
        for sent in self._x_train:
            sent = list(sent)
            beg_index = sent.index(6)
            end_index = sent.index(7)
            new = []
            for i in range(len(sent)):
                if sent[i] == 0:
                    new.append(sent[i])
                elif beg_index <= i <= end_index:
                    new.append(1)
                elif i < beg_index:
                    new.append(i - beg_index)
                elif i > end_index:
                    new.append(i - end_index + 1)
            pos_e2_train.append(new)
        for sent in self._x_test:
            sent = list(sent)
            beg_index = sent.index(6)
            end_index = sent.index(7)
            new = []
            for i in range(len(sent)):
                if sent[i] == 0:
                    new.append(sent[i])
                elif beg_index <= i <= end_index:
                    new.append(1)
                elif i < beg_index:
                    new.append(i - beg_index)
                elif i > end_index:
                    new.append(i - end_index + 1)
            pos_e2_test.append(new)
        if not self._dev_frac:
            return (
                pad_sequences(pos_e2_train, maxlen=self._max_len),
                None,
                pad_sequences(pos_e2_test, maxlen=self._max_len),
            )

        for sent in self._x_dev:
            sent = list(sent)
            beg_index = sent.index(6)
            end_index = sent.index(7)
            new = []
            for i in range(len(sent)):
                if sent[i] == 0:
                    new.append(sent[i])
                elif beg_index <= i <= end_index:
                    new.append(1)
                elif i < beg_index:
                    new.append(i - beg_index)
                elif i > end_index:
                    new.append(i - end_index + 1)
            pos_e2_dev.append(new)
        return (
            pad_sequences(pos_e2_train, maxlen=self._max_len),
            pad_sequences(pos_e2_dev, maxlen=self._max_len),
            pad_sequences(pos_e2_test, maxlen=self._max_len),
        )

    def get_entities_and_context(self):
        """returns an array with only the entities and the context in
        between them. entity markers were removed.
        """
        context_train = []
        context_dev = []
        context_test = []
        for sent in self._x_train:
            sent = list(sent)
            beg_index = sent.index(4)
            end_index = sent.index(7)
            new = []
            for i in range(len(sent)):
                if beg_index < i < end_index and sent[i] not in [5, 6]:
                    new.append(sent[i])
            context_train.append(new)
        for sent in self._x_test:
            sent = list(sent)
            beg_index = sent.index(4)
            end_index = sent.index(7)
            new = []
            for i in range(len(sent)):
                if beg_index < i < end_index and sent[i] not in [5, 6]:
                    new.append(sent[i])
            context_test.append(new)
        if not self._dev_frac:
            return (
                pad_sequences(context_train, maxlen=self._max_len),
                None,
                pad_sequences(context_test, maxlen=self._max_len),
            )

        for sent in self._x_dev:
            sent = list(sent)
            beg_index = sent.index(4)
            end_index = sent.index(7)
            new = []
            for i in range(len(sent)):
                if beg_index < i < end_index and sent[i] not in [5, 6]:
                    new.append(sent[i])
            context_dev.append(new)
        return (
            pad_sequences(context_train, maxlen=self._max_len),
            pad_sequences(context_dev, maxlen=self._max_len),
            pad_sequences(context_test, maxlen=self._max_len),
        )

    def get_dev_frac(self):
        """returns the fraction of the training data split off for the
        development data"""
        return self._dev_frac

    def get_max_len(self):
        """returns the maximal sequence length"""
        return self._max_len

    def get_shuffle(self):
        """returns if the training data have been shuffled"""
        return self._shuffle

    def get_label_smooth_frac(self):
        """returns the fraction of the probability space distributed
        from the true example to the others"""
        return self._label_smooth_frac

    def get_tokenizer(self):
        """returns the Tokenizer"""
        return self._tokenizer

    def get_label_encoder(self):
        """returns the LabelEncoder"""
        return self._label_encoder
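A minimal usage sketch for the Data class above; the file names and hyperparameter values are placeholders (note that get_positions_e1/e2 assume the entity markers are tokenized to indices 4-7, as hard-coded in the class):

data = Data()
data.prepare_data("train.tsv", "test.tsv",
                  dev_frac=0.1, max_len=200,
                  shuffle=True, label_smooth_frac=0.1)

(x_train, y_train, train_ids), (x_dev, y_dev, dev_ids) = data.get_train_data()
x_test, test_ids = data.get_test_data()

# relative-position features with respect to the two entity spans
pos1_train, pos1_dev, pos1_test = data.get_positions_e1()
pos2_train, pos2_dev, pos2_test = data.get_positions_e2()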
Example 6
class Preprocessor:
    def __init__(self, max_features, max_sent_len, embedding_dims=200, wvs=None, 
                    max_doc_len=500, stopword=True):
        '''
        max_features: the upper bound to be placed on the vocabulary size.
        max_sent_len: the maximum length (in terms of tokens) of the instances/texts.
        embedding_dims: size of the token embeddings; overridden if pre-trained
                          vectors are provided (i.e. if wvs is not None).
        '''

        self.max_features = max_features  
        self.tokenizer = Tokenizer(num_words=self.max_features)
        self.max_sent_len = max_sent_len  # the max sentence length! @TODO rename; this is confusing. 
        self.max_doc_len = max_doc_len # w.r.t. number of sentences!

        self.use_pretrained_embeddings = False 
        self.init_vectors = None 
        if wvs is None:
            self.embedding_dims = embedding_dims
        else:
            # note that these are only for initialization;
            # they will be tuned!
            self.use_pretrained_embeddings = True
            # for new gensim format
            self.embedding_dims = wvs.syn0.shape[1] #wvs.vector_size
            self.word_embeddings = wvs

        
        self.stopword = stopword
        # lifted directly from spacy's EN list
        #self.stopwords = [u'all', u'six', u'just', u'less', u'being', u'indeed', u'over', u'move', u'anyway', u'four', u'not', u'own', u'through', u'using', u'fify', u'where', u'mill', u'only', u'find', u'before', u'one', u'whose', u'system', u'how', u'somewhere', u'much', u'thick', u'show', u'had', u'enough', u'should', u'to', u'must', u'whom', u'seeming', u'yourselves', u'under', u'ours', u'two', u'has', u'might', u'thereafter', u'latterly', u'do', u'them', u'his', u'around', u'than', u'get', u'very', u'de', u'none', u'cannot', u'every', u'un', u'they', u'front', u'during', u'thus', u'now', u'him', u'nor', u'name', u'regarding', u'several', u'hereafter', u'did', u'always', u'who', u'didn', u'whither', u'this', u'someone', u'either', u'each', u'become', u'thereupon', u'sometime', u'side', u'towards', u'therein', u'twelve', u'because', u'often', u'ten', u'our', u'doing', u'km', u'eg', u'some', u'back', u'used', u'up', u'go', u'namely', u'computer', u'are', u'further', u'beyond', u'ourselves', u'yet', u'out', u'even', u'will', u'what', u'still', u'for', u'bottom', u'mine', u'since', u'please', u'forty', u'per', u'its', u'everything', u'behind', u'does', u'various', u'above', u'between', u'it', u'neither', u'seemed', u'ever', u'across', u'she', u'somehow', u'be', u'we', u'full', u'never', u'sixty', u'however', u'here', u'otherwise', u'were', u'whereupon', u'nowhere', u'although', u'found', u'alone', u're', u'along', u'quite', u'fifteen', u'by', u'both', u'about', u'last', u'would', u'anything', u'via', u'many', u'could', u'thence', u'put', u'against', u'keep', u'etc', u'amount', u'became', u'ltd', u'hence', u'onto', u'or', u'con', u'among', u'already', u'co', u'afterwards', u'formerly', u'within', u'seems', u'into', u'others', u'while', u'whatever', u'except', u'down', u'hers', u'everyone', u'done', u'least', u'another', u'whoever', u'moreover', u'couldnt', u'throughout', u'anyhow', u'yourself', u'three', u'from', u'her', u'few', u'together', u'top', u'there', u'due', u'been', u'next', u'anyone', u'eleven', u'cry', u'call', u'therefore', u'interest', u'then', u'thru', u'themselves', u'hundred', u'really', u'sincere', u'empty', u'more', u'himself', u'elsewhere', u'mostly', u'on', u'fire', u'am', u'becoming', u'hereby', u'amongst', u'else', u'part', u'everywhere', u'too', u'kg', u'herself', u'former', u'those', u'he', u'me', u'myself', u'made', u'twenty', u'these', u'was', u'bill', u'cant', u'us', u'until', u'besides', u'nevertheless', u'below', u'anywhere', u'nine', u'can', u'whether', u'of', u'your', u'toward', u'my', u'say', u'something', u'and', u'whereafter', u'whenever', u'give', u'almost', u'wherever', u'is', u'describe', u'beforehand', u'herein', u'doesn', u'an', u'as', u'itself', u'at', u'have', u'in', u'seem', u'whence', u'ie', u'any', u'fill', u'again', u'hasnt', u'inc', u'thereby', u'thin', u'no', u'perhaps', u'latter', u'meanwhile', u'when', u'detail', u'same', u'wherein', u'beside', u'also', u'that', u'other', u'take', u'which', u'becomes', u'you', u'if', u'nobody', u'unless', u'whereas', u'see', u'though', u'may', u'after', u'upon', u'most', u'hereupon', u'eight', u'but', u'serious', u'nothing', u'such', u'why', u'off', u'a', u'don', u'whereby', u'third', u'i', u'whole', u'noone', u'sometimes', u'well', u'amoungst', u'yours', u'their', u'rather', u'without', u'so', u'five', u'the', u'first', u'with', u'make', u'once']
        self.stopwords = ["a", "about", "again", "all", "almost", "also", "although", "always", "among", "an", "and", "another", "any", "are", "as", "at", "b", "be", "because", "been", "before", "being", "between", "both", "but", "by", "c", "can", "could", "did", "do", "d", "does", "each", "either", "enough", "etc", "f", "for", "from", "had", "has", "have", "here", "how", "h", "i", "if", "in", "into", "is", "it", "its", "j", "just", "k", "made", "make", "may", "must", "n", "o", "of", "often", "on", "p", "q", "r", "s", "so", "that", "the", "them", "then", "their", "those", "thus", "to", "t", "u", "use", "used", "v", "w", "x", "y", "z", "we", "was"]


    def remove_stopwords(self, texts):
        stopworded_texts = []
        for text in texts: 
            # note the naive whitespace segmentation; this is the same
            # splitting the Keras tokenizer itself does.
            #stopworded_text = " ".join([t for t in text.split(" ") if not t.lower() in self.stopwords])
            stopworded_text = []
            for t in text.split(" "):
                if not t in self.stopwords:
                    if t.isdigit():
                        t = "numbernumbernumber"
                    stopworded_text.append(t)
            #stopworded_text = " ".join([t for t in text.split(" ") if not t in self.stopwords])
            stopworded_text = " ".join(stopworded_text)
            stopworded_texts.append(stopworded_text)
        return stopworded_texts


    def preprocess(self, all_docs):
        ''' 
        This fits tokenizer and builds up input vectors (X) from the list 
        of texts in all_texts. Needs to be called before train!
        '''
        self.raw_texts = all_docs
        if self.stopword:
            #for text in self.raw_texts: 
            self.processed_texts = self.remove_stopwords(self.raw_texts)
        else:
            self.processed_texts = self.raw_texts

        self.fit_tokenizer()
        if self.use_pretrained_embeddings:
            self.init_word_vectors()


    def fit_tokenizer(self):
        ''' Fits tokenizer to all raw texts; remembers indices->words mappings. '''
        self.tokenizer.fit_on_texts(self.processed_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token


    def decode(self, x):
        ''' For convenience; map from word index vector to words'''
        words = []
        for t_idx in x:
            if t_idx == 0:
                words.append("pad")
            else: 
                words.append(self.word_indices_to_words[t_idx])
        return " ".join(words) 

    def build_sequences(self, texts, pad_documents=False):
        processed_texts = texts 
        if self.stopword:
            processed_texts = self.remove_stopwords(texts)

        X = list(self.tokenizer.texts_to_sequences_generator(processed_texts))

        # need to pad the number of sentences, too.
        X = np.array(pad_sequences(X, maxlen=self.max_sent_len))

        return X

    def init_word_vectors(self):
        ''' 
        Initialize word vectors.
        '''
        self.init_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.max_features:
                try:
                    self.init_vectors.append(self.word_embeddings[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                                                self.embedding_dims)*-2 + 1

                    self.init_vectors.append(unknown_words_to_vecs[t])

        # init padding token!
        self.init_vectors.append(np.zeros(self.embedding_dims))

        # note that we make this a singleton list because that's
        # what Keras wants. 
        self.init_vectors = [np.vstack(self.init_vectors)]
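A hypothetical driver for the Preprocessor above; the documents and parameter values are placeholders (pass a gensim word-vector object as wvs to initialize pre-trained embeddings):

docs = ["the first example document", "another example document"]
prep = Preprocessor(max_features=10000, max_sent_len=50, wvs=None)
prep.preprocess(docs)             # removes stopwords and fits the tokenizer
X = prep.build_sequences(docs)    # padded index matrix of shape (len(docs), 50)
print(prep.decode(X[0]))          # map indices back to tokens ("pad" for padding)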
Example 7
class Preprocessor:
    # @TODO setting max_CUI_size to something small for now!
    def __init__(self, max_vocab_size=10000, max_CUI_size=5000, 
                    max_len=40, max_CUI_len=100, 
                    wv_embedding_dims=200, CUI_embedding_dims=200,
                    wvs=None, CUI_vs=None):
        '''
        max_vocab_size: maximum number of words to include in the model
        max_CUI_size: maximum number of CUIs to include in the model
        max_features: the upper bound to be placed on the vocabulary size.
        max_len: the maximum length (in terms of tokens) of the text snippets.
        max_CUI_size: the maximum number of ancestral CUIs to be used for each instance.
        wv_embedding_dims: size of the token embeddings; over-ridden if pre-trained
                          vectors are provided (if wvs is not None).
        CUI_embedding_dims: size of the CUI embeddings; over-ridden if pre-trained
                          vectors are provided. 
        wvs: pre-trained embeddings (for embeddings initialization)
        '''

        # inputs
        self.max_vocab_size = max_vocab_size
        self.max_CUI_size   = max_CUI_size

        self.max_len = max_len 
        self.max_CUI_len = max_CUI_len 

        self.tokenizer = Tokenizer(nb_words=self.max_vocab_size)
        # overkill to use a tokenizer, but we'll do it anyway
        self.CUI_tokenizer = Tokenizer(nb_words=self.max_CUI_size)

        self.use_pretrained_embeddings = False
        self.init_vectors = None 
        if wvs is None:
            self.wv_embedding_dims = wv_embedding_dims
            self.CUI_embedding_dims = CUI_embedding_dims
        else:
            # note that these are only for initialization;
            # they will be tuned!
            self.use_pretrained_embeddings = True

            self.word_embeddings = wvs
            self.wv_embedding_dims = wvs.vector_size

            self.CUI_embeddings = CUI_vs
            self.CUI_embedding_dims = CUI_vs.vector_size


    def preprocess(self, all_texts, all_CUIs):
        ''' 
        This fits tokenizer and builds up input vectors (X) from the list 
        of texts in all_texts. Needs to be called before train!
        '''
        self.raw_texts = all_texts
        self.CUIs = all_CUIs
        self.fit_tokenizer()
        self.fit_CUI_tokenizer()
        if self.use_pretrained_embeddings:
            print "initializing word vectors.."
            self.init_word_vectors()
            print "done. initializing CUI vectors..."
            self.init_CUI_vectors()
            print "done."


    def fit_tokenizer(self):
        ''' Fits tokenizer to all raw texts; remembers indices->words mappings. '''
        self.tokenizer.fit_on_texts(self.raw_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token

    def fit_CUI_tokenizer(self):
        ''' Fits tokenizer to all raw texts; remembers indices->words mappings. '''
        self.CUI_tokenizer.fit_on_texts(self.CUIs)
        self.CUI_indices_to_CUIs = {}
        for CUI, idx in self.CUI_tokenizer.word_index.items():
            self.CUI_indices_to_CUIs[idx] = CUI

    def build_text_sequences(self, texts):
        X = list(self.tokenizer.texts_to_sequences_generator(texts))
        X = np.array(pad_sequences(X, maxlen=self.max_len))
        return X

    def build_CUI_sequences(self, CUIs):
        X_CUIs = list(self.CUI_tokenizer.texts_to_sequences_generator(CUIs))
        X_CUIs = np.array(pad_sequences(X_CUIs, maxlen=self.max_CUI_len))
        return X_CUIs

    def init_word_vectors(self):
        ''' 
        Initialize word vectors.
        '''
        self.init_word_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.max_vocab_size:
                try:
                    self.init_word_vectors.append(self.word_embeddings[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                                                self.wv_embedding_dims)*-2 + 1

                    self.init_word_vectors.append(unknown_words_to_vecs[t])

        # note that we make this a singleton list because that's
        # what Keras wants. 
        self.init_word_vectors = [np.vstack(self.init_word_vectors)]


    def init_CUI_vectors(self): 
        '''
        initialize CUI vectors 
        '''
        self.init_CUI_vectors = []
        unknown_CUIs_to_vecs = {}
        for CUI, CUI_idx in self.CUI_tokenizer.word_index.items():
            if CUI_idx <= self.max_CUI_size:
                try: 
                    self.init_CUI_vectors.append(self.CUI_embeddings[CUI])
                except:
                    if CUI not in unknown_CUIs_to_vecs:
                        unknown_CUIs_to_vecs[CUI] = np.random.random(self.CUI_embedding_dims)*-2 + 1
                    self.init_CUI_vectors.append(unknown_CUIs_to_vecs[CUI])

        self.init_CUI_vectors = [np.vstack(self.init_CUI_vectors)]
Example 8
class ISummarizer:

    # 100000
    def __init__(self, pairs, nb_words=10000, hidden_size=512, max_input_size=3000, max_output_size=15):
        self.pairs = pairs 
        self.nb_words = nb_words + 2 # number of words; +2 for start and stop tokens!
        self.max_input_size = max_input_size
        self.max_output_size = max_output_size + 2 # again +2 for start/stop

        self.hidden_size = hidden_size
        print("loading pre-trained word vectors...")
        self.wv = load_trained_w2v_model()
        # here you want to add start and stop 
        print("OK!")
        self.word_embedding_size = self.wv.vector_size 

        # call to sequences
        # call init_word_vectors
        print("building sequences...")
        self.build_sequences()

        print("initializing word vectors...")
        self.init_word_vectors()

        print("ok!")


    def build_sequences(self):
        self.tokenizer = Tokenizer(nb_words=self.nb_words)

        self.raw_input_texts  = [START_STR + " " + " ".join(pair[0]) + " " + STOP_STR for pair in self.pairs]
        self.raw_output_texts = [START_STR + " " + " ".join(pair[1]) + " " + STOP_STR for pair in self.pairs]

        def _get_max(seqs):
            return max([len(seq) for seq in seqs])

        self.tokenizer.fit_on_texts(self.raw_input_texts+self.raw_output_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token

        self.input_sequences  = list(self.tokenizer.texts_to_sequences_generator(self.raw_input_texts))
        #self.max_input_len    = _get_max(self.input_sequences)
        #X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
        #X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
        self.input_sequences = list(pad_sequences(self.input_sequences, maxlen=self.max_input_size))

        self.output_sequences = list(self.tokenizer.texts_to_sequences_generator(self.raw_output_texts))
        self.output_sequences = list(pad_sequences(self.output_sequences, maxlen=self.max_output_size))
        
    def init_word_vectors(self):
        self.init_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.nb_words:
                try:
                    self.init_vectors.append(self.wv[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                                                self.word_embedding_size)*-2 + 1

                    self.init_vectors.append(unknown_words_to_vecs[t])

        self.init_vectors = np.vstack(self.init_vectors)

    def build_model(self):
        self.model = Sequential()
        self.model.add(Embedding(self.nb_words, self.word_embedding_size, weights=[self.init_vectors]))
        ### 
        # run embeddings through a Gated Recurrent Unit
        self.model.add(GRU(self.hidden_size))
        #self.model.add(Dropout(0.1))
        self.model.add(Dense(self.hidden_size))
        self.model.add(Activation('relu'))
        self.model.add(RepeatVector(self.max_output_size))
        self.model.add(GRU(self.hidden_size, return_sequences=True))
        self.model.add(Dropout(0.1))
        self.model.add(TimeDistributedDense(self.nb_words, activation="softmax"))
        # does cross entropy make sense here?
        self.model.compile(loss="categorical_crossentropy", optimizer='adam')
        return self.model 

    def X_y(self):
        self.X = np.array(self.input_sequences) # np.zeros((n, self.max_input_size, self.nb_words), dtype=np.bool)
        self.Y = np.zeros((len(self.output_sequences), self.max_output_size, self.nb_words), dtype=np.bool)
        for i in range(self.X.shape[0]):
            #for j, token_idx in enumerate(self.input_sequences[i]):
            #    self.X[i, j, token_idx] = 1

            for j, token_idx in enumerate(self.output_sequences[i]):
                self.Y[i, j, token_idx] = 1

        print "X shape: %s; Y shape: %s" % (self.X.shape, self.Y.shape)

    def decode(self, pred):
        text = []
        for token_preds in pred: 
            ### it keeps predicting zeros! zeros are for the padding... 
            cur_pred_index = np.argmax(token_preds) #+ 1 # the tokenizer seems to do 1-indexing!
            
            if cur_pred_index == 0:
                text.append("<pad>")
            else:
                text.append(self.word_indices_to_words[cur_pred_index])
        return text
        

    def train(self):
        # @TODO revisit; batchsize, etc
        print "fitting model..."
        self.model.fit(self.X, self.Y)
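A hypothetical end-to-end driver for the ISummarizer above, assuming load_trained_w2v_model, START_STR and STOP_STR are defined elsewhere in the project as the snippet implies; the pairs are placeholder data:

pairs = [(["some", "long", "input", "text"], ["short", "summary"])]   # (input tokens, summary tokens)
summarizer = ISummarizer(pairs, nb_words=10000, hidden_size=512)
model = summarizer.build_model()
summarizer.X_y()                      # builds summarizer.X and the one-hot summarizer.Y
summarizer.train()                    # fits the GRU encoder-decoder
preds = model.predict(summarizer.X)
print(summarizer.decode(preds[0]))    # map predicted indices back to tokens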
Example 9
class ISummarizer:

    # 100000
    def __init__(self,
                 pairs,
                 nb_words=10000,
                 hidden_size=512,
                 max_input_size=3000,
                 max_output_size=15):
        self.pairs = pairs
        self.nb_words = nb_words + 2  # number of words; +2 for start and stop tokens!
        self.max_input_size = max_input_size
        self.max_output_size = max_output_size + 2  # again +2 for start/stop

        self.hidden_size = hidden_size
        print("loading pre-trained word vectors...")
        self.wv = load_trained_w2v_model()
        # here you want to add start and stop
        print("OK!")
        self.word_embedding_size = self.wv.vector_size

        # call to sequences
        # call init_word_vectors
        print("building sequences...")
        self.build_sequences()

        print("initializing word vectors...")
        self.init_word_vectors()

        print("ok!")

    def build_sequences(self):
        self.tokenizer = Tokenizer(nb_words=self.nb_words)

        self.raw_input_texts = [
            START_STR + " " + " ".join(pair[0]) + " " + STOP_STR
            for pair in self.pairs
        ]
        self.raw_output_texts = [
            START_STR + " " + " ".join(pair[1]) + " " + STOP_STR
            for pair in self.pairs
        ]

        def _get_max(seqs):
            return max([len(seq) for seq in seqs])

        self.tokenizer.fit_on_texts(self.raw_input_texts +
                                    self.raw_output_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token

        self.input_sequences = list(
            self.tokenizer.texts_to_sequences_generator(self.raw_input_texts))
        #self.max_input_len    = _get_max(self.input_sequences)
        #X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
        #X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
        self.input_sequences = list(
            pad_sequences(self.input_sequences, maxlen=self.max_input_size))

        self.output_sequences = list(
            self.tokenizer.texts_to_sequences_generator(self.raw_output_texts))
        self.output_sequences = list(
            pad_sequences(self.output_sequences, maxlen=self.max_output_size))

    def init_word_vectors(self):
        self.init_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.nb_words:
                try:
                    self.init_vectors.append(self.wv[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                            self.word_embedding_size) * -2 + 1

                    self.init_vectors.append(unknown_words_to_vecs[t])

        self.init_vectors = np.vstack(self.init_vectors)

    def build_model(self):
        self.model = Sequential()
        self.model.add(
            Embedding(self.nb_words,
                      self.word_embedding_size,
                      weights=[self.init_vectors]))
        ###
        # run embeddings through a Gated Recurrent Unit
        self.model.add(GRU(self.hidden_size))
        #self.model.add(Dropout(0.1))
        self.model.add(Dense(self.hidden_size))
        self.model.add(Activation('relu'))
        self.model.add(RepeatVector(self.max_output_size))
        self.model.add(GRU(self.hidden_size, return_sequences=True))
        self.model.add(Dropout(0.1))
        self.model.add(
            TimeDistributedDense(self.nb_words, activation="softmax"))
        # does cross entropy make sense here?
        self.model.compile(loss="categorical_crossentropy", optimizer='adam')
        return self.model

    def X_y(self):
        self.X = np.array(
            self.input_sequences
        )  # np.zeros((n, self.max_input_size, self.nb_words), dtype=np.bool)
        self.Y = np.zeros(
            (len(self.output_sequences), self.max_output_size, self.nb_words),
            dtype=np.bool)
        for i in range(self.X.shape[0]):
            #for j, token_idx in enumerate(self.input_sequences[i]):
            #    self.X[i, j, token_idx] = 1

            for j, token_idx in enumerate(self.output_sequences[i]):
                self.Y[i, j, token_idx] = 1

        print "X shape: %s; Y shape: %s" % (self.X.shape, self.Y.shape)

    def decode(self, pred):
        text = []
        for token_preds in pred:
            ### it keeps predicting zeros! zeros are for the padding...
            cur_pred_index = np.argmax(
                token_preds)  #+ 1 # the tokenizer seems to do 1-indexing!

            if cur_pred_index == 0:
                text.append("<pad>")
            else:
                text.append(self.word_indices_to_words[cur_pred_index])
        return text

    def train(self):
        # @TODO revisit; batchsize, etc
        print "fitting model..."
        self.model.fit(self.X, self.Y)
Example 10
class Preprocessor:
    def __init__(self, max_features, maxlen, embedding_dims=200, wvs=None):
        '''
        max_features: the upper bound to be placed on the vocabulary size.
        maxlen: the maximum length (in terms of tokens) of the instances/texts.
        embedding_dims: size of the token embeddings; overridden if pre-trained
                          vectors are provided (i.e. if wvs is not None).
        '''

        self.max_features = max_features  
        self.tokenizer = Tokenizer(nb_words=self.max_features)
        self.maxlen = maxlen  

        self.use_pretrained_embeddings = False 
        self.init_vectors = None 
        if wvs is None:
            self.embedding_dims = embedding_dims
        else:
            # note that these are only for initialization;
            # they will be tuned!
            self.use_pretrained_embeddings = True
            self.embedding_dims = wvs.vector_size
            self.word_embeddings = wvs


    def preprocess(self, all_texts):
        ''' 
        This fits tokenizer and builds up input vectors (X) from the list 
        of texts in all_texts. Needs to be called before train!
        '''
        self.raw_texts = all_texts
        #self.build_sequences()
        self.fit_tokenizer()
        if self.use_pretrained_embeddings:
            self.init_word_vectors()

    def fit_tokenizer(self):
        ''' Fits tokenizer to all raw texts; remembers indices->words mappings. '''
        self.tokenizer.fit_on_texts(self.raw_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token

    def build_sequences(self, texts):
        X = list(self.tokenizer.texts_to_sequences_generator(texts))
        X = np.array(pad_sequences(X, maxlen=self.maxlen))
        return X

    def init_word_vectors(self):
        ''' 
        Initialize word vectors.
        '''
        self.init_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.max_features:
                try:
                    self.init_vectors.append(self.word_embeddings[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                                                self.embedding_dims)*-2 + 1

                    self.init_vectors.append(unknown_words_to_vecs[t])

        # note that we make this a singleton list because that's
        # what Keras wants. 
        self.init_vectors = [np.vstack(self.init_vectors)]
Example 11
class Preprocessor:
    def __init__(self,
                 max_features,
                 max_sent_len,
                 embedding_dims=200,
                 wvs=None,
                 max_doc_len=500):
        '''
        max_features: the upper bound to be placed on the vocabulary size.
        max_sent_len: the maximum length (in terms of tokens) of the instances/texts.
        embedding_dims: size of the token embeddings; overridden if pre-trained
                          vectors are provided (i.e. if wvs is not None).
        '''

        self.max_features = max_features
        self.tokenizer = Tokenizer(nb_words=self.max_features)
        self.max_sent_len = max_sent_len  # the max sentence length! @TODO rename; this is confusing.
        self.max_doc_len = max_doc_len  # w.r.t. number of sentences!

        self.use_pretrained_embeddings = False
        self.init_vectors = None
        if wvs is None:
            self.embedding_dims = embedding_dims
        else:
            # note that these are only for initialization;
            # they will be tuned!
            self.use_pretrained_embeddings = True
            self.embedding_dims = wvs.vector_size
            self.word_embeddings = wvs

    def preprocess(self, all_docs):
        ''' 
        This fits tokenizer and builds up input vectors (X) from the list 
        of texts in all_texts. Needs to be called before train!
        '''
        self.raw_texts = all_docs
        #self.build_sequences()
        self.fit_tokenizer()
        if self.use_pretrained_embeddings:
            self.init_word_vectors()

    def fit_tokenizer(self):
        ''' Fits tokenizer to all raw texts; remembers indices->words mappings. '''
        self.tokenizer.fit_on_texts(self.raw_texts)
        self.word_indices_to_words = {}
        for token, idx in self.tokenizer.word_index.items():
            self.word_indices_to_words[idx] = token

    def build_sequences(self, texts):
        X = list(self.tokenizer.texts_to_sequences_generator(texts))
        X = np.array(pad_sequences(X, maxlen=self.max_sent_len))
        return X

    def init_word_vectors(self):
        ''' 
        Initialize word vectors.
        '''
        self.init_vectors = []
        unknown_words_to_vecs = {}
        for t, token_idx in self.tokenizer.word_index.items():
            if token_idx <= self.max_features:
                try:
                    self.init_vectors.append(self.word_embeddings[t])
                except:
                    if t not in unknown_words_to_vecs:
                        # randomly initialize
                        unknown_words_to_vecs[t] = np.random.random(
                            self.embedding_dims) * -2 + 1

                    self.init_vectors.append(unknown_words_to_vecs[t])

        # note that we make this a singleton list because that's
        # what Keras wants.
        self.init_vectors = [np.vstack(self.init_vectors)]
Example 12
OUTPUTPATH = ""  # path folder for output data (logdata.npy, loglabel.npy files will be created there)

# Create word embeddings

# read preprocessed log events
eventFile = open(EVENT_TEMPLATE, 'r')
Lines = eventFile.readlines()
eventList = []
for line in Lines:
    eventList.append(line.strip())

# create the vocabulary and convert words to token indices
tokenizer = Tokenizer(num_words=1000, lower=True)
tokenizer.fit_on_texts(eventList)
sequences = list(tokenizer.texts_to_sequences_generator(eventList))

# read pretrained glove word embeddings
wordEmbeddings = dict()
gloveFile = open(EMBEDDING, encoding="utf8")

for line in gloveFile:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    wordEmbeddings[word] = coefs

# create word - embedding dict
tokenEmbeddings = dict()

for logWord in tokenizer.word_index:
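The snippet is cut off in the middle of this loop; a common way to finish it (an assumption, not the original code) is to look up each vocabulary token in the GloVe dictionary, falling back to a zero vector for out-of-vocabulary tokens:

embeddingDim = 100   # assumption: must match the dimensionality of the GloVe file
for logWord in tokenizer.word_index:
    tokenEmbeddings[logWord] = wordEmbeddings.get(
        logWord, np.zeros(embeddingDim, dtype='float32'))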
Example 13
                             lower=True,
                             char_level=False)
source_tokenizer.fit_on_texts(src_texts)
target_tokenizer = Tokenizer(num_words=max_vocab_size,
                             lower=True,
                             char_level=False)
target_tokenizer.fit_on_texts(tgt_texts)

source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
print "source vocab size: ", source_vocab_size
print "target vocab size: ", target_vocab_size

max_input_length = max(
    len(seq)
    for seq in source_tokenizer.texts_to_sequences_generator(src_texts))
max_output_length = max(
    len(seq)
    for seq in target_tokenizer.texts_to_sequences_generator(tgt_texts))
target_reverse_word_index = {
    v: k
    for k, v in target_tokenizer.word_index.items()
}

print "max input length: ", max_input_length
print "max_output_length: ", max_output_length

seq2seq_params = {
    'max_input_length': max_input_length,
    'max_output_length': max_output_length,
    'source_vocab_size': source_vocab_size,
Example 14
tgt_val   = [' '.join([start_token, unidecode(text), end_token]) for text in tgt_val]
src_test  = [' '.join([start_token, unidecode(text), end_token]) for text in src_test]
tgt_test  = [' '.join([start_token, unidecode(text), end_token]) for text in tgt_test]

print("tokenizing...")
source_tokenizer = Tokenizer(num_words=max_vocab_size, lower=True, char_level=False)
source_tokenizer.fit_on_texts(src_texts)
target_tokenizer = Tokenizer(num_words=max_vocab_size, lower=True, char_level=False)
target_tokenizer.fit_on_texts(tgt_texts)

source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
print("source vocab size: ", source_vocab_size)
print("target vocab size: ", target_vocab_size)

max_input_length = max(len(seq) for seq in source_tokenizer.texts_to_sequences_generator(src_texts))
max_output_length = max(len(seq) for seq in target_tokenizer.texts_to_sequences_generator(tgt_texts))
target_reverse_word_index = {v:k for k, v in target_tokenizer.word_index.items()}

print("max input length: ", max_input_length)
print("max_output_length: ", max_output_length)

seq2seq_params = {
    'max_input_length':  max_input_length, 
    'max_output_length': max_output_length,
    'source_vocab_size': source_vocab_size,
    'target_vocab_size': target_vocab_size,
    'embedding_dim':     embedding_dim,
    'hidden_dim':        hidden_dim
}
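A sketch of the step these parameters usually feed into: padding the tokenized texts to the lengths computed above (padding='post' and the variable names on the left are assumptions):

from keras.preprocessing.sequence import pad_sequences

encoder_input = pad_sequences(source_tokenizer.texts_to_sequences(src_texts),
                              maxlen=max_input_length, padding='post')
decoder_input = pad_sequences(target_tokenizer.texts_to_sequences(tgt_texts),
                              maxlen=max_output_length, padding='post')
print(encoder_input.shape, decoder_input.shape)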
Example 15
# just binary classification,
# so we want the output to be in [0,1],
# and we can use binary crossentropy as our loss
model.add(Activation('sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')

###

n_epochs = 60

# used to sample words (indices)
sampling_table = make_sampling_table(vocab_size)

for i in range(n_epochs):
    loss = 0
    for seq in tokenizer.texts_to_sequences_generator(text_generator()):
        # generate skip-gram training examples
        # - `couples` consists of the pivots (i.e. target words) and surrounding contexts
        # - `labels` represent if the context is true or not
        # - `window_size` determines how far to look between words
        # - `negative_samples` specifies the ratio of negative couples
        #    (i.e. couples where the context is false)
        #    to generate with respect to the positive couples;
        #    i.e. `negative_samples=4` means "generate 4 times as many negative samples"
        couples, labels = skipgrams(seq,
                                    vocab_size,
                                    window_size=5,
                                    negative_samples=4,
                                    sampling_table=sampling_table)
        if couples:
            pivot, context = zip(*couples)
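The training loop is cut off above. As a small standalone illustration (not from the original) of what skipgrams returns for a toy sequence:

from keras.preprocessing.sequence import skipgrams

toy_seq = [1, 2, 3, 4, 5]
couples, labels = skipgrams(toy_seq, vocabulary_size=10,
                            window_size=2, negative_samples=1.0)
print(couples[:3])   # [pivot, context] pairs, e.g. [[2, 3], [4, 9], ...]
print(labels[:3])    # 1 for a true context, 0 for a sampled negative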
Example 16
print "Tokenizing sentences..."
for i, review in enumerate(reviews_texts):
    print '{} of {}'.format(i, len(reviews_texts))
    sentences.append(
        [x.lower_.encode('ascii', errors='ignore') for x in nlp(review)])

from keras.preprocessing.text import Tokenizer

tk = Tokenizer()

tk.fit_on_texts((t.encode('ascii', errors='ignore') for t in reviews_texts))

tk.fit_on_texts((' '.join(t) for t in sentences))

seq_data = [
    _ for _ in tk.texts_to_sequences_generator((
        t.encode('ascii', errors='ignore') for t in reviews_texts))
]

seq_data = [
    _
    for _ in tk.texts_to_sequences_generator((' '.join(t) for t in sentences))
]

cPickle.dump(
    {
        'funny': funny_votes,
        'useful': useful_votes,
        'stars': review_stars,