Example 1
 def read_copus_generator(self, batch_size=64):
     """ return a generator with the specified batch_size
     """
     logger.info("Beigin read copus {0}".format(file_name))
     data = []
     index = 0
     with open(file_name, 'r') as fread:
         # readline() returns '' at EOF without raising, so iterate the file
         # object directly instead of relying on a bare except
         for line in fread:
             data.append(line)
             index += 1
             if index % 100000 == 0:
                 logger.info("The program has processed {0} lines ".
                             format(index))
         logger.info("Read end")
     tokenizer = Tokenizer(nb_words=30000)
     tokenizer.fit_on_texts(data)
     logger.info("word num: {0}".format(len(tokenizer.word_counts)))
     sorted_word_counts = sorted(
         tokenizer.word_counts.items(),
         key=operator.itemgetter(1),
         reverse=True)
     # save the word_counts to the meta
     with open(file_name.replace("train.", "meta."), "w") as fwrite:
         for word_cnt in sorted_word_counts:
             key = word_cnt[0]
             val = word_cnt[1]
             line = key + ":" + str(val) + "\n"
             fwrite.write(line)
     vectorize_data = tokenizer.texts_to_matrix(data)
     return vectorize_data
Example 2
class Featurizer:

    max_words = None
    tokenizer = None

    def __init__(self, max_words=1000):
        self.max_words = max_words
        self.tokenizer = Tokenizer(num_words=max_words)

    def fit_transform(self, data):
        texts = [l['text'] for l in data]
        self.tokenizer.fit_on_texts(texts)
        # drop words whose index exceeds the max_words limit
        self.tokenizer.word_index = {k: v for k, v in self.tokenizer.word_index.items() if v <= self.max_words}
        return self.transform(data)

    def transform(self, data):
        texts = [l['text'] for l in data]
        return self.tokenizer.texts_to_matrix(texts, mode='binary')

    def transform_inv(self, m):
        index = {v: k for k, v in self.tokenizer.word_index.items()} # word index by id
        return [[index.get(i) for i in np.nonzero(line)[0] if i in index] for line in m]

    def save(self, filepath):
        with open(filepath + '_word_index.json', 'w') as f:
            f.write(json.dumps(self.tokenizer.word_index))

    @classmethod
    def load(cls, filepath):
        with open(filepath + '_word_index.json', 'r') as f:
            word_index = json.load(f)
            c = cls(max_words=len(word_index))
            c.tokenizer.word_index = word_index
            return c
Example 3
def test_sequential_fit():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    word_sequences = [
        ['The', 'cat', 'is', 'sitting'],
        ['The', 'dog', 'is', 'standing']
    ]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)

    assert tokenizer.document_count == 5

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)
Example 4
    def transform(self, dataset=None):
        """ Transform data into vector and matrices. """
        clean = lambda words: [str(word)
                               for word in words
                               if type(word) is not float]

        x_unlabel = clean(dataset.unlabel)
        x_train = clean(dataset.train.X)
        x_test = clean(dataset.test.X)

        y_train = dataset.train.y
        y_test = dataset.test.y

        tokenizer = Tokenizer(nb_words=self.max_words)
        tokenizer.fit_on_texts(x_unlabel)

        # save the list of words in the vocabulary
        self.vocabulary = tokenizer.word_counts

        X_unlabel = tokenizer.texts_to_matrix(x_unlabel, mode=self.mode)
        X_unlabel = pad_sequences(X_unlabel, maxlen=self.max_len,
                                  dtype='float64')

        X_train = tokenizer.texts_to_matrix(x_train, mode=self.mode)
        X_train = pad_sequences(X_train, maxlen=self.max_len, dtype='float64')

        X_test = tokenizer.texts_to_matrix(x_test, mode=self.mode)
        X_test = pad_sequences(X_test, maxlen=self.max_len, dtype='float64')

        y_train = np.asarray(y_train, dtype='int32')
        y_test = np.asarray(y_test, dtype='int32')

        Y_train = np_utils.to_categorical(y_train, self.classes)
        Y_test = np_utils.to_categorical(y_test, self.classes)

        return Dataset(
            X_unlabel,
            Data(X_train, Y_train, y_train),
            Data(X_test, Y_test, y_test),
        )
Example 5
def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
# let's take 80% of the data for training and the remaining 20% for testing.
train_size = int(len(data) * .8)

train_posts = data['news'][:train_size]
train_tags = data['category'][:train_size]
train_files_names = data['filename'][:train_size]

test_posts = data['news'][train_size:]
test_tags = data['category'][train_size:]
test_files_names = data['filename'][train_size:]

# define Tokenizer with Vocab Size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)

x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')

encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(num_labels))
tokenizer.fit_on_texts(Y + X)

print("size X:", len(X))
print("size Y:", len(Y))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

print(len(X_train), 'train sentences')
print(len(X_test), 'test sentences')
print(len(Y_train), 'train classes')
print(len(Y_test), 'test classes')

c = Counter(Y_train)
print(c.items())

X_train = tokenizer.texts_to_matrix(X_train, mode='binary')
X_test = tokenizer.texts_to_matrix(X_test, mode='binary')

Y_train = tokenizer.texts_to_sequences(Y_train)
Y_test = tokenizer.texts_to_sequences(Y_test)

# keep only the first token index of each label sequence
Y_train = [y[0] for y in Y_train]
Y_test = [y[0] for y in Y_test]
from keras.preprocessing.text import text_to_word_sequence, one_hot, hashing_trick, Tokenizer
##################################################################
## 1. text_to_word_sequence, one_hot, hashing_trick
texts = ['some thing to eat', 'some thing to drink']
print(text_to_word_sequence(texts[0]))  # ['some', 'thing', 'to', 'eat']; simply splits on whitespace
print(one_hot(texts[0], 10))  # [5, 7, 5, 7]; (10 means the indices are kept within 10)
print(one_hot(texts[1], 10))  # [5, 7, 5, 5]; because hash is used internally, each str gets the same index once (text, n) is fixed
# one_hot is a wrapper around the `hashing_trick` function using `hash` as the hashing function; uniqueness of the word-to-index mapping is not guaranteed.
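# A hedged addition (not in the original snippet): hashing_trick is the more
# general form of one_hot; passing hash_function='md5' gives a word-to-index
# mapping that is stable across Python runs, unlike the built-in hash.
print(hashing_trick(texts[0], 10, hash_function='md5'))  # indices fall in [1, 10)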
##################################################################
## 2. Tokenizer: indices are assigned by descending word frequency (ties keep order of first appearance)
# keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n', lower=True, split=" ", char_level=False)
# Tokenizer is a class for vectorizing texts, or for turning texts into sequences (lists of word indices in the dictionary, starting from 1).
# num_words: None or int, the maximum number of words to keep. If set to an int, the tokenizer is restricted to the num_words most frequent words in the dataset.
# char_level: if True, every character is treated as a token.
texts = ['some thing to eat', 'some thing to drink']
tmp_tokenizer = Tokenizer(num_words=None)  # num_words: None or int, the maximum number of words to keep; less frequent words are dropped
tmp_tokenizer.fit_on_texts(texts)
# tmp_tokenizer.fit_on_texts(texts[0]); tmp_tokenizer.fit_on_texts(texts[1])  # don't do this: fitting on a bare string counts single characters
# Attributes
print(tmp_tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]); how often each word appeared during fitting
print(tmp_tokenizer.word_docs)  # {'thing': 2, 'eat': 1, 'to': 2, 'some': 2, 'drink': 1}; how many documents/texts each word appeared in during fitting
print(tmp_tokenizer.word_index)  # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}; rank/index of each word
print(len(tmp_tokenizer.word_index))  # 5; vocabulary size
print(tmp_tokenizer.index_docs)  # {2: 2, 4: 1, 3: 2, 1: 2, 5: 1}; word_docs keyed by index (word_index and word_docs combined)
print(tmp_tokenizer.document_count)  # 2; number of documents fitted on
# Methods
print(tmp_tokenizer.texts_to_sequences(texts))  # [[1, 2, 3, 4], [1, 2, 3, 5]]; the word index sequences
print(tmp_tokenizer.texts_to_matrix(texts))  # matrix form ≈ one_hot; a one-hot style encoding that only records which dictionary indices occur
# [[ 0.  1.  1.  1.  1.  0.]
#  [ 0.  1.  1.  1.  0.  1.]]
Example 9
def get_word_features(emails,verbose=True,nb_words=5000,skip_top=0,maxlen=None,as_matrix=True, matrix_type='count', label_cutoff=0.01,max_n=1):
    (totalWordsCount,fromCount,domainCount,labels) = getEmailStats(emails)
    if verbose:
        print('Creating email dataset with labels %s '%str(labels))
        print('Label word breakdown:')
        total = 0
        for label in labels:
            count = sum(totalWordsCount[label].values())
            total+=count
            print('\t%s:%d'%(label,count))
        print('Total word count: %d'%total)

    labelCounts = {label:0 for label in labels}
    for email in emails:
        labelCounts[email.label]+=1
    cutoff = int(len(emails)*label_cutoff)
    removed = 0
    for label in labels[:]:
        if labelCounts[label]<cutoff or label=='Important' or label=='Unread' or label=='Sent':
            removed+=1
            labels.remove(label)
    labelNums = {labels[i]:i for i in range(len(labels))}
    if verbose:
        print('Found %d labels below count threshold of %d '%(removed,cutoff))
    if verbose:
        print('Creating email dataset with labels %s '%str(labels))
        print('Label email count breakdown:')
        total = 0
        for label in labels:
            print('\t%s:%d'%(label,labelCounts[label]))
        print('Total emails: %d'%sum([labelCounts[label] for label in labels]))
    
    texts = []
    emailLabels = []
    for email in emails:
        if email.label not in labels:
            continue
        text = email.sender+" "+str(email.subject)
        text+= email.fromDomain
        text+=email.content
        texts.append(text.replace('\n','').replace('\r',''))
        emailLabels.append(labelNums[email.label])
    emailLabels = np.array(emailLabels)
    if max_n==1 or not as_matrix:
        tokenizer = Tokenizer(nb_words)
        tokenizer.fit_on_texts(texts)
        reverse_word_index = {tokenizer.word_index[word]:word for word in tokenizer.word_index}
        word_list = [reverse_word_index[i+1] for i in range(nb_words)]
        if as_matrix:
            feature_matrix = tokenizer.texts_to_matrix(texts, mode=matrix_type)
            return feature_matrix,emailLabels,word_list,labels
        else:
            sequences = tokenizer.texts_to_sequences(texts)
            return sequences,emailLabels,word_list,labels
    else:
        if matrix_type=='tfidf':
            vectorizer = TfidfVectorizer(ngram_range=(1,max_n),max_features=nb_words)
        else:
            vectorizer = CountVectorizer(ngram_range=(1,max_n),max_features=nb_words,binary=matrix_type=='binary')
        feature_matrix = vectorizer.fit_transform(texts)
        word_list = vectorizer.get_feature_names()
        return feature_matrix,emailLabels,word_list,labels
    '=========================================================================================='
)

for i in range(len(tweets)):
    tweet_str = ' '.join(tweets[i])
    docs.append(tweet_str)
    docs_all.extend(tweets[i])  # collect the tokens themselves, not the characters of the joined string

vocab = len(set(docs_all))

print('vocabulary_size:', vocab)

tokenizer = Tokenizer(num_words=vocab)
tokenizer.fit_on_texts(docs)

X = tokenizer.texts_to_matrix(docs,
                              mode='count')  # other supported modes: 'binary', 'freq', 'tfidf'

print('tweet matrix shape:', X.shape)
#print('bag of words by counting:', X[0])

## concatenation ##########################################################

emb_size = 128

vec_list = defaultdict()
vec_float = []

embfile = open("./data/vfest_128.harp", 'r')

for line in embfile:
    a = line.strip('\n').split(' ')
 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
 'talk.politics.misc', 'talk.religion.misc'])

# load our saved model
model = load_model('my_model.h5')

# load tokenizer
tokenizer = Tokenizer()
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

test_files = ["C:\\DL\\20news-bydate\\20news-bydate-test\\comp.graphics\\38758",
              "C:\\DL\\20news-bydate\\20news-bydate-test\\misc.forsale\\76115",
              "C:\\DL\\20news-bydate\\20news-bydate-test\\soc.religion.christian\\21329"
              ]
x_data = []
for t_f in test_files:
    t_f_data = Path(t_f).read_text()
    x_data.append(t_f_data)

x_data_series = pd.Series(x_data)
x_tokenized = tokenizer.texts_to_matrix(x_data_series, mode='tfidf')

for i, x_t in enumerate(x_tokenized):
    prediction = model.predict(np.array([x_t]))
    predicted_label = labels[np.argmax(prediction[0])]
    print("File ->", test_files[i], "Predicted label: " + predicted_label)

Example 12
def main():

    ### read training and testing data
    (Y_data, X_data, tag_list) = read_data(train_path, True)
    (_, X_test, _) = read_data(test_path, False)
    all_corpus = X_data + X_test
    print('Found %d articles.' % (len(all_corpus)))

    ### tokenizer for all data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    word_index = tokenizer.word_index
    with open('tokenizer.obj', 'wb') as file:
        pickle.dump(tokenizer, file)
    ### convert word sequences to index sequence
    print('Convert to index sequences.')
    train_sequences = tokenizer.texts_to_sequences(X_data)
    test_sequences = tokenizer.texts_to_sequences(X_test)

    train_bag = tokenizer.texts_to_matrix(X_data, mode='count')[:, :10000]
    test_bag = tokenizer.texts_to_matrix(X_test, mode='count')[:, :10000]
    '''
    ### padding to equal length
    print ('Padding sequences.')
    train_sequences = pad_sequences(train_sequences)
    max_article_length = train_sequences.shape[1]
    test_sequences = pad_sequences(test_sequences,maxlen=max_article_length)
    '''
    ###
    train_tag = to_multi_categorical(Y_data, tag_list)

    ### split data into training set and validation set
    #(X_train,Y_train),(X_val,Y_val) = split_data(train_bag,train_tag,split_ratio)##
    X_train = train_bag
    Y_train = train_tag

    print(X_train.shape)
    print(Y_train.shape)
    '''
    ### get embedding matrix from glove
    print ('Get embedding dict from glove.')
    embedding_dict = get_embedding_dict('glove.6B.%dd.txt'%embedding_dim)
    print ('Found %s word vectors.' % len(embedding_dict))
    num_words = len(word_index) + 1
    print ('Create embedding matrix.')
    embedding_matrix = get_embedding_matrix(word_index,embedding_dict,num_words,embedding_dim)
    '''
    ### build model
    print('Building model.')
    model = Sequential()
    '''
    model.add(Embedding(num_words,
                        embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_article_length,
                        trainable=False))
    '''
    #model.add(Flatten(input_shape=(max_article_length,embedding_dim)))
    model.add(Dense(128, input_shape=(X_train.shape[1], ), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(38, activation='sigmoid'))
    model.summary()

    adam = Adam(lr=0.001, decay=1e-6, clipvalue=1.)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=[f1_score])

    earlystopping = EarlyStopping(monitor='val_f1_score',
                                  patience=10,
                                  verbose=1,
                                  mode='max')
    checkpoint = ModelCheckpoint(filepath='best.hdf5',
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=False,
                                 monitor='val_f1_score',
                                 mode='max')
    hist = model.fit(X_train,
                     Y_train,
                     validation_split=split_ratio,
                     epochs=nb_epoch,
                     batch_size=batch_size,
                     callbacks=[earlystopping, checkpoint])

    best_model = load_model('best.hdf5', custom_objects={'f1_score': f1_score})
    Y_pred = best_model.predict(test_bag)
    thresh = 0.4
    with open(output_path, 'w') as output:
        print('\"id\",\"tags\"', file=output)
        Y_pred_thresh = (Y_pred > thresh).astype('int')

        for index, labels in enumerate(Y_pred_thresh):
            labels = [
                tag_list[i] for i, value in enumerate(labels) if value == 1
            ]
            labels_original = ' '.join(labels)
            print('\"%d\",\"%s\"' % (index, labels_original), file=output)
    print('prediction written.')
        train_labels.append(0)
    elif int(row[lbl_y])>=1 and imp >= split_trn  and imp < split_trn + split_tst:
        #test set for imparity 133 samples
        imp+=1
        test_texts.append(row['texto'].encode('utf-8').lower())
        test_labels.append(1)
    elif int(row[lbl_y])==0 and n_imp >= split_trn  and n_imp < split_trn + split_tst:
        #test set for not imparity 133 samples
        n_imp+=1
        test_texts.append(row['texto'].encode('utf-8').lower())
        test_labels.append(0)
tokenizer = Tokenizer(nb_words=max_features, filters=keras.preprocessing.text.base_filter(), lower=True, split=" ")
tokenizer.fit_on_texts(train_texts)
train_sequences = sequence.pad_sequences( tokenizer.texts_to_sequences( train_texts ) , maxlen=maxlen )
test_sequences = sequence.pad_sequences( tokenizer.texts_to_sequences( test_texts ) , maxlen=maxlen )
train_matrix = tokenizer.texts_to_matrix( train_texts )
test_matrix = tokenizer.texts_to_matrix( test_texts )
embedding_weights = np.zeros( ( max_features , embeddings_dim ) )
affective_weights = np.zeros( ( max_features , 3 ) )
for word,index in tokenizer.word_index.items():
  try: 
    if not affective.has_key(word) : affective[word] = np.array( model.predict( np.array( embedding[word] ).reshape(1, -1) )[0] )
  except: affective[word] = np.array( [ 5.0 , 5.0 , 5.0 ] )
  if index < max_features:
    try: 
      embedding_weights[index,:] = embeddings[word]
      affective_weights[index,:] = affective[word]
    except: 
      embedding_weights[index,:] = np.random.rand( 1 , embeddings_dim )
      affective_weights[index,:] = [ 5.0 , 5.0 , 5.0 ]
Example 14
#remove capital letters and punctuation from both datasets
traindata['Phrase'] = traindata['Phrase'].apply(lambda x: x.lower())
traindata['Phrase'] = traindata['Phrase'].apply(
    (lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x)))

testdata['Phrase'] = testdata['Phrase'].apply(lambda x: x.lower())
testdata['Phrase'] = testdata['Phrase'].apply(
    (lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x)))

train_sentences = traindata['Phrase'].values
train_labels = traindata['Sentiment'].values

#tokenize and pad train sentences
train_tokenizer = Tokenizer()
train_tokenizer.fit_on_texts(train_sentences)
train_sentences = train_tokenizer.texts_to_matrix(train_sentences)
train_sentences = pad_sequences(train_sentences, maxlen=300)
vocab_size = len(train_tokenizer.word_index) + 1

#use label encoder to turn train sentiment labels into categorical data
le = preprocessing.LabelEncoder()
train_labels = le.fit_transform(train_labels)
train_labels = to_categorical(train_labels)

#tokenize and pad test sentences
test_sentences = testdata["Phrase"]
test_tokenizer = Tokenizer()
test_tokenizer.fit_on_texts(test_sentences)
test_sentences = test_tokenizer.texts_to_matrix(test_sentences)
test_sentences = pad_sequences(test_sentences, maxlen=300)
                                            "../.."))
# load the vocabulary
vocab_filename = project_path + '/ml_model/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all training reviews
positive_lines = process_docs('../data/txt_sentoken/pos', vocab, True)
negative_lines = process_docs('../data/txt_sentoken/neg', vocab, True)
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
docs = negative_lines + positive_lines
tokenizer.fit_on_texts(docs)
# encode training data set
Xtrain = tokenizer.texts_to_matrix(docs, mode='freq')
ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])

# load all test reviews
positive_lines = process_docs('../data/txt_sentoken/pos', vocab, False)
negative_lines = process_docs('../data/txt_sentoken/neg', vocab, False)
docs = negative_lines + positive_lines
# encode test data set
Xtest = tokenizer.texts_to_matrix(docs, mode='freq')
ytest = array([0 for _ in range(100)] + [1 for _ in range(100)])

n_words = Xtest.shape[1]

# load json and create model

model_json_path = project_path + '/ml_model/model.json'
                                                              genres,
                                                              test_size=0.3,
                                                              random_state=42)
test_y_eval = np.copy(
    test_y)  # create copy of test array for use in model eval

# Build embeddings
print('\n[>>> Building word embeddings and class encodings...]')
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras import utils

max_words = 5000  # vocab limit
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_lyrics)  # word index lookup for vocab
train_x = tokenizer.texts_to_matrix(train_lyrics)
test_x = tokenizer.texts_to_matrix(test_lyrics)

# One-hot encode classes
encoder = LabelEncoder()
encoder.fit(train_y)
train_y = encoder.transform(train_y)
test_y = encoder.transform(test_y)
class_labels = list(np.unique((df0.genre)))
num_classes = len(class_labels) + 1  # 0 reserved for index
train_y = utils.to_categorical(train_y, num_classes)
test_y = utils.to_categorical(test_y, num_classes)

print('\n=== Word Embeddings & Class Encodings Complete ===')
print('--- Runtime =', timer(new_time, time.time()), '---')
new_time = time.time()
Example 17
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# We create a tokenizer, configured to only take
# into account the top-1000 most common words
tokenizer = Tokenizer(num_words=1000)
# This builds the word index
tokenizer.fit_on_texts(samples)

# This turns strings into lists of integer indices.
sequences = tokenizer.texts_to_sequences(samples)

# You could also directly get the one-hot binary representations.
# Note that other vectorization modes than one-hot encoding are supported!
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

# This is how you can recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# We will store our words as vectors of size 1000.
# Note that if you have close to 1000 words (or more)
# you will start seeing many hash collisions, which
# will decrease the accuracy of this encoding method.
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
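# A minimal sketch (an assumption, not part of the original excerpt) of how the
# manual hashing loop described in the comments above could continue: hash each
# word into one of `dimensionality` buckets and mark that slot in `results`.
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.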
Example 18
#print(input_name)

while True:

    # Get whether to pull or not
    done = firebase.get('/response', 'done')
    print(done)

    # User has finished entering commands, get new update and feed into network and post
    if done:
        # Get three query fields
        topic_json = firebase.get('/response', 'topic')
        topic = parse_json(topic_json)
        category_arr = text_to_word_sequence(topic)
        category = tok.texts_to_matrix(category_arr, mode='count')

        language_json = firebase.get('/response', 'language')
        language = parse_json(language_json)
        tech_arr = text_to_word_sequence(language)
        tech = tok2.texts_to_matrix(tech_arr, mode='count')

        platform_json = firebase.get('/response', 'platform')
        platform = parse_json(platform_json)
        tprogram_arr = text_to_word_sequence(platform)
        tprogram = tok3.texts_to_matrix(tprogram_arr, mode='count')

        # Get Predictor
        input_pred = np.zeros((1, 9))

        input_type_index = 0
Example 19
    return train, test


data = pd.read_csv("data/bbc-text.csv")
train_x, test_x = split_test_train(data["text"])
train_y, test_y = split_test_train(data["category"])

print("Training set size: {0}, Test set size: {1}".format(
    len(train_x), len(test_x)))

preprocess_start = time.time()
#Preprocess X
max_words = 1000
t = Tokenizer(num_words=max_words)
t.fit_on_texts(train_x)
train_one_hot_x = t.texts_to_matrix(train_x)  #, mode = 'count')
test_one_hot_x = t.texts_to_matrix(test_x)  #, mode = 'count')

#Preprocess Y
values = list(set(train_y))

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_y)
y_encoded = y_encoded.reshape(len(y_encoded), 1)
onehot_encoder = OneHotEncoder(sparse=False)
train_y_onehot = onehot_encoder.fit_transform(y_encoded)

y_encoded = label_encoder.fit_transform(test_y)
y_encoded = y_encoded.reshape(len(y_encoded), 1)
onehot_encoder = OneHotEncoder(sparse=False)
test_y_onehot = onehot_encoder.fit_transform(y_encoded)
Example 20
def main():

    ### read training and testing data
    (Y_data, X_data, tag_list) = read_data(train_path, True)
    (_, X_test, _) = read_data(test_path, False)
    all_corpus = X_data + X_test
    print('Found %d articles.' % (len(all_corpus)))

    ### tokenizer for all data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    word_index = tokenizer.word_index
    with open('tk', 'wb') as f:
        pickle.dump(tokenizer, f)
    ### convert word sequences to index sequence
    print('Convert to index sequences.')
    train_matrix = tokenizer.texts_to_matrix(X_data, mode='tfidf')
    test_matrix = tokenizer.texts_to_matrix(X_test, mode='tfidf')

    ### padding to equal length
    #print ('Padding sequences.')
    #train_sequences = pad_sequences(train_sequences)
    #max_article_length = train_sequences.shape[1]
    #test_sequences = pad_sequences(test_sequences,maxlen=max_article_length)

    ###
    train_tag = to_multi_categorical(Y_data, tag_list)

    ### split data into training set and validation set
    (X_train, Y_train), (X_val, Y_val) = split_data(train_matrix, train_tag,
                                                    split_ratio)
    #X_train = X_train.reshape((X_train.shape[0],1,X_train.shape[1]))
    print(Y_train.shape)
    print(X_train.shape)
    ### get embedding matrix from glove
    # print ('Get embedding dict from glove.')
    # embedding_dict = get_embedding_dict('glove/glove.6B.%dd.txt'%embedding_dim)
    # print ('Found %s word vectors.' % len(embedding_dict))
    # num_words = len(word_index) + 1
    # print ('Create embedding matrix.')
    # embedding_matrix = get_embedding_matrix(word_index,embedding_dict,num_words,embedding_dim)

    ### build model
    print('Building model.')

    for x in range(20):
        model = Sequential()
        print(x)
        model.add(Dense(512, activation='elu', input_dim=40587))
        model.add(Dropout(0.5))
        model.add(Dense(512, activation='tanh'))
        model.add(Dropout(0.5))
        model.add(Dense(512, activation='elu'))
        model.add(Dropout(0.5))
        model.add(Dense(512, activation='elu'))
        model.add(Dropout(0.5))
        # model.add(Dense(512,activation='elu'))
        # model.add(Dropout(0.5))
        # model.add(Dense(128,activation='elu'))
        # model.add(Dropout(0.5))
        model.add(Dense(38, activation='sigmoid'))
        model.summary()

        adam = Adam(lr=0.001, decay=1e-6, clipvalue=0.5)
        tmp = str(x) + '.hdf5'
        model.compile(loss='categorical_crossentropy',
                      optimizer=adam,
                      metrics=[f1_score])

        earlystopping = EarlyStopping(monitor='val_f1_score',
                                      patience=10,
                                      verbose=1,
                                      mode='max')
        checkpoint = ModelCheckpoint(filepath=tmp,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_f1_score',
                                     mode='max')

        hist = model.fit(X_train,
                         Y_train,
                         validation_data=(X_val, Y_val),
                         epochs=1000,
                         batch_size=batch_size,
                         callbacks=[earlystopping, checkpoint])

    Y_pred = model.predict(test_matrix)
    thresh = 0.4
    with open(output_path, 'w') as output:
        print('\"id\",\"tags\"', file=output)
        Y_pred_thresh = (Y_pred > thresh).astype('int')
        for index, labels in enumerate(Y_pred_thresh):
            labels = [
                tag_list[i] for i, value in enumerate(labels) if value == 1
            ]
            labels_original = ' '.join(labels)
            print('\"%d\",\"%s\"' % (index, labels_original), file=output)
Example 21
'''
list_x = pad_sequences(list_x, maxlen=seqlen,truncating='pre')  
pre_seq = pad_sequences(pre_seq, maxlen=seqlen,truncating='pre')
'''
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(pre_seq + list_x + nllist)
word_index = tokenizer.word_index
#
seq_test = tokenizer.texts_to_sequences(pre_seq)
print('seq_size=', len(seq_test))
seq_train = tokenizer.texts_to_sequences(list_x)
data1 = pad_sequences(seq_train, maxlen=31, truncating='pre')
test1 = pad_sequences(seq_test, maxlen=31, truncating='pre')
data = []
pre_data = []
data.append(tokenizer.texts_to_matrix(pre_seq, mode='binary'))
for i in range(4):
    print(trainlist[i])
    print(data[0])
    print(labels[i])
for i in range(10):
    print('test=', test1[i])
pre_data = tokenizer.sequences_to_matrix(test1, mode='binary')
#===============================================

#=======bulid model==========

labels = to_categorical(np.asarray(list_y))
for i in range(4):
    print(trainlist[i])
    print(data[i])
# In[ ]:


# Listing 6.3 Using Keras for word-level one-hot encoding

from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words = 1000)
tokenizer.fit_on_texts(samples)

sequences = tokenizer.texts_to_sequences(samples)

oneHotResults = tokenizer.texts_to_matrix(samples, mode = 'binary')

wordIndex = tokenizer.word_index
print('Found %s unique tokens.' % len(wordIndex))


# For data where the number of unique tokens is extremely large, we can use the *one-hot hashing trick*, which hashes words into vectors of a fixed size rather than assigning an explicit index to each word.
# 
# To avoid having multiple words assigned to the same hash (a *hash collision*), the dimensionality of the hashing space should be much larger than the total number of unique tokens.

# In[ ]:


# Listing 6.4 Word-level one-hot encoding with hashing trick

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
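# A hedged sketch (not the book's original listing): one way to hash words into a
# fixed-size space is Keras's own hashing_trick helper, which maps each word to a
# bucket index in [1, dimensionality); 'md5' keeps the mapping stable across runs.
# The dimensionality value below is an assumption mirroring the earlier example.
from keras.preprocessing.text import hashing_trick

dimensionality = 1000
hashed_sequences = [hashing_trick(sample, dimensionality, hash_function='md5')
                    for sample in samples]
print(hashed_sequences)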
Example 23
# using keras for word-level one-hot encoding
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

sequences = tokenizer.texts_to_sequences(samples)

one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

word_index = tokenizer.word_index
print(len(word_index))
print(sequences)
print(one_hot_results)
Example 24
    elif int(row[lbl_y]
             ) == 0 and n_imp >= split_trn and n_imp < split_trn + split_tst:
        #test set for not imparity 133 samples
        n_imp += 1
        test_texts.append(row['texto'].encode('utf-8').lower())
        test_labels.append(0)
tokenizer = Tokenizer(nb_words=max_features,
                      filters=keras.preprocessing.text.base_filter(),
                      lower=True,
                      split=" ")
tokenizer.fit_on_texts(train_texts)
train_sequences = sequence.pad_sequences(
    tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
test_sequences = sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)
train_matrix = tokenizer.texts_to_matrix(train_texts)
test_matrix = tokenizer.texts_to_matrix(test_texts)
embedding_weights = np.zeros((max_features, embeddings_dim))
affective_weights = np.zeros((max_features, 3))
for word, index in tokenizer.word_index.items():
    try:
        if not affective.has_key(word):
            affective[word] = np.array(
                model.predict(np.array(embedding[word]).reshape(1, -1))[0])
    except:
        affective[word] = np.array([5.0, 5.0, 5.0])
    if index < max_features:
        try:
            embedding_weights[index, :] = embeddings[word]
            affective_weights[index, :] = affective[word]
        except:
Example 25
from keras.layers import Embedding
import matplotlib.pyplot as plt
import numpy as np
from keras.callbacks import TensorBoard
from time import time

df = pd.read_csv('imdb_master.csv', encoding='latin-1')
print(df.head())
sentences = df['review'].values
y = df['label'].values

#tokenizing data
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(sentences)
#vectorizing the data into a bag-of-words matrix
sentences = tokenizer.texts_to_matrix(sentences)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(sentences,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=1000)

# Number of features
#print(input_dim)
model = Sequential()
# model.add(layers.Dense(300,input_dim=500, activation='relu'))
model.add(layers.Dense(100, activation='sigmoid'))

model.compile(loss='sparse_categorical_crossentropy',
Example 26
TextData = np.asarray(DM.get_arr(dataJS, TextTypes))
AnswerData = target

X = TextData
Y = DM.to_one_hot(AnswerData)

indices = DM.mixedIndex(X)
X = X[indices]
Y = Y[indices]

tokinizer = Tokenizer(num_words=3000)
tokinizer.fit_on_texts(X)
sequences = tokinizer.texts_to_sequences(X)

one_hot_results = tokinizer.texts_to_matrix(X, mode="binary")
X = np.array(one_hot_results)
X = np.asarray(X).astype('int')

model = models.Sequential()
model.add(layers.Dense(100, activation="relu", input_shape=(X.shape[1], )))
model.add(layers.Dropout(0.15))

model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.1))

#model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dense(3, activation='softmax'))