Example #1
def LoadSMILESData(duplicateProb = 0,seed=7):
    dataComp = dataset.LoadData('data',0)
    smiles = list(map(lambda x: x._SMILE, dataComp))
    tokenizer = Tokenizer(num_words=None, char_level=True)
    tokenizer.fit_on_texts(smiles)
    print(smiles[0])
    dictionary = {}
    i=0
    k=0
    for smile in smiles:
        i+=1
        for c in list(smile):
            k+=1
            if c in dictionary:
                dictionary[c]+=1
            else:
                dictionary[c]=1
    print(len(dictionary))
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(smiles)
    # pad sequences
    max_length = max([len(s) for s in smiles])
    vocab = {'C': 1, 'c': 2, '(': 3, ')': 4, 'O': 5, '=': 6, '1': 7, 'N': 8, '2': 9, '3': 10, '[': 11, ']': 12, 'F': 13, '4': 14, 'l': 15, 'n': 16, 'S': 17, '@': 18, 'H': 19, '5': 20, '+': 21, '-': 22, 'B': 23, 'r': 24, '\\': 25, '#': 26, '6': 27, '.': 28, '/': 29, 's': 30, 'P': 31, '7': 32, 'i': 33, 'o': 34, '8': 35, 'I': 36, 'a': 37, '%': 38, '9': 39, '0': 40, 'K': 41, 'e': 42, 'A': 43, 'g': 44, 'p': 45, 'M': 46, 'T': 47, 'b': 48, 'd': 49, 'V': 50, 'Z': 51, 'G': 52, 'L': 53}
    Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define vocabulary size (largest integer value)
    labels = list(map(lambda x: 1 if x.mutagen==True else 0,dataComp))
    return Xtrain, labels,vocab,max_length
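# For reference, the char-level tokenization used above can be reproduced in
# isolation. A minimal sketch with toy SMILES strings standing in for the
# external `dataset` module:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

toy_smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']
toy_tokenizer = Tokenizer(num_words=None, char_level=True)
toy_tokenizer.fit_on_texts(toy_smiles)
toy_encoded = toy_tokenizer.texts_to_sequences(toy_smiles)
toy_max_length = max(len(s) for s in toy_smiles)
toy_padded = pad_sequences(toy_encoded, maxlen=toy_max_length, padding='post')
print(toy_padded.shape)  # (3, 8)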
def get_data_1(train_sents, maxlen):
    word_list = []
    for i in range(len(train_sents)):
        for words in train_sents[i]:
            word_list.append(words)
    
    sequence=[]
    stride=1
    # applying windowing for sequence generation

    for i in range(0,len(word_list)-maxlen,stride):
        line=word_list[i:i+maxlen]
        sequence.append(line)
    
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(sequence)
    seq=tokenizer.texts_to_sequences(sequence)
    vocab_len=len(tokenizer.word_index.items())+1
    
    seq=np.array(seq)
    x_train=seq[:,:-1]
    y_train=np.zeros((x_train.shape[0],x_train.shape[1],1))
    for i in range(x_train.shape[0]):
        for j in range(x_train.shape[1]):
            y_train[i,j,0]=seq[i,j+1]
        
    return x_train,y_train,vocab_len,tokenizer
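# A minimal usage sketch for get_data_1, assuming numpy and the Keras Tokenizer
# are imported as in the function above; the toy sentences are illustrative only.
toy_sents = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
             ['the', 'dog', 'ate', 'my', 'homework']]
x_train, y_train, vocab_len, tok = get_data_1(toy_sents, maxlen=4)
print(x_train.shape, y_train.shape, vocab_len)  # (7, 3) (7, 3, 1) 9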
Example #3
def train(dataReader, oneHot, oneHotAveraged, contextHashes):
	n = (Epochs + 1) * SamplesPerEpoch  # TODO + 1 should not be needed

	tokeniser = Tokenizer(nb_words=MaxWords)
	tokeniser.fit_on_texts((row[0] for row in dataReader.trainingData(n)))

	# `word_index` maps each word to its unique index
	dictionarySize = len(tokeniser.word_index) + 1

	oneHotDimension        = (1 if oneHotAveraged else SequenceLength) * dictionarySize if oneHot else 0
	contextHashesDimension = dictionarySize * 2 if contextHashes else 0

	model = Sequential()
	model.add(Dense(EmbeddingDim, input_dim=(oneHotDimension + contextHashesDimension)))
	model.add(Dense(Labels, activation='softmax'))
	model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

	trainingGenerator   = mapGenerator(dataReader.trainingData(n),   tokeniser, dictionarySize, oneHot, oneHotAveraged, contextHashes)
	validationGenerator = mapGenerator(dataReader.validationData(n), tokeniser, dictionarySize, oneHot, oneHotAveraged, contextHashes)

	model.fit_generator(trainingGenerator,
		nb_epoch=Epochs,
		samples_per_epoch=SamplesPerEpoch,
		validation_data=validationGenerator,
		nb_val_samples=SamplesPerEpoch)

	model2 = Sequential()
	model2.add(Dense(EmbeddingDim, input_dim=(oneHotDimension + contextHashesDimension), weights=model.layers[0].get_weights()))

	return model, model2, tokeniser, dictionarySize
def get_train_val_matrix(texts, labels, max_features=10000, max_len=100):
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(texts)
    sequens = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print(f'Found {len(word_index)} unique tokens')

    data = pad_sequences(sequens, maxlen=max_len)

    labels = np.asarray(labels)

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)

    data = data[indices]
    labels = labels[indices]

    train_sample_n = 20000
    validation_sample_n = 5000

    x_train = data[:train_sample_n]
    x_val = data[train_sample_n:validation_sample_n+train_sample_n]
    y_train = labels[:train_sample_n]
    y_val = labels[train_sample_n:validation_sample_n+train_sample_n]

    return (x_train, y_train), (x_val, y_val), word_index
Example #5
 def read_copus_generator(self, batch_size=64):
     """ return a generator with the specified batch_size
     """
     logger.info("Beigin read copus {0}".format(file_name))
     data = []
     index = 0
     with open(file_name, 'r') as fread:
         while True:
             line = fread.readline()
             if not line:
                 # readline() returns '' at EOF, so break explicitly
                 logger.info("Read End")
                 break
             data.append(line)
             index += 1
             if index % 100000 == 0:
                 logger.info("The program has processed {0} lines ".
                             format(index))
     tokenizer = Tokenizer(nb_words=30000)
     tokenizer.fit_on_texts(data)
     logger.info("word num: {0}".format(len(tokenizer.word_counts)))
     sorted_word_counts = sorted(
         tokenizer.word_counts.items(),
         key=operator.itemgetter(1),
         reverse=True)
     # save the word_counts to the meta
     with open(file_name.replace("train.", "meta."), "w") as fwrite:
         for word_cnt in sorted_word_counts:
             key = word_cnt[0]
             val = word_cnt[1]
             line = key + ":" + str(val) + "\n"
             fwrite.write(line)
     vectorize_data = tokenizer.texts_to_matrix(data)
     return vectorize_data
Example #6
class SequenceTransformer(BaseEstimator, TransformerMixin):
    " Transforms np array of strings into sequences"

    def __init__(self, analyzer='word', max_features=10000, max_len=100):
        self.max_len = max_len
        self.analyzer = analyzer
        self.max_features = max_features
    

    def transform(self, X, y=None):

        try:
            getattr(self, "transformer_")
        except AttributeError:
            raise RuntimeError("You must fit transformer before using it!")

        X_seq = self.transformer_.texts_to_sequences(list(X))
        X_seq = sequence.pad_sequences(X_seq, maxlen=self.max_len)
        return X_seq


    def fit(self, X, y=None):

        if self.analyzer == 'char':
            char_level = True
        elif self.analyzer == 'word':
            char_level = False
        else:
            print("invalid analyzer")
            return

        self.transformer_ = Tokenizer(nb_words=self.max_features, lower=True, char_level = char_level)
        self.transformer_.fit_on_texts(X)

        return self
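# A quick usage sketch for SequenceTransformer with toy strings (assumes the
# same `sequence` and `Tokenizer` imports the class itself relies on):
st = SequenceTransformer(analyzer='word', max_features=5000, max_len=10)
st.fit(["a tiny corpus of text", "another short document"])
print(st.transform(["a short text"]).shape)  # (1, 10)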
def get_fitted_tokenizer(df_train, df_test):
    comments_train = df_train[COMMENT_COL].values.tolist()
    comments_test = df_test[COMMENT_COL].values.tolist()
    tokenizer = Tokenizer()
    # tokenizer.num_words = MAX_NUM_WORDS
    tokenizer.fit_on_texts(comments_train + comments_test)
    return tokenizer
def prepare_tokenizer(words):
    '''
        function to generate a vocabulary for the given list of words
        implemented by Anindya
        @param
        words => the list of words to be tokenized
    '''
    # obtain a tokenizer
    t = Tokenizer(filters = '') # don't let keras ignore any words
    t.fit_on_texts(words)
    field_dict = dict(); rev_field_dict = dict()

    for key,value in t.word_index.items():
        field_dict[value] = key
        rev_field_dict[key] = value

    vocab_size = len(t.word_index) + 1

    ''' Small modification from Animesh
        # also add the '<unk>' token to the dictionary at 0th position
    '''
    field_dict[0] = '<unk>'; rev_field_dict['<unk>'] = 0

    #print (vocab_size)
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(words)

    # print "debug: " + str(encoded_docs)

    # print(padded_docs)
    return np.array(encoded_docs), field_dict, rev_field_dict, vocab_size
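# A toy call to prepare_tokenizer (hypothetical two-document list) to show the
# shape of its outputs:
docs = ['hello world', 'hello keras']
encoded, field_dict, rev_field_dict, vocab_size = prepare_tokenizer(docs)
print(encoded.shape, vocab_size)               # (2, 2) 4
print(field_dict[1], rev_field_dict['hello'])  # hello 1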
Example #9
def mlp_model(X_train, y_train, X_test, y_test):
    tokenizer = Tokenizer(nb_words=1000)
    nb_classes = np.max(y_train) + 1

    X_train = tokenizer.sequences_to_matrix(X_train, mode="freq")
    X_test = tokenizer.sequences_to_matrix(X_test, mode="freq")

    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    print("Building model...")
    model = Sequential()
    model.add(Dense(512, input_shape=(max_len,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode='categorical')

    history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, show_accuracy=True, validation_split=0.1)
    model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
    # print('Test score:', score[0])
    # print('Test accuracy:', score[1])
    pred_labels = model.predict_classes(X_test)
    # print pred_labels
    # print y_test
    accuracy = accuracy_score(y_test, pred_labels)
    precision, recall, f1, supp = precision_recall_fscore_support(y_test, pred_labels, average='weighted')
    print(precision, recall, f1, supp)

    return accuracy, precision, recall, f1
Example #10
def test_tokenizer_unicode():
    texts = [u'ali veli kırk dokuz elli',
             u'ali veli kırk dokuz elli veli kırk dokuz']
    tokenizer = Tokenizer(num_words=5)
    tokenizer.fit_on_texts(texts)

    assert len(tokenizer.word_counts) == 5
Example #11
class Featurizer:

    max_words = None
    tokenizer = None

    def __init__(self, max_words=1000):
        self.max_words = max_words
        self.tokenizer = Tokenizer(num_words=max_words)

    def fit_transform(self, data):
        texts = [l['text'] for l in data]
        self.tokenizer.fit_on_texts(texts)
        # remove words that cross the max_words limit
        self.tokenizer.word_index = {k: v for k, v in self.tokenizer.word_index.items() if v <= self.max_words}
        return self.transform(data)

    def transform(self, data):
        texts = [l['text'] for l in data]
        return self.tokenizer.texts_to_matrix(texts, mode='binary')

    def transform_inv(self, m):
        index = {v: k for k, v in self.tokenizer.word_index.items()} # word index by id
        return [[index.get(i) for i in np.nonzero(line)[0] if i in index] for line in m]

    def save(self, filepath):
        with open(filepath + '_word_index.json', 'w') as f:
            f.write(json.dumps(self.tokenizer.word_index))

    @classmethod
    def load(cls, filepath):
        with open(filepath + '_word_index.json', 'r') as f:
            word_index = json.load(f)
            c = cls(max_words=len(word_index))
            c.tokenizer.word_index = word_index
            return c
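# A small round-trip sketch for the Featurizer class, using toy records shaped
# like the {'text': ...} dicts it expects:
data = [{'text': 'good movie'}, {'text': 'bad movie'}]
feat = Featurizer(max_words=10)
m = feat.fit_transform(data)
print(m.shape)                # (2, 10)
print(feat.transform_inv(m))  # [['movie', 'good'], ['movie', 'bad']]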
def preprocess_embedding():
    corpus_train, target, filenames = get_corpus()
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_train)
    sequences = tokenizer.texts_to_sequences(corpus_train)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    MAX_SEQUENCE_LENGTH = 50
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('/home/flippped/Desktop/xiangmu/baseline/GoogleNews-vectors-negative300.bin.gz', binary=True)
    word2vec_model.init_sims(replace=True)

    # create one matrix for documents words
    EMBEDDING_DIM = 300
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    print(embedding_matrix.shape)
    for word, i in word_index.items():
            try:
                embedding_vector = word2vec_model[str(word)]
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    embedding_matrix[i] = embedding_vector

            except:
                continue


    return data,target,filenames,embedding_matrix, word_index
def get_fitted_tokenizer(df_train, df_test):
    comments_train = df_train[COMMENT_COL].values.tolist()
    comments_test = df_test[COMMENT_COL].values.tolist()
    # remain '!' and '?'
    tokenizer = Tokenizer(filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(comments_train + comments_test)
    return tokenizer
Example #14
def preproc_for_sklearn(X, y, nb_features):
    try:
        tokenizer = Tokenizer(num_words=nb_features)
    except TypeError:
        # older Keras versions expect the nb_words keyword instead of num_words
        tokenizer = Tokenizer(nb_words=nb_features)
    X = tokenizer.sequences_to_matrix(X, mode='binary')

    return X, y
Example #15
def tokenaize(train_path, dev_path):
    with open(train_path) as fd:
        data = fd.read()
    with open(dev_path) as fd:
        data += fd.read()
    tokenizer = Tokenizer(split='\t', oov_token='<UNK>')
    tokenizer.fit_on_texts([data])
    return tokenizer
Example #16
def tokenizeAndGenerateIndex(texts):
    tokenizer = Tokenizer(nb_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=maxlen,padding='post')
    return data
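# tokenizeAndGenerateIndex reads the module-level vocab_size and maxlen; a
# sketch with illustrative values:
vocab_size, maxlen = 10000, 20
data = tokenizeAndGenerateIndex(['first tiny document', 'second tiny document here'])
print(data.shape)  # (2, 20)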
Example #17
    def handle(self, *args, **options):
        ptt = PTT.objects.all()
        ptt_json = PTTSerializer(ptt, many=True).data
        user_comments_times = dict()
        labels_index = 2
        labels = []
        texts = []
        for article in ptt_json:
            pointer = 1 if article['score'] > 0 else 0
            words = jieba.cut(article['contents'])
            for word in words:
                labels.append(pointer)
                texts.append(word)
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
        labels = to_categorical(np.asarray(labels))
        print('Shape of data tensor:', data.shape)
        print('Shape of label tensor:', labels.shape)
        print('Token word index:', tokenizer.word_index)
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])

        x_train = data[:-nb_validation_samples]
        y_train = labels[:-nb_validation_samples]
        x_val = data[-nb_validation_samples:]
        y_val = labels[-nb_validation_samples:]


        print('Training model.')

        # train a 1D convnet with global maxpooling
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
        x = Embedding(output_dim=100, input_dim=len(tokenizer.word_index), input_length=self.MAX_SEQUENCE_LENGTH)(sequence_input)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(35)(x)
        x = Flatten()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(labels_index, activation='softmax')(x)
        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])

        # happy learning!
        model.fit(x_train, y_train, validation_data=(x_val, y_val),
                  nb_epoch=2, batch_size=64)
        score = model.evaluate(x_val, y_val, verbose=0)
        print('Test score:', score[0])
        print('Test accuracy:', score[1])
Example #18
def question_to_input(df_q1,df_q2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_q1 + df_q2)
    encoded_1 = tokenizer.texts_to_sequences(df_q1)
    encoded_2 = tokenizer.texts_to_sequences(df_q2)
    question_input_train = sequence.pad_sequences(encoded_1, maxlen=15)
    question_input_test = sequence.pad_sequences(encoded_2, maxlen=15)

    return question_input_train,question_input_test
def fit_tokenizer(fname, open_encoding='utf-8'):
    file = open(fname, 'r', encoding=open_encoding)
    text = file.read()
    file.close()
    texts = [text]
    # do not filter out low-frequency words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer
def get_tokenizer(train_comments, nwords):
    print("getting tokenizer..")
    
    t = Tokenizer(num_words=nwords)
    texts = train_comments
    t.fit_on_texts(texts)
    sequences = t.texts_to_sequences(texts)
    
    return (t,sequences)
Example #21
def tokenize(texts, texts_train, texts_test):
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    sequences_train = tokenizer.texts_to_sequences(texts_train)
    sequences_test = tokenizer.texts_to_sequences(texts_test)
    return word_index, sequences_train, sequences_test
def keras_classify(df):
    # preprocessing: map the words in each text to integer ids
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing import sequence
    from keras.callbacks import EarlyStopping
    from sklearn.cross_validation import train_test_split

    print "----- Classification by Keras -----"
    max_features = 50000  # keep only the most frequent words
    # the Tokenizer can only handle str, not unicode
    textraw = map(lambda x: x.encode('utf-8'), df.seg_word.values.tolist())
    token = Tokenizer(nb_words=max_features)
    # since df.seg_word is space-separated, the Tokenizer can split it the same way it splits English text
    token.fit_on_texts(textraw)
    # token records each word's index and count; here the word ids replace the word text from textraw
    # e.g. textraw = ['a b c', 'c d e f']  ==> text_seq = [[1, 2, 3], [3, 4, 5, 6]]
    text_seq = token.texts_to_sequences(textraw)
    nb_classes = len(np.unique(df.label.values))
    print "num of features(vocabulary): ", len(token.word_counts)
    print "num of labels: ", nb_classes
    max_sent_len = np.max([len(s) for s in text_seq])
    print "max length or document is: ", max_sent_len
    median_sent_len = np.median([len(s) for s in text_seq])
    print "median length or document is: ", median_sent_len

    # .values on df.label must not be omitted, otherwise np_utils.to_categorical below will fail
    train_X, test_X, train_y, test_y = train_test_split(text_seq, df.label.values, train_size=0.7, random_state=1)
    # train_X & test_X still have ragged lengths (one document per row); pad them into equal-length matrices before training
    seqlen = int(max_sent_len / 2 + median_sent_len / 2)
    X_train = sequence.pad_sequences(train_X, maxlen=seqlen, padding='post', truncating='post')
    X_test = sequence.pad_sequences(test_X, maxlen=seqlen, padding='post', truncating='post')
    # expand y into one-hot form so the network can end with a softmax
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    model = build_cnn_model(max_features, seqlen, nb_classes)
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    # train up to 10 epochs with mini-batches of 32; earlystop checks whether the model is already good enough
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=10, validation_split=0.1, callbacks=[earlystop])
    evaluate(earlystop.model, X_test, Y_test, test_y)

    model = build_lstm_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    model = build_mixed_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    graph = build_graph_model(max_features, seqlen, nb_classes)
    graph.fit({'input': X_train, 'output': Y_train}, nb_epoch=3, batch_size=32, validation_split=0.1)
    predict = graph.predict({'input': X_test}, batch_size=32)
    predict = predict['output']
    classes = predict.argmax(axis=1)
    acc = np_utils.accuracy(classes, test_y)
    print('Test accuracy: ', acc)
def df2seq(df, nb_words):

    textraw = df.EssayText.values.tolist()
    textraw = [line.encode('utf-8') for line in textraw]  # keras needs str

    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    text_seq = token.texts_to_sequences(textraw)
    return(text_seq, df.Score1.values)
Example #24
def tokenizeAndGenerateIndex(train, test, maxFeatures, maxLength):
  merged = np.concatenate([train, test])
  tokenizer = Tokenizer(nb_words=maxFeatures)
  tokenizer.fit_on_texts(merged)
  sequences_train = tokenizer.texts_to_sequences(train)
  sequences_test = tokenizer.texts_to_sequences(test)
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))
  data_train = pad_sequences(sequences_train, maxlen=maxLength)
  data_test = pad_sequences(sequences_test, maxlen=maxLength)
  return data_train, data_test, word_index
Example #25
 def word_to_index(self, text, tok=None):
     real_text = [' '.join(z) for z in text]
     if tok is None:
         tokenizer = Tokenizer(lower=False, filters=" ")
         tokenizer.fit_on_texts(real_text)
     else:
         tokenizer = tok
     # no loop is needed here; just pass the list of sentences (str) directly
     sequences = tokenizer.texts_to_sequences(real_text)
     # tokenizer.word_docs.items()
     return sequences, tokenizer
def prepare_tokenizer(words, max_word_length = None):
    '''
        function to generate a vocabulary for the given list of words
        implemented by Anindya
        @param
        words => the list of words to be tokenized
    '''
    # flatten the words list:
    print("flattening the words into a single sequence ... ")
    flat_words = []; # initialize to empty list
    for i in range(len(words)):
        flat_words += words[i]
        if(i % 10000 == 0):
            print("joined", i, "examples")

    # obtain a tokenizer
    print("\nmaximum words to work with: ", max_word_length)
    t = Tokenizer(num_words = max_word_length, filters = '') # don't let keras ignore any words
    print("\nKeras's tokenizer kicks off ... ")
    t.fit_on_texts(flat_words)
    field_dict = dict(); rev_field_dict = dict()

    print("\nbuilding the dict and the rev_dict ... ")
    if(max_word_length is not None):
        vals = t.word_index.items()
        vals = sorted(vals, key=lambda x: x[1])
        for key,value in vals[:max_word_length - 1]:
            field_dict[value] = key
            rev_field_dict[key] = value
    else:
        for key,value in t.word_index.items():
            field_dict[value] = key
            rev_field_dict[key] = value


    ''' Small modification from Animesh
        # also add the '<unk>' token to the dictionary at 0th position
    '''
    field_dict[0] = '<unk>'; rev_field_dict['<unk>'] = 0


    print("\nencoding the words using the dictionary ... ")
    for i in range(len(words)):
        for j in range(len(words[i])):
            if(words[i][j] in rev_field_dict):
                words[i][j] = rev_field_dict[words[i][j]]
            else:
                words[i][j] = rev_field_dict['<unk>']

        if(i % 10000 == 0):
            print("encoded", i, "examples")

    vocab_size = len(field_dict)
    return words, field_dict, rev_field_dict, vocab_size
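# Toy usage of this prepare_tokenizer variant (hypothetical nested word lists;
# max_word_length caps the vocabulary and unseen words map to '<unk>'):
sentences = [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]
enc, field_dict, rev_field_dict, vocab_size = prepare_tokenizer(sentences, max_word_length=4)
print(enc, vocab_size)  # [[1, 2, 3], [1, 0, 0]] 4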
Example #27
def save_tokenizer(question1, question2):
    questions = question1 + question2
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(questions)
    word_index = tokenizer.word_index
    print("Words in index: %d" % len(word_index))

    # save tokenizer
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return word_index
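# Complementary sketch: reloading the tokenizer pickled by save_tokenizer above.
import pickle

with open('tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)
print(len(loaded_tokenizer.word_index))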
def load_mr(nb_words=20000, maxlen=64, embd_type='self'):
    """
    :param embd_type: self vs. w2v
    :return:
    """

    train_size = 0.8

    df = pickled2df('data/mr.p')
    print(df.head())

    train_X, test_X, train_y, test_y = train_test_split(df.text.values.tolist(),
                                                        df.label.values,
                                                        train_size=train_size, random_state=1)
    train_X_wds = train_X
    test_X_wds = test_X

    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test  = np_utils.to_categorical(test_y, nb_classes)

    # tokenization should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)

    textraw = [line.encode('utf-8') for line in train_X+test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print('nb_words: ',len(token.word_counts))
    print('mean len: ',np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if(embd_type == 'self'):
        X_train = xcol_nninput_embd(train_X, nb_words, maxlen)
        X_test  = xcol_nninput_embd(test_X,  nb_words, maxlen)
    elif(embd_type == 'w2v'):
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test  = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')

    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
Example #29
def NNclassify(X_train,X_test,y_train,y_test,inputtype):
	classtype="gender"
	max_words=10000
	batch_size=32
	nb_epoch=20
	if inputtype=='categorical':
		nb_epoch=10
		classtype="age"

	print('Loading data...')
	print(len(X_train), 'train instances')
	print(len(X_test), 'test instances')

	nb_classes = np.max(y_train)+1
	print(nb_classes, 'classes')

	print('Vectorizing sequence data...')
	tokenizer = Tokenizer(nb_words=max_words)
	X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
	X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
	print('X_train shape:', X_train.shape)
	print('X_test shape:', X_test.shape)

	print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
	Y_train = np_utils.to_categorical(y_train, nb_classes)
	Y_test = np_utils.to_categorical(y_test, nb_classes)
	print('Y_train shape:', Y_train.shape)
	print('Y_test shape:', Y_test.shape)

	print('Building model...')
	model = Sequential()

	model.add(MaxoutDense(100, input_shape=(max_words,)))
	model.add(Dropout(0.7))
	model.add(Dense(nb_classes,init='uniform'))
	model.add(Activation('softmax'))

	model.compile(loss='categorical_crossentropy', optimizer='adam',class_mode=inputtype)
	history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, show_accuracy=True, validation_split=0.1)
	score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
	print('Test score:', score[0])
	print('Test accuracy:', score[1])

	prediction=model.predict(X_test, batch_size=batch_size, verbose=1)
	pred_classes = np.argmax(prediction, axis=1)
	print(Counter(pred_classes))

	results=open('results.txt', 'a')
	results.write("{} \t {} features \t {} epochs \t {} batch size \t {} accuracy \n".format(classtype, max_words, nb_epoch, batch_size,score[1]))
	results.close()

	return pred_classes
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type):

    train_df = pd.read_csv(traincsv)
    test_df = pd.read_csv(testcsv)
    print(train_df.head())

    train_X = train_df.text.values.tolist()
    test_X = test_df.text.values.tolist()

    # save for w2v embd
    train_X_wds = train_X
    test_X_wds = test_X

    train_y = train_df.label.values
    test_y  = test_df.label.values
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    # tokenization should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)

    textraw = [line.encode('utf-8') for line in train_X+test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print('nb_words: ', len(token.word_counts))
    print('mean len: ', np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if(embd_type == 'self'):
        X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding='post', truncating='post')
        X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding='post', truncating='post')
    elif(embd_type == 'w2v'):
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')

    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return(X_train, Y_train, X_test, Y_test, nb_classes)
Example #31
def evaluate(model):
    print("Processing", QUESTION_PAIRS_FILE)

    question1 = []
    question2 = []
    is_duplicate = []
    with open(QUESTION_PAIRS_FILE, encoding='utf-8') as jsondata:
        file = json.load(jsondata)
        for row in file:
            if row['is_duplicate'] in (0, 1):
                question1.append(row['question1'])
                question2.append(row['question2'])
                is_duplicate.append(row['is_duplicate'])

    print('Question pairs: %d' % len(question1))
    questions = question1 + question2
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(questions)
    question1_word_sequences = tokenizer.texts_to_sequences(question1)
    question2_word_sequences = tokenizer.texts_to_sequences(question2)

    q1_data = pad_sequences(question1_word_sequences,
                            maxlen=MAX_SEQUENCE_LENGTH)
    q2_data = pad_sequences(question2_word_sequences,
                            maxlen=MAX_SEQUENCE_LENGTH)

    X = np.stack((q1_data, q2_data), axis=1)
    y = np.array(is_duplicate, dtype=int)
    Q1_test = X[:, 0]
    Q2_test = X[:, 1]
    results = model.predict([Q1_test, Q2_test], batch_size=32, verbose=0)

    #loss, accuracy = model.evaluate([Q1_test, Q2_test], y, verbose=0)
    #print('Test loss = {0:.4f}, test accuracy = {1:.4f}'.format(loss, accuracy))

    print("Finishing predict")
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(len(is_duplicate)):
        if i % 10000 == 0: print(i)

        if round(results[i][0]) == 1 and is_duplicate[i] == 1:
            TP += 1
        elif round(results[i][0]) == 0 and is_duplicate[i] == 0:
            TN += 1
        elif round(results[i][0]) == 0 and is_duplicate[i] == 1:
            FN += 1
        elif round(results[i][0]) == 1 and is_duplicate[i] == 0:
            FP += 1

    N = len(is_duplicate)
    accuracy = (TP + TN) / N
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = (2 * precision * recall) / (precision + recall)
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 score: ", f1)
Example #32
class textgenrnn:
    META_TOKEN = '<s>'
    config = {
        'rnn_layers': 2,
        'rnn_size': 128,
        'rnn_bidirectional': False,
        'max_length': 40,
        'max_words': 10000,
        'dim_embeddings': 100,
        'word_level': False,
        'single_text': False
    }
    default_config = config.copy()

    def __init__(self,
                 weights_path=None,
                 vocab_path=None,
                 config_path=None,
                 name="textgenrnn"):

        if weights_path is None:
            weights_path = resource_filename(__name__,
                                             'textgenrnn_weights.hdf5')

        if vocab_path is None:
            vocab_path = resource_filename(__name__, 'textgenrnn_vocab.json')

        if config_path is not None:
            with open(config_path, 'r', encoding='utf8',
                      errors='ignore') as json_file:
                self.config = json.load(json_file)

        self.config.update({'name': name})
        self.default_config.update({'name': name})

        with open(vocab_path, 'r', encoding='utf8',
                  errors='ignore') as json_file:
            self.vocab = json.load(json_file)

        self.tokenizer = Tokenizer(filters='', lower=False, char_level=True)
        self.tokenizer.word_index = self.vocab
        self.num_classes = len(self.vocab) + 1
        self.model = textgenrnn_model(self.num_classes,
                                      cfg=self.config,
                                      weights_path=weights_path)
        self.indices_char = dict((self.vocab[c], c) for c in self.vocab)

    def generate(self,
                 n=1,
                 return_as_list=False,
                 prefix=None,
                 temperature=0.5,
                 max_gen_length=300,
                 interactive=False,
                 top_n=3):
        gen_texts = []
        for _ in range(n):
            gen_text = textgenrnn_generate(
                self.model, self.vocab, self.indices_char, prefix, temperature,
                self.config['max_length'],
                self.META_TOKEN, self.config['word_level'],
                self.config.get('single_text',
                                False), max_gen_length, interactive, top_n)
            if not return_as_list:
                print("{}\n".format(gen_text))
            gen_texts.append(gen_text)
        if return_as_list:
            return gen_texts

    def generate_samples(self, n=3, temperatures=[0.2, 0.5, 1.0], **kwargs):
        for temperature in temperatures:
            print('#' * 20 + '\nTemperature: {}\n'.format(temperature) +
                  '#' * 20)
            self.generate(n, temperature=temperature, **kwargs)

    def train_on_texts(self,
                       texts,
                       context_labels=None,
                       batch_size=128,
                       num_epochs=50,
                       verbose=1,
                       new_model=False,
                       gen_epochs=1,
                       train_size=1.0,
                       max_gen_length=300,
                       validation=True,
                       dropout=0.0,
                       via_new_model=False,
                       save_epochs=0,
                       multi_gpu=False,
                       **kwargs):

        if new_model and not via_new_model:
            self.train_new_model(texts,
                                 context_labels=context_labels,
                                 num_epochs=num_epochs,
                                 gen_epochs=gen_epochs,
                                 batch_size=batch_size,
                                 dropout=dropout,
                                 validation=validation,
                                 save_epochs=save_epochs,
                                 multi_gpu=multi_gpu,
                                 **kwargs)
            return

        if context_labels:
            context_labels = LabelBinarizer().fit_transform(context_labels)

        if 'prop_keep' in kwargs:
            train_size = kwargs['prop_keep']

        if self.config['word_level']:
            texts = [text_to_word_sequence(text, filters='') for text in texts]

        # calculate all combinations of text indices + token indices
        indices_list = [
            np.meshgrid(np.array(i), np.arange(len(text) + 1))
            for i, text in enumerate(texts)
        ]
        indices_list = np.block(indices_list)

        # If a single text, there will be 2 extra indices, so remove them
        # Also remove first sequences which use padding
        if self.config['single_text']:
            indices_list = indices_list[self.config['max_length']:-2, :]

        indices_mask = np.random.rand(indices_list.shape[0]) < train_size

        if multi_gpu:
            num_gpus = len(K.tensorflow_backend._get_available_gpus())
            batch_size = batch_size * num_gpus

        gen_val = None
        val_steps = None
        if train_size < 1.0 and validation:
            indices_list_val = indices_list[~indices_mask, :]
            gen_val = generate_sequences_from_texts(texts, indices_list_val,
                                                    self, context_labels,
                                                    batch_size)
            val_steps = max(
                int(np.floor(indices_list_val.shape[0] / batch_size)), 1)

        indices_list = indices_list[indices_mask, :]

        num_tokens = indices_list.shape[0]
        assert num_tokens >= batch_size, "Fewer tokens than batch_size."

        level = 'word' if self.config['word_level'] else 'character'
        print("Training on {:,} {} sequences.".format(num_tokens, level))

        steps_per_epoch = max(int(np.floor(num_tokens / batch_size)), 1)

        gen = generate_sequences_from_texts(texts, indices_list, self,
                                            context_labels, batch_size)

        base_lr = 4e-3

        # scheduler function must be defined inline.
        def lr_linear_decay(epoch):
            return (base_lr * (1 - (epoch / num_epochs)))

        if context_labels is not None:
            if new_model:
                weights_path = None
            else:
                weights_path = "{}_weights.hdf5".format(self.config['name'])
                self.save(weights_path)

            self.model = textgenrnn_model(self.num_classes,
                                          dropout=dropout,
                                          cfg=self.config,
                                          context_size=context_labels.shape[1],
                                          weights_path=weights_path)

        model_t = self.model

        if multi_gpu:
            # Do not locate model/merge on CPU since sample sizes are small.
            parallel_model = multi_gpu_model(self.model,
                                             gpus=num_gpus,
                                             cpu_merge=False)
            parallel_model.compile(loss='categorical_crossentropy',
                                   optimizer=RMSprop(lr=4e-3, rho=0.99))

            model_t = parallel_model
            print("Training on {} GPUs.".format(num_gpus))

        model_t.fit_generator(gen,
                              steps_per_epoch=steps_per_epoch,
                              epochs=num_epochs,
                              callbacks=[
                                  LearningRateScheduler(lr_linear_decay),
                                  generate_after_epoch(self, gen_epochs,
                                                       max_gen_length),
                                  save_model_weights(self, num_epochs,
                                                     save_epochs)
                              ],
                              verbose=verbose,
                              max_queue_size=10,
                              validation_data=gen_val,
                              validation_steps=val_steps)

        # Keep the text-only version of the model if using context labels
        if context_labels is not None:
            self.model = Model(inputs=self.model.input[0],
                               outputs=self.model.output[1])

    def train_new_model(self,
                        texts,
                        context_labels=None,
                        num_epochs=50,
                        gen_epochs=1,
                        batch_size=128,
                        dropout=0.0,
                        validation=True,
                        save_epochs=0,
                        multi_gpu=False,
                        **kwargs):
        self.config = self.default_config.copy()
        self.config.update(**kwargs)

        print("Training new model w/ {}-layer, {}-cell {}LSTMs".format(
            self.config['rnn_layers'], self.config['rnn_size'],
            'Bidirectional ' if self.config['rnn_bidirectional'] else ''))

        # If training word level, must add spaces around each punctuation.
        # https://stackoverflow.com/a/3645946/9314418

        if self.config['word_level']:
            punct = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\\n\\t\'‘’“”’–—'
            for i in range(len(texts)):
                texts[i] = re.sub('([{}])'.format(punct), r' \1 ', texts[i])
                texts[i] = re.sub(' {2,}', ' ', texts[i])

        # Create text vocabulary for new texts
        # if word-level, lowercase; if char-level, uppercase
        self.tokenizer = Tokenizer(filters='',
                                   lower=self.config['word_level'],
                                   char_level=(not self.config['word_level']))
        self.tokenizer.fit_on_texts(texts)

        # Limit vocab to max_words
        max_words = self.config['max_words']
        self.tokenizer.word_index = {
            k: v
            for (k, v) in self.tokenizer.word_index.items() if v <= max_words
        }

        if not self.config.get('single_text', False):
            self.tokenizer.word_index[self.META_TOKEN] = len(
                self.tokenizer.word_index) + 1
        self.vocab = self.tokenizer.word_index
        self.num_classes = len(self.vocab) + 1
        self.indices_char = dict((self.vocab[c], c) for c in self.vocab)

        # Create a new, blank model w/ given params
        self.model = textgenrnn_model(self.num_classes,
                                      dropout=dropout,
                                      cfg=self.config)

        # Save the files needed to recreate the model
        with open('{}_vocab.json'.format(self.config['name']),
                  'w',
                  encoding='utf8') as outfile:
            json.dump(self.tokenizer.word_index, outfile, ensure_ascii=False)

        with open('{}_config.json'.format(self.config['name']),
                  'w',
                  encoding='utf8') as outfile:
            json.dump(self.config, outfile, ensure_ascii=False)

        self.train_on_texts(texts,
                            new_model=True,
                            via_new_model=True,
                            context_labels=context_labels,
                            num_epochs=num_epochs,
                            gen_epochs=gen_epochs,
                            batch_size=batch_size,
                            dropout=dropout,
                            validation=validation,
                            save_epochs=save_epochs,
                            multi_gpu=multi_gpu,
                            **kwargs)

    def save(self, weights_path="textgenrnn_weights_saved.hdf5"):
        self.model.save_weights(weights_path)

    def load(self, weights_path):
        self.model = textgenrnn_model(self.num_classes,
                                      cfg=self.config,
                                      weights_path=weights_path)

    def reset(self):
        self.config = self.default_config.copy()
        self.__init__(name=self.config['name'])

    def train_from_file(self,
                        file_path,
                        header=True,
                        delim="\n",
                        new_model=False,
                        context=None,
                        is_csv=False,
                        **kwargs):

        context_labels = None
        if context:
            texts, context_labels = textgenrnn_texts_from_file_context(
                file_path)
        else:
            texts = textgenrnn_texts_from_file(file_path, header, delim,
                                               is_csv)

        print("{:,} texts collected.".format(len(texts)))
        if new_model:
            self.train_new_model(texts,
                                 context_labels=context_labels,
                                 **kwargs)
        else:
            self.train_on_texts(texts, context_labels=context_labels, **kwargs)

    def train_from_largetext_file(self, file_path, new_model=True, **kwargs):
        with open(file_path, 'r', encoding='utf8', errors='ignore') as f:
            texts = [f.read()]

        if new_model:
            self.train_new_model(texts, single_text=True, **kwargs)
        else:
            self.train_on_texts(texts, single_text=True, **kwargs)

    def generate_to_file(self, destination_path, **kwargs):
        texts = self.generate(return_as_list=True, **kwargs)
        with open(destination_path, 'w') as f:
            for text in texts:
                f.write("{}\n".format(text))

    def encode_text_vectors(self,
                            texts,
                            pca_dims=50,
                            tsne_dims=None,
                            tsne_seed=None,
                            return_pca=False,
                            return_tsne=False):

        # if a single text, force it into a list:
        if isinstance(texts, str):
            texts = [texts]

        vector_output = Model(inputs=self.model.input,
                              outputs=self.model.get_layer('attention').output)
        encoded_vectors = []
        maxlen = self.config['max_length']
        for text in texts:
            if self.config['word_level']:
                text = text_to_word_sequence(text, filters='')
            text_aug = [self.META_TOKEN] + list(text[0:maxlen])
            encoded_text = textgenrnn_encode_sequence(text_aug, self.vocab,
                                                      maxlen)
            encoded_vector = vector_output.predict(encoded_text)
            encoded_vectors.append(encoded_vector)

        encoded_vectors = np.squeeze(np.array(encoded_vectors), axis=1)
        if pca_dims is not None:
            assert len(texts) > 1, "Must use more than 1 text for PCA"
            pca = PCA(pca_dims)
            encoded_vectors = pca.fit_transform(encoded_vectors)

        if tsne_dims is not None:
            tsne = TSNE(tsne_dims, random_state=tsne_seed)
            encoded_vectors = tsne.fit_transform(encoded_vectors)

        return_objects = encoded_vectors
        if return_pca or return_tsne:
            return_objects = [return_objects]
        if return_pca:
            return_objects.append(pca)
        if return_tsne:
            return_objects.append(tsne)

        return return_objects

    def similarity(self, text, texts, use_pca=True):
        text_encoded = self.encode_text_vectors(text, pca_dims=None)
        if use_pca:
            texts_encoded, pca = self.encode_text_vectors(texts,
                                                          return_pca=True)
            text_encoded = pca.transform(text_encoded)
        else:
            texts_encoded = self.encode_text_vectors(texts, pca_dims=None)

        cos_similairity = cosine_similarity(text_encoded, texts_encoded)[0]
        text_sim_pairs = list(zip(texts, cos_similairity))
        text_sim_pairs = sorted(text_sim_pairs, key=lambda x: -x[1])
        return text_sim_pairs
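# Typical use of the textgenrnn class above, assuming the default weights and
# vocab bundled with the package; 'my_texts.txt' is a placeholder path.
textgen = textgenrnn()
textgen.generate(3, temperature=0.5)
textgen.train_from_file('my_texts.txt', num_epochs=2)  # placeholder corpus file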
training_sentences = []
training_labels = []
test_sentences = []
test_labels = []
for s, l in train_data:
    training_sentences.append(str(s.numpy()))
    training_labels.append(l.numpy())
for s, l in test_data:
    test_sentences.append(str(s.numpy()))
    test_labels.append(l.numpy())
training_labels_final = np.array(training_labels)
test_labels_final = np.array(test_labels)

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(test_sentences)
testing_padded = pad_sequences(testing_sequences,
                               maxlen=max_length,
                               truncating=trunc_type)

reverse_word_index = dict([(value, key)
                           for (key, value) in word_index.items()])


def decode_review(text):
Example #34
    def __init__(self, config: Configuration) -> None:
        ## initialize the position-embedding matrix: MAX_SEQUENCE_LENGTH x 20
        self.config = config
        self.word_segmentor = config.word_segmentor
        self.MAX_SEQUENCE_LENGTH = config.MAX_SEQUENCE_LENGTH
        self.EMBEDDING_DIM = config.EMBEDDING_DIM
        self.MAX_NB_WORDS = config.MAX_NB_WORDS
        if not os.path.isfile(config.position_matrix_file_path):
            position_matrix = np.random.randn(config.MAX_SEQUENCE_LENGTH, 20)
            np.save(config.position_matrix_file_path[0:-4], position_matrix)
        self.position_matrix = np.load(config.position_matrix_file_path)
        print("位置向量矩阵的大小", self.position_matrix.shape)

        ## initialize the tokenizer that turns text into sequences
        default_model = {}
        default_model = tw_w2v.get_word2vec_dic(config.word2vec_file_path)
        self.tokenizer = Tokenizer(num_words=config.MAX_NB_WORDS)  #
        words = None
        if isinstance(default_model, dict):
            words = default_model.keys()
        else:
            words = default_model.vocab.keys()

        self.tokenizer.fit_on_texts(words)
        word_index = self.tokenizer.word_index
        self.num_words = min(config.MAX_NB_WORDS, len(word_index) + 1)
        ## initialize word embeddings; embedding matrix: 50000 x 64
        model_dim = 0
        for key in words:
            model_dim = default_model[key].shape[0]
            break
        if self.EMBEDDING_DIM != model_dim:
            print("WARN ! 设置的词向量与读取维数不同,默认采用读取的词向量维数。", self.EMBEDDING_DIM,
                  model_dim)
            self.EMBEDDING_DIM = model_dim
            config.EMBEDDING_DIM = model_dim
        self.embedding_matrix = np.zeros(
            (self.num_words, config.EMBEDDING_DIM))
        for word, i in word_index.items():
            if i >= config.MAX_NB_WORDS:
                continue
            embedding_vector = default_model[word]
            if embedding_vector is not None:
                # if a word from the text data is missing from the word-vector dict, its row stays all zeros; otherwise use that word's vector
                try:
                    self.embedding_matrix[i] = embedding_vector
                except Exception as e:
                    print(e)
            else:
                print("warn! ", word, "不在词向量列表")
        print("词向量矩阵的大小", self.embedding_matrix.shape)

        ## initialize the POS-tag list
        self.POS_list = []
        if os.path.exists(config.POS_list_file_path):
            with open(config.POS_list_file_path, encoding="UTF-8") as f:
                for line in f.readlines():
                    if len(line.strip()) > 0:
                        self.POS_list.append(line.strip())
        else:
            file_types = []
            file_sentences = []
            with open(config.corpus_file_path, 'r', encoding="UTF-8") as f:
                for line in f.readlines():
                    file_types.append(line.split("|")[0].strip())
                    file_sentences.append(line.split("|")[1].strip())

            all_pos_set = set(self.POS_list)
            wordPairList_allSen, entityPosition_allSen = self.word_segmentor.segListWithNerTag(
                file_sentences)
            for pairs in wordPairList_allSen:
                for pair in pairs:
                    if not all_pos_set.__contains__(pair.flag):
                        self.POS_list.append(pair.flag)
                        all_pos_set.add(pair.flag)
            with open(config.POS_list_file_path, "w", encoding="UTF-8") as f:
                for pos in self.POS_list:
                    f.write(pos)
                    f.write("\n")
        print("POS类型", len(self.POS_list))

        ## relation types; RelationWordAdmin holds relations and relation_word_dic
        self.relationWordAdmin = RelationWordAdmin(config.types_file_path)
        self.types = self.relationWordAdmin.relations
        print("classification types", len(self.types))
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from keras.layers.embeddings import Embedding
from keras.layers import Flatten


twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
y = twenty_train.target
sentences = twenty_train.data

max_review_len = max([len(s.split()) for s in sentences])


tokenizer = Tokenizer(num_words=max_review_len)
tokenizer.fit_on_texts(sentences)

sentences = tokenizer.texts_to_matrix(sentences)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

# Build model
model = Sequential()
model.add(layers.Dense(300, input_dim=max_review_len, activation='relu'))
model.add(layers.Dense(20,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=5, verbose=True, validation_data=(X_test, y_test), batch_size=256)
from sklearn.model_selection import GridSearchCV

from keras.preprocessing.text import Tokenizer

vocab_size = 3000
batch_size = 32
epochs = 5

(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=vocab_size,
                                                         test_split=0.2)
print(f"x_train shape: {x_train.shape}\nx_test shape: {x_test.shape}")

classes = np.max(y_train) + 1

print("Vectorizing data . . . ")
tokenizer = Tokenizer(num_words=vocab_size)
x_train = tokenizer.sequences_to_matrix(x_train, mode="binary")
x_test = tokenizer.sequences_to_matrix(x_test, mode="binary")

y_train = keras.utils.to_categorical(y_train, num_classes=classes)
y_test = keras.utils.to_categorical(y_test, num_classes=classes)
print(f"y_train shape: {y_train.shape}\n y_test shape: {y_test.shape}")


def make_model(activator="relu",
               alpha=0.3,
               optimizer="sgd",
               dense_layer_size=32,
               num_layers=3,
               dropout_rate=0.1,
               loss="categorical_crossentropy",
model.fit(X_train_nhot, y_train,validation_split=0.1 , epochs=15, verbose=1, batch_size=128)
loss, accuracy = model.evaluate(X_test_nhot,y_test)
EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=2,
                              verbose=0, mode='auto')
print("Accuracy: ", accuracy *100)




feed_forward = model.predict_classes(X_test_nhot, verbose=1)



t = Tokenizer()
t.fit_on_texts(X_train)
vocab_size = len(t.word_index) + 1
encoded_docs = t.texts_to_sequences(X_train)
encoded_doc = t.texts_to_sequences(X_test)
num_classes = len(np.unique(y_train)) # how many labels we have
y_train_one_hot = y_train
y_test_one_hot = y_test
y_dev_one_hot = y_dev


# NEURAL NETWORK WITH DEEP LEARNING

w2i = defaultdict(lambda: len(w2i))
PAD = w2i["<pad>"] # index 0 is padding
UNK = w2i["<unk>"] # index 1 is for UNK
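# For reference, the defaultdict above mints a fresh index the first time any
# word is looked up, so the vocabulary grows on demand:
print(w2i["<pad>"], w2i["<unk>"], w2i["hello"], w2i["hello"])  # 0 1 2 2
print(len(w2i))  # 3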
Example #38
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

np.random.seed(1337)  # for reproducibility

print("Reading pkl PORTUGUESE")
start = time.time()
train_data = pd.read_pickle('./data/train_subset_portuguese.pkl')
test_data = pd.read_pickle('./data/test_subset_portuguese.pkl')
stop = time.time()
print(stop - start)

t = Tokenizer()
print("Loading the tokenizer")
with open('./tokenizer/tokenizer_portuguese.pickle', 'rb') as handle:
    t = pickle.load(handle)

print("Summary portuguese Train and test size: ")
print(train_data.shape)
print(test_data.shape)

# Creating train data set
encoded_seqs = t.texts_to_sequences(train_data['text_cleaned'])
encoded_seqs_dummy = t.texts_to_sequences(test_data['text_cleaned'])

s1 = np.max([len(item) for item in encoded_seqs])
s2 = np.max([len(item) for item in encoded_seqs_dummy])
s3 = np.max([s1, s2])
Example #39
class CharModel(BaseModel):
    def __init__(self,
                 vocab_size=200,
                 max_charlen=250,
                 tokenize_args={},
                 embedding_dim=64,
                 filters=128,
                 kernel_size=7,
                 pooling_size=3,
                 recursive_class=LSTM,
                 recursive_units=128,
                 dense_units=64,
                 dropout=[0.75, 0.50],
                 **kwargs):

        self._max_charlen = max_charlen
        self._vocab_size = vocab_size
        self._char_tokenizer = KerasTokenizer(num_words=vocab_size,
                                              char_level=True)

        # Build the graph
        input_char = Input(shape=(max_charlen, ), name="Char_Input")
        x = Embedding(vocab_size, embedding_dim)(input_char)
        x = Conv1D(filters=filters,
                   kernel_size=kernel_size,
                   padding='same',
                   activation='relu')(x)

        x = MaxPooling1D(pool_size=pooling_size)(x)
        x = Bidirectional(recursive_class(recursive_units))(x)
        if dropout[0] > 0:
            x = Dropout(dropout[0])(x)
        x = Dense(dense_units, activation='relu')(x)
        if dropout[1] > 0:
            x = Dropout(dropout[1])(x)
        output = Dense(1, activation='sigmoid')(x)

        tok_args = {
            "preserve_case": False,
            "deaccent": True,
            "reduce_len": True,
            "strip_handles": False,
            "stem": True,
            "alpha_only": False
        }

        tok_args.update(tokenize_args)

        super().__init__(inputs=[input_char],
                         outputs=[output],
                         tokenize_args=tok_args,
                         **kwargs)

    def _preprocess_text(self, X):
        tokens = map(self._tokenizer.tokenize, X)
        instances = [" ".join(seq_tokens) for seq_tokens in tokens]

        return instances

    def preprocess_fit(self, X):
        text_train = self._preprocess_text(X)

        self._char_tokenizer.fit_on_texts(text_train)

    def preprocess_transform(self, X):
        X_transf = self._preprocess_text(X)
        X_transf = self._char_tokenizer.texts_to_sequences(X_transf)

        return pad_sequences(X_transf, self._max_charlen)
# The RNN is an expressive model that can learn highly complex relationships from an arbitrarily long sequence of data. It maintains a vector of activation units for each element in the data sequence, which makes the unrolled network very deep. This depth leads to two well-known issues: the exploding and the vanishing gradient problems.
# 
# There are many ways to implement a neural network in Python. Here, I will be using TensorFlow/Keras.
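# 
# A minimal sketch, assuming a generic binary sequence-classification setup, of
# the usual Keras mitigations for these gradient problems: gated cells (LSTM)
# against vanishing gradients and gradient-norm clipping against exploding
# gradients. Vocabulary size, sequence length and layer sizes are illustrative.

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.optimizers import Adam


def build_lstm_sketch(vocab_size=10000, maxlen=100, embedding_dim=64):
    model = Sequential()
    # Map integer word ids to dense vectors.
    model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
    # LSTM gating helps gradients survive long sequences (vanishing gradients).
    model.add(LSTM(64))
    model.add(Dense(1, activation='sigmoid'))
    # clipnorm bounds the gradient norm (exploding gradients).
    model.compile(optimizer=Adam(clipnorm=1.0),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model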

# In[110]:


# Importing the libraries
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


# In[111]:


tk = Tokenizer(lower = True)
tk.fit_on_texts(X)
X_seq = tk.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=100, padding='post') 


# In[112]:


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size = 0.25, random_state = 1)


# In[113]:

test_texts_1 = []
test_texts_2 = []
test_ids = []


def get_test_text(row):
    global test_texts_1, test_texts_2, test_ids
    test_texts_1.append(row.question1)
    test_texts_2.append(row.question2)
    test_ids.append(row.test_id)


test.apply(get_test_text, axis=1)
print('Found %s texts in test.csv' % len(test_texts_1))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)
Exemple #42
0
    BATCH_SIZE = 64
    NB_EPOCHS = 25

    # Read the training and test data
    x_train, y_train = read_txt(train_path, mode="train")
    x_test = read_txt(test_path, mode="predict")
    print(
        u"length of train data is {0}, length of train label is {1}, length of test data is {2}"
        .format(len(x_train), len(y_train), len(x_test)))

    # Build the corpus
    corpus = x_train.tolist() + x_test.tolist()
    print("length of corpus is {0}".format(len(corpus)))

    # In order: create the tokenizer, fit it, convert texts to sequences, then pad the sequences
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(corpus)
    print(u"fit_on_texts finished")
    sequences = tokenizer.texts_to_sequences(x_train)
    test_sequences = tokenizer.texts_to_sequences(x_test)
    print("tokenizer finished")
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', y_train.shape)
    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of test_data tensor:', test_data.shape)

    word_index = tokenizer.word_index
    print('Found %s unique tokens' % len(word_index))

    # Split the data ids into train and val sets
Exemple #43
0
    return string.strip().lower()


data = pandas.read_csv('./data.csv')  # comma is the default delimiter

texts = [clean_text(text.encode('ascii', 'ignore')) for text in data.review]
labels = to_categorical(np.asarray(list(data.sentiment)))

embeddings_dict = {}
with open('data_50.txt', 'r', encoding='utf-8') as f:
    for data_word in f:
        word = data_word.split()[0]
        arra = np.asarray(data_word.split()[1:], dtype='float32')
        embeddings_dict[word] = arra

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=1000)

data = data[np.arange(MAX_NUM)]
labels = labels[np.arange(MAX_NUM)]

x_train = data[:24999]
y_train = labels[:24999]
x_test = data[25000:]
y_test = labels[25000:]

embedding_matrix = np.random.random((len(word_index) + 1, 50))
for word, i in word_index.items():
    # Use the pretrained 50-d vector when available; words without a
    # pretrained vector keep their random initialisation.
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    print("Processing", QUESTION_PAIRS_FILE)

    question1 = []
    question2 = []
    is_duplicate = []
    with open(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            question1.append(row['question1'])
            question2.append(row['question2'])
            is_duplicate.append(row['is_duplicate'])

    print('Question pairs: %d' % len(question1))

    questions = question1 + question2
    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(questions)
    question1_word_sequences = tokenizer.texts_to_sequences(question1)
    question2_word_sequences = tokenizer.texts_to_sequences(question2)
    word_index = tokenizer.word_index

    print("Words in index: %d" % len(word_index))

    if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE):
        zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
        zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

    print("Processing", GLOVE_FILE)

    embeddings_index = {}
    with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
def tokenize(x):
    x_tkzr = Tokenizer(char_level=False)
    x_tkzr.fit_on_texts(x)
    return x_tkzr.texts_to_sequences(x), x_tkzr
Exemple #46
0
    for t in os.listdir(path):
        tweet_count += 1
        p2 = os.path.join(path, t)
        f = open(p2, "r")
        texts_.append(f.read())
        labels_.append(x)
        f.close()
    print(x, '. folder tweet count:', tweet_count)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from keras.utils import to_categorical

token = Tokenizer()
token.fit_on_texts(texts_)
texts_ = token.texts_to_sequences(texts_)
texts_ = pad_sequences(texts_)

texts_ = StandardScaler().fit_transform(texts_)

labels_ = preprocessing.LabelEncoder().fit_transform(labels_)
labels_ = to_categorical(labels_)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test =\
     train_test_split(texts_, labels_, test_size = 0.5)

max_futures = 1500
maxlen = 28
Exemple #47
0
def get_padded_sequences(titles: pd.Series, tokenizer: Tokenizer) -> np.ndarray:
    sequences = tokenizer.texts_to_sequences(titles)
    padded_sequences = pad_sequences(sequences,
                                     maxlen=config.MAX_SEQUENCE_LENGTH)
    return padded_sequences
Exemple #48
0
q1 = train['0'].tolist()
q2 = train['1'].tolist()

test['0'] = test['0'].progress_apply(
    lambda x: re.sub(r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', "", str(x)))
test['1'] = test['1'].progress_apply(
    lambda x: re.sub(r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', "", str(x)))

q1_test = test['0'].tolist()
q2_test = test['1'].tolist()

labels = train['is_duplicate'].tolist()
ids = test['test_id'].tolist()

tokenizer = Tokenizer(num_words=u.max_nb_words)
tokenizer.fit_on_texts(q1 + q2 + q1_test + q2_test)

sequences_1 = tokenizer.texts_to_sequences(q1)
sequences_2 = tokenizer.texts_to_sequences(q2)

test_sequences_1 = tokenizer.texts_to_sequences(q1_test)
test_sequences_2 = tokenizer.texts_to_sequences(q2_test)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))
data_1 = pad_sequences(sequences_1, maxlen=u.seq_length)
data_2 = pad_sequences(sequences_2, maxlen=u.seq_length)
labels = np.array(labels)

print("Elapsed time till loading word vectors", time() - start)
Exemple #49
0
def main():
    # columns = ['class', 'title', 'u1', 'authors', 'source', 'publisher', 'citations', 'abstract', 'keywords']
    data = pd.read_csv(TRAIN_FILE)
    data = data.dropna(subset=['class', 'title', 'abstract'])
    print('Rows of data: {}'.format(len(data)))
    label_count = data['class'].nunique()
    print('Unique labels: {}'.format(label_count))
    labels = np.array(data['class'])
    titles = list(data['title'])
    abstracts = list(data['abstract'])
    if DO_LOWER_CASE:
        sentences = [
            titles[i].lower() + ' ' + abstracts[i].lower()
            for i in range(len(titles))
        ]
    else:
        sentences = [
            titles[i] + ' ' + abstracts[i] for i in range(len(titles))
        ]
    if DO_REMOVE_PUNCTUATION:
        sentences = [
            re.sub(r'[~`!@#$%^&*()_\-+={}\[\]|\\:;"\'<>,.?/]+', ' ', sentence)
            for sentence in sentences
        ]
    if DO_REMOVE_STOP_WORDS:
        sentences = remove_stop_words(sentences)
    lines = np.array(sentences)
    # Shuffle the labels and lines.
    permutation = np.random.permutation(labels.shape[0])
    labels = labels[permutation]
    lines = lines[permutation]
    # Split the data for training and testing.
    train_ratio = 0.8
    val_ratio = 0.25  # Proportion of the TRAINING data, not of the entire data set.
    train_end = int(train_ratio * len(lines))
    train_labels = labels[:train_end]
    train_lines = lines[:train_end]
    test_labels = labels[train_end:]
    test_lines = lines[train_end:]
    # Print class spreads in each data set.
    val_start = int((1 - val_ratio) * train_end)
    train_spread = get_spread(train_labels[:val_start])
    test_spread = get_spread(test_labels)
    val_spread = get_spread(train_labels[val_start:])
    longest_label = max(len(label) for label in train_spread.keys())
    for label in train_spread.keys():
        print('{}{}: TR:{} VAL:{} TS:{}'.format(
            label, ''.join(' ' for _ in range(longest_label - len(label))),
            train_spread[label] / val_start,
            val_spread[label] / (train_end - val_start),
            test_spread[label] / len(test_labels)))
    # Pre-process; tokenize the text and transform it into [padded] sequences for the RNN.
    # See: https://www.kaggle.com/sbongo/for-beginners-tackling-toxic-using-keras
    # See: https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout
    max_features = 8000  # The maximum number of total unique words to use.
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(train_lines)
    # tokenizer.fit_on_texts(test_lines)  # Left commented out: fitting on test lines would leak test vocabulary into the features.
    train_word_counts = tokenizer.texts_to_matrix(train_lines, mode='count')
    test_word_counts = tokenizer.texts_to_matrix(test_lines, mode='count')
    max_word_counts = np.array([
        max(train_word_counts[:, col])
        for col in range(len(train_word_counts[0]))
    ])
    min_word_counts = np.array([
        min(train_word_counts[:, col])
        for col in range(len(train_word_counts[0]))
    ])
    x_train = np.nan_to_num((train_word_counts - min_word_counts) /
                            (max_word_counts - min_word_counts))
    x_test = np.nan_to_num((test_word_counts - min_word_counts) /
                           (max_word_counts - min_word_counts))
    # Transform the labels into a one-hot encoding.
    encoder = LabelBinarizer()
    y_train = encoder.fit_transform(train_labels)
    y_test = encoder.transform(test_labels)  # reuse the encoding fitted on the training labels
    # Build the model.
    model = get_model(max_features, label_count)
    print(model.summary())
    optimizer = 'adam'
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['categorical_accuracy'])
    history = model.fit(x_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=val_ratio)
    # Save the model and the history.
    save_str = 'dense1-{}{}{}{}{}-{}-{}'.format(
        max_features, '-lower' if DO_LOWER_CASE else '',
        '-nopunc' if DO_REMOVE_PUNCTUATION else '',
        '-nostop' if DO_REMOVE_STOP_WORDS else '',
        '-stem' if DO_STEMMING else '', optimizer, epochs)
    print('Saving model `{}.h5`...'.format(save_str))
    model.save(os.path.join(MODELS_FOLDER, '{}.h5'.format(save_str)))
    print('Saving training history `history-{}.txt`...'.format(save_str))
    with open(os.path.join(LOGS_FOLDER, 'history-{}.txt'.format(save_str)),
              'w') as fd:
        for key in history.history.keys():
            values = history.history.get(key)
            fd.write(key + ' ' + ' '.join(str(value)
                                          for value in values) + '\n')
    # Test.
    # predictions = model.predict([test_sequences], batch_size=1024, verbose=1)
    test_loss, test_accuracy = model.evaluate(x_test,
                                              y_test,
                                              batch_size=1024,
                                              verbose=1)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_accuracy)
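    # A hedged follow-up, not in the original script: recover class names from
    # the model's probability outputs with the fitted LabelBinarizer.
    probabilities = model.predict(x_test, batch_size=1024, verbose=1)
    predicted_labels = encoder.inverse_transform(probabilities)
    print('First predictions:', predicted_labels[:5])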
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
def keras_classify(df):
    # Preprocessing: map the words in the text to integer ids
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing import sequence
    from keras.callbacks import EarlyStopping
    from sklearn.cross_validation import train_test_split

    print "----- Classification by Keras -----"
    max_features = 50000  # keep only the most important words
    # Tokenizer can only handle str, not unicode
    textraw = map(lambda x: x.encode('utf-8'), df.seg_word.values.tolist())
    token = Tokenizer(nb_words=max_features)
    # df.seg_word is already space-separated, so the Tokenizer can split the str the same way it splits English text
    token.fit_on_texts(textraw)
    # token records each word's id and count; the word ids replace the word text from textraw below,
    # e.g. textraw = ['a b c', 'c d e f']  ==> text_seq = [[1, 2, 3], [3, 4, 5, 6]]
    text_seq = token.texts_to_sequences(textraw)
    nb_classes = len(np.unique(df.label.values))
    print "num of features(vocabulary): ", len(token.word_counts)
    print "num of labels: ", nb_classes
    max_sent_len = np.max([len(s) for s in text_seq])
    print "max length of document is: ", max_sent_len
    median_sent_len = np.median([len(s) for s in text_seq])
    print "median length of document is: ", median_sent_len

    # The .values here must not be omitted, otherwise np_utils.to_categorical below will fail
    train_X, test_X, train_y, test_y = train_test_split(text_seq,
                                                        df.label.values,
                                                        train_size=0.7,
                                                        random_state=1)
    # train_X & test_X are still ragged (each row is one document); they must be padded into an equal-length matrix before training
    seqlen = int(max_sent_len / 2 + median_sent_len / 2)
    X_train = sequence.pad_sequences(train_X,
                                     maxlen=seqlen,
                                     padding='post',
                                     truncating='post')
    X_test = sequence.pad_sequences(test_X,
                                    maxlen=seqlen,
                                    padding='post',
                                    truncating='post')
    # Expand y into one-hot form so the network can end with a softmax layer
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    model = build_cnn_model(max_features, seqlen, nb_classes)
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    # Train for up to 10 epochs with mini-batches of 32; the earlystop callback checks after each epoch whether val_loss has stopped improving
    model.fit(X_train,
              Y_train,
              batch_size=32,
              nb_epoch=10,
              validation_split=0.1,
              callbacks=[earlystop])
    evaluate(earlystop.model, X_test, Y_test, test_y)

    model = build_lstm_model(max_features, seqlen, nb_classes)
    model.fit(X_train,
              Y_train,
              batch_size=32,
              nb_epoch=1,
              validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    model = build_mixed_model(max_features, seqlen, nb_classes)
    model.fit(X_train,
              Y_train,
              batch_size=32,
              nb_epoch=1,
              validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    graph = build_graph_model(max_features, seqlen, nb_classes)
    graph.fit({
        'input': X_train,
        'output': Y_train
    },
              nb_epoch=3,
              batch_size=32,
              validation_split=0.1)
    predict = graph.predict({'input': X_test}, batch_size=32)
    predict = predict['output']
    classes = predict.argmax(axis=1)
    acc = np_utils.accuracy(classes, test_y)
    print('Test accuracy: ', acc)
class CaptionPreprocessor(object):
    """Preprocesses captions before feeded into the network."""

    EOS_TOKEN = 'zeosz'

    def __init__(self, rare_words_handling=None, words_min_occur=None):
        """
        If an arg is None, it will get its value from config.active_config.
        Args:
          rare_words_handling: {'nothing'|'discard'|'change'}
          words_min_occur: words whose occurrences are less than this are
                           considered rare words
        """
        self._tokenizer = Tokenizer()
        self._rare_words_handling = (rare_words_handling or
                                     active_config().rare_words_handling)
        self._words_min_occur = (words_min_occur or
                                 active_config().words_min_occur)
        self._word_of = {}

    @property
    def EOS_TOKEN_LABEL_ENCODED(self):
        return self._tokenizer.word_index[self.EOS_TOKEN]

    @property
    def vocabs(self):
        word_index = self._tokenizer.word_index
        return sorted(word_index, key=word_index.get)  # Sort by word's index

    @property
    def vocab_size(self):
        return len(self._tokenizer.word_index)

    def fit_on_captions(self, captions_txt):
        captions_txt = self._handle_rare_words(captions_txt)
        captions_txt = self._add_eos(captions_txt)
        self._tokenizer.fit_on_texts(captions_txt)
        self._word_of = {i: w for w, i in self._tokenizer.word_index.items()}

    def encode_captions(self, captions_txt):
        captions_txt = self._add_eos(captions_txt)
        return self._tokenizer.texts_to_sequences(captions_txt)

    def decode_captions(self, captions_output, captions_output_expected=None):
        """
        Args
          captions_output: 3-d array returned by a model's prediction; it's the
            same as captions_output returned by preprocess_batch
        """
        captions = captions_output[:, :-1, :]  # Discard the last word (dummy)
        label_encoded = captions.argmax(axis=-1)
        num_batches, num_words = label_encoded.shape

        if captions_output_expected is not None:
            caption_lengths = self._caption_lengths(captions_output_expected)
        else:
            caption_lengths = [num_words] * num_batches

        captions_str = []
        for caption_i in range(num_batches):
            caption_str = []
            for word_i in range(caption_lengths[caption_i]):
                label = label_encoded[caption_i, word_i]
                label += 1  # Real label = label in model + 1
                caption_str.append(self._word_of[label])
            captions_str.append(' '.join(caption_str))

        return captions_str

    # TODO Test method below
    def decode_captions_from_list2d(self, captions_encoded):
        """
        Args
          captions_encoded: 1-based (Tokenizer's), NOT 0-based (model's)
        """
        captions_decoded = []
        for caption_encoded in captions_encoded:
            words_decoded = []
            for word_encoded in caption_encoded:
                # No need of incrementing word_encoded
                words_decoded.append(self._word_of[word_encoded])
            captions_decoded.append(' '.join(words_decoded))
        return captions_decoded

    def normalize_captions(self, captions_txt):
        captions_txt = self._add_eos(captions_txt)
        word_sequences = map(text_to_word_sequence, captions_txt)
        result = map(' '.join, word_sequences)
        return result

    def preprocess_batch(self, captions_label_encoded):
        captions = keras_seq.pad_sequences(captions_label_encoded,
                                           padding='post')
        # Pad by one extra timestep because the number of timesteps/words
        # produced by the model is maxlen(captions) + 1 (the first "word" is
        # the image).
        captions_extended1 = keras_seq.pad_sequences(captions,
                                                maxlen=captions.shape[-1] + 1,
                                                padding='post')
        captions_one_hot = list(map(self._tokenizer.sequences_to_matrix,
                                    np.expand_dims(captions_extended1, -1)))
        captions_one_hot = np.array(captions_one_hot, dtype='int')

        # Decrease/shift word index by 1.
        # Shifting `captions_one_hot` makes the padding word
        # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
        # so its cross entropy loss will be zero.
        captions_decreased = captions.copy()
        captions_decreased[captions_decreased > 0] -= 1
        captions_one_hot_shifted = captions_one_hot[:, :, 1:]

        captions_input = captions_decreased
        captions_output = captions_one_hot_shifted
        return captions_input, captions_output

    def _handle_rare_words(self, captions):
        if self._rare_words_handling == 'nothing':
            return captions
        elif self._rare_words_handling == 'discard':
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(captions)
            new_captions = []
            for caption in captions:
                words = text_to_word_sequence(caption)
                new_words = [w for w in words
                             if tokenizer.word_counts.get(w, 0) >=
                             self._words_min_occur]
                new_captions.append(' '.join(new_words))
            return new_captions

        raise NotImplementedError('rare_words_handling={} is not implemented '
                                  'yet!'.format(self._rare_words_handling))

    def _add_eos(self, captions):
        return map(lambda x: x + ' ' + self.EOS_TOKEN, captions)

    def _caption_lengths(self, captions_output):
        one_hot_sum = captions_output.sum(axis=2)
        return (one_hot_sum != 0).sum(axis=1)
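
# A minimal usage sketch of CaptionPreprocessor. The captions are made up, and
# explicit constructor arguments are passed so config.active_config() is not
# needed; treat this as an illustration rather than the original training code.
preprocessor = CaptionPreprocessor(rare_words_handling='nothing', words_min_occur=1)
captions_txt = ['a dog runs on the beach', 'a cat sits on a mat']
preprocessor.fit_on_captions(captions_txt)
encoded = preprocessor.encode_captions(captions_txt)  # 1-based label sequences
captions_input, captions_output = preprocessor.preprocess_batch(encoded)
print(preprocessor.vocab_size, captions_input.shape, captions_output.shape)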
Exemple #53
0
import pickle
f = open('/content/drive/My Drive/Sarcasm/Final_dataset3.p', 'rb')
d_final3 = pickle.load(f)

import pickle
f = open('/content/drive/My Drive/Sarcasm/embedding_matrix.p', 'rb')
embedding_matrix = pickle.load(f)
f = open('/content/drive/My Drive/Sarcasm/Audio_features.p', 'rb')
emb = pickle.load(f)

MAX_NB_WORDS = 40000
MAX_SEQUENCE_LENGTH = 128

text = d_final3['text']
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token='<OOV>')  # use an explicit string token for out-of-vocabulary words
tokenizer.fit_on_texts(text)
data = np.zeros((len(text), MAX_SEQUENCE_LENGTH), dtype='int32')

labels1 = []
label_index1 = {}
for label in d_final3['Sarcasm']:
    labelid = len(label_index1)
    label_index1[label] = labelid
    labels1.append(label)

print(len(labels1))
Exemple #54
0
    def train_new_model(self,
                        texts,
                        context_labels=None,
                        num_epochs=50,
                        gen_epochs=1,
                        batch_size=128,
                        dropout=0.0,
                        validation=True,
                        save_epochs=0,
                        multi_gpu=False,
                        **kwargs):
        self.config = self.default_config.copy()
        self.config.update(**kwargs)

        print("Training new model w/ {}-layer, {}-cell {}LSTMs".format(
            self.config['rnn_layers'], self.config['rnn_size'],
            'Bidirectional ' if self.config['rnn_bidirectional'] else ''))

        # If training word level, must add spaces around each punctuation.
        # https://stackoverflow.com/a/3645946/9314418

        if self.config['word_level']:
            punct = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\\n\\t\'‘’“”’–—'
            for i in range(len(texts)):
                texts[i] = re.sub('([{}])'.format(punct), r' \1 ', texts[i])
                texts[i] = re.sub(' {2,}', ' ', texts[i])

        # Create text vocabulary for new texts
        # if word-level, lowercase; if char-level, uppercase
        self.tokenizer = Tokenizer(filters='',
                                   lower=self.config['word_level'],
                                   char_level=(not self.config['word_level']))
        self.tokenizer.fit_on_texts(texts)

        # Limit vocab to max_words
        max_words = self.config['max_words']
        self.tokenizer.word_index = {
            k: v
            for (k, v) in self.tokenizer.word_index.items() if v <= max_words
        }

        if not self.config.get('single_text', False):
            self.tokenizer.word_index[self.META_TOKEN] = len(
                self.tokenizer.word_index) + 1
        self.vocab = self.tokenizer.word_index
        self.num_classes = len(self.vocab) + 1
        self.indices_char = dict((self.vocab[c], c) for c in self.vocab)

        # Create a new, blank model w/ given params
        self.model = textgenrnn_model(self.num_classes,
                                      dropout=dropout,
                                      cfg=self.config)

        # Save the files needed to recreate the model
        with open('{}_vocab.json'.format(self.config['name']),
                  'w',
                  encoding='utf8') as outfile:
            json.dump(self.tokenizer.word_index, outfile, ensure_ascii=False)

        with open('{}_config.json'.format(self.config['name']),
                  'w',
                  encoding='utf8') as outfile:
            json.dump(self.config, outfile, ensure_ascii=False)

        self.train_on_texts(texts,
                            new_model=True,
                            via_new_model=True,
                            context_labels=context_labels,
                            num_epochs=num_epochs,
                            gen_epochs=gen_epochs,
                            batch_size=batch_size,
                            dropout=dropout,
                            validation=validation,
                            save_epochs=save_epochs,
                            multi_gpu=multi_gpu,
                            **kwargs)
Exemple #55
0
MAX_SEQUENCE_LENGTH = 2000
NB_WORDS = 65
EMBEDDING_DIM = 100
embedding_matrix = np.load('embedding_matrix.npy')

# Tokenize-----------------------------
f = ['a', 'c', 'g', 't']
c = itertools.product(f, f, f, f, f, f)
res = []
for i in c:
    temp = i[0] + i[1] + i[2] + i[3] + i[4] + i[5]
    res.append(temp)
res = np.array(res)
NB_WORDS = 4097
tokenizer = Tokenizer(num_words=NB_WORDS)
tokenizer.fit_on_texts(res)
word_index = tokenizer.word_index
word_index['null'] = 0
# ------------------------------------
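
# A hedged illustration (the helper name and DNA string are hypothetical) of
# how a raw sequence would be turned into space-separated, overlapping 6-mers
# before calling tokenizer.texts_to_sequences on it:
def to_kmer_sentence(seq, k=6):
    # 'acgtacgt' -> 'acgtac cgtacg gtacgt'
    return ' '.join(seq[i:i + k] for i in range(len(seq) - k + 1))

example_ids = tokenizer.texts_to_sequences([to_kmer_sentence('acgtacgtacgt')])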

from keras.layers import Embedding

embedding_layer = Embedding(len(word_index),
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
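
# A hedged continuation: the pooling/dense head below is an assumption, not the
# original architecture; it only shows one common way to finish a classifier on
# top of embedded_sequences.
from keras.layers import GlobalMaxPooling1D, Dense
from keras.models import Model

x = GlobalMaxPooling1D()(embedded_sequences)
preds = Dense(1, activation='sigmoid')(x)
sketch_model = Model(sequence_input, preds)
sketch_model.compile(optimizer='adam', loss='binary_crossentropy',
                     metrics=['accuracy'])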
Exemple #56
0
dataframe.drop('AUTHOR', axis=1, inplace=True)
dataframe.drop('DATE', axis=1, inplace=True)

# print(dataframe.head(5))
# print(numpy.unique(dataframe['CLASS']))

dataset = dataframe.values

X = dataset[:, 0]
Y = dataset[:, 1]

# Summarize number of words
print("Number of words: ")
print(len(numpy.unique(numpy.hstack(X))))

tokenizer = Tokenizer()

tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

validation_size = 0.33
seed = 7

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

X_train = sequence.pad_sequences(X_train, maxlen=120)
X_validation = sequence.pad_sequences(X_validation, maxlen=120)

# print(X_train[0])
import pandas as pd
import numpy as np
import nltk, re, time
from nltk.corpus import stopwords
from string import punctuation
from collections import namedtuple
from sklearn.datasets import load_files
from keras.preprocessing.text import Tokenizer

input_file = "./DONE/s.csv"

# Load the csv into a dataframe
dataset = pd.read_csv(input_file, delimiter="\t")
print(dataset.shape)

# Tokenize the corpus and print the length of the word index
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset)
word_index = tokenizer.word_index
print("Words in index: %d" % len(word_index))



Exemple #58
0
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# read the file
df = pd.read_csv('train.tsv', header=None, delimiter='\t', low_memory=False)
# labels columns
df.columns = ['PhraseID', 'SentenceID', 'Phrase', 'Sentiment']
sentences = df['Phrase'].values
y = df['Sentiment'].values

tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(sentences)
sentences = tokenizer.texts_to_matrix(sentences)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(sentences,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=1000)

# Number of features
# print(input_dim)
model = Sequential()
model.add(layers.Dense(300, input_dim=2000, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
test = pd.read_csv(test_path)

list_classes = [i for i in range(58)]
y = train['Category'].values
train["title"].fillna("no comment")
test["title"].fillna("no comment")

X_train = train
Y_train = y
del train
del y

raw_text_train = X_train["title"].str.lower()
raw_text_test = test["title"].str.lower()

tk = Tokenizer(num_words = max_features, lower = True)
tk.fit_on_texts(raw_text_train)
X_train["comment_seq"] = tk.texts_to_sequences(raw_text_train)
test["comment_seq"] = tk.texts_to_sequences(raw_text_test)

X_train = pad_sequences(X_train.comment_seq, maxlen = max_len)
test = pad_sequences(test.comment_seq, maxlen = max_len)

def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))

word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    # Copy the pretrained vector when available; rows without one stay zero.
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
Exemple #60
0
from keras.preprocessing.text import Tokenizer
import pandas as pd
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras import models, layers
import pickle

tokenizer = Tokenizer()
nama_file = "D:\\resa\\D\\KULIAH\\S2\\Semester 1\\python\\mlNN_1\\datasetSMS\\dataset.csv"
df = pd.read_csv(nama_file).values
data = df[:, 0]
label = df[:, 1]

label = to_categorical(label)
print(label)
print(label.shape)
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    label,
                                                    test_size=0.2,
                                                    random_state=123)

# fit only on the training data
tokenizer.fit_on_texts(X_train)
# convert the training data
seq_x_train = tokenizer.texts_to_sequences(X_train)
X_enc_train = tokenizer.sequences_to_matrix(seq_x_train, mode="tfidf")
# convert the test data
seq_x_test = tokenizer.texts_to_sequences(X_test)
X_enc_test = tokenizer.sequences_to_matrix(seq_x_test, mode="tfidf")

# print(X_enc_train.shape)
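
# A minimal sketch (layer sizes are assumptions) of how the tf-idf matrices
# above could feed a small feed-forward classifier, reusing the models/layers
# imports from the top of this example:
clf = models.Sequential()
clf.add(layers.Dense(64, activation='relu', input_shape=(X_enc_train.shape[1],)))
clf.add(layers.Dense(y_train.shape[1], activation='softmax'))
clf.compile(optimizer='adam', loss='categorical_crossentropy',
            metrics=['accuracy'])
clf.fit(X_enc_train, y_train, epochs=5, batch_size=32,
        validation_data=(X_enc_test, y_test))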