Example #1
def tokenize(texts, texts_train, texts_test):
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    sequences_train = tokenizer.texts_to_sequences(texts_train)
    sequences_test = tokenizer.texts_to_sequences(texts_test)
    return word_index, sequences_train, sequences_test
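A minimal usage sketch for tokenize() above (not part of the original example), assuming MAX_NUM_WORDS is defined at module level and pad_sequences is imported; the texts are made up for illustration:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_NUM_WORDS = 20000  # assumed module-level constant
texts_train = ['the cat sat on the mat', 'dogs chase cats']
texts_test = ['the dog sat']
texts = texts_train + texts_test  # fit the vocabulary on train + test jointly

word_index, seq_train, seq_test = tokenize(texts, texts_train, texts_test)

# the returned sequences are ragged lists of word indices; pad them before an Embedding layer
x_train = pad_sequences(seq_train, maxlen=10)
x_test = pad_sequences(seq_test, maxlen=10)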
Example #2
def tokenizeAndGenerateIndex(train, test, maxFeatures, maxLength):
  merged = np.concatenate([train, test])
  tokenizer = Tokenizer(nb_words=maxFeatures)
  tokenizer.fit_on_texts(merged)
  sequences_train = tokenizer.texts_to_sequences(train)
  sequences_test = tokenizer.texts_to_sequences(test)
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))
  data_train = pad_sequences(sequences_train, maxlen=maxLength)
  data_test = pad_sequences(sequences_test, maxlen=maxLength)
  return data_train, data_test, word_index
def get_data_1(train_sents, maxlen):
    word_list = []
    for i in range(len(train_sents)):
        for words in train_sents[i]:
            word_list.append(words)
    
    sequence=[]
    stride=1
    # applying windowing for sequence generation

    for i in range(0,len(word_list)-maxlen,stride):
        line=word_list[i:i+maxlen]
        sequence.append(line)
    
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(sequence)
    seq=tokenizer.texts_to_sequences(sequence)
    vocab_len=len(tokenizer.word_index.items())+1
    
    seq=np.array(seq)
    x_train=seq[:,:-1]
    y_train=np.zeros((x_train.shape[0],x_train.shape[1],1))
    for i in range(x_train.shape[0]):
        for j in range(x_train.shape[1]):
            y_train[i,j,0]=seq[i,j+1]
        
    return x_train,y_train,vocab_len,tokenizer
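A toy call of get_data_1() with made-up sentences (assuming numpy and Tokenizer are imported as in the snippet): x_train holds the first maxlen-1 tokens of each window, y_train the same window shifted by one token.

train_sents = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
               ['the', 'dog', 'barked']]
x_train, y_train, vocab_len, tok = get_data_1(train_sents, maxlen=4)
print(x_train.shape)  # (5, 3): five windows, three input tokens each
print(y_train.shape)  # (5, 3, 1): the next-token target for every input position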
Example #4
def get_train_val_matrix(texts, labels, max_features=10000, max_len=100):
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(texts)
    sequens = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print(f'Found {len(word_index)} unique tokens')

    data = pad_sequences(sequens, maxlen=max_len)

    labels = np.asarray(labels)

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)

    data = data[indices]
    labels = labels[indices]

    train_sample_n = 20000
    validation_sample_n = 5000

    x_train = data[:train_sample_n]
    x_val = data[train_sample_n:validation_sample_n+train_sample_n]
    y_train = labels[:train_sample_n]
    y_val = labels[train_sample_n:validation_sample_n+train_sample_n]

    return (x_train, y_train), (x_val, y_val), word_index
def prepare_tokenizer(words):
    '''
        function to generate vocabulary of the given list of words
        implemented by Anindya
        @param
        words => the list of words to be tokenized
    '''
    # obtain a tokenizer
    t = Tokenizer(filters = '') # don't let keras ignore any words
    t.fit_on_texts(words)
    field_dict = dict(); rev_field_dict = dict()

    for key,value in t.word_index.items():
        field_dict[value] = key
        rev_field_dict[key] = value

    vocab_size = len(t.word_index) + 1

    ''' Small modification from Animesh
        # also add the '<unk>' token to the dictionary at 0th position
    '''
    field_dict[0] = '<unk>'; rev_field_dict['<unk>'] = 0

    #print (vocab_size)
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(words)

    # print "debug: " + str(encoded_docs)

    #print(padded_docs)
    return np.array(encoded_docs), field_dict, rev_field_dict, vocab_size
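A small, hypothetical round trip for prepare_tokenizer() above; the index values in the comments are only illustrative, since Keras assigns them by word frequency:

words = ['red', 'green', 'blue', 'green']
encoded, field_dict, rev_field_dict, vocab_size = prepare_tokenizer(words)
print(encoded.shape)            # (4, 1): one index per single-word document
print(field_dict[1])            # e.g. 'green' - the most frequent word gets index 1
print(rev_field_dict['green'])  # e.g. 1
print(vocab_size)               # 4: three distinct words plus the reserved <unk>/0 slot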
Example #6
def LoadSMILESData(duplicateProb = 0,seed=7):
    dataComp = dataset.LoadData('data',0)
    smiles = list(map(lambda x: x._SMILE, dataComp))
    tokenizer = Tokenizer(num_words=None, char_level=True)
    tokenizer.fit_on_texts(smiles)
    print(smiles[0])
    dictionary = {}
    i=0
    k=0
    for smile in smiles:
        i+=1
        for c in list(smile):
            k+=1
            if c in dictionary:
                dictionary[c]+=1
            else:
                dictionary[c]=1
    print(len(dictionary))
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(smiles)
    # pad sequences
    max_length = max([len(s) for s in smiles])
    vocab = {'C': 1, 'c': 2, '(': 3, ')': 4, 'O': 5, '=': 6, '1': 7, 'N': 8, '2': 9, '3': 10, '[': 11, ']': 12, 'F': 13, '4': 14, 'l': 15, 'n': 16, 'S': 17, '@': 18, 'H': 19, '5': 20, '+': 21, '-': 22, 'B': 23, 'r': 24, '\\': 25, '#': 26, '6': 27, '.': 28, '/': 29, 's': 30, 'P': 31, '7': 32, 'i': 33, 'o': 34, '8': 35, 'I': 36, 'a': 37, '%': 38, '9': 39, '0': 40, 'K': 41, 'e': 42, 'A': 43, 'g': 44, 'p': 45, 'M': 46, 'T': 47, 'b': 48, 'd': 49, 'V': 50, 'Z': 51, 'G': 52, 'L': 53}
    Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define vocabulary size (largest integer value)
    labels = list(map(lambda x: 1 if x.mutagen==True else 0,dataComp))
    return Xtrain, labels,vocab,max_length
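A standalone sketch of char_level=True, the option LoadSMILESData() relies on (toy strings, not real data); lower=False is added here because the default lower=True would fold 'C' and 'c' together, which matters for SMILES:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

smiles = ['CCO', 'c1ccccc1']  # toy SMILES strings
tok = Tokenizer(num_words=None, char_level=True, lower=False)
tok.fit_on_texts(smiles)
encoded = tok.texts_to_sequences(smiles)  # one integer per character
padded = pad_sequences(encoded, maxlen=max(len(s) for s in smiles), padding='post')
print(padded.shape)  # (2, 8)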
def preprocess_embedding():
    corpus_train, target, filenames = get_corpus()
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_train)
    sequences = tokenizer.texts_to_sequences(corpus_train)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    MAX_SEQUENCE_LENGTH = 50
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('/home/flippped/Desktop/xiangmu/baseline/GoogleNews-vectors-negative300.bin.gz', binary=True)
    word2vec_model.init_sims(replace=True)

    # create one matrix for documents words
    EMBEDDING_DIM = 300
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    print(embedding_matrix.shape)
    for word, i in word_index.items():
            try:
                embedding_vector = word2vec_model[str(word)]
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    embedding_matrix[i] = embedding_vector

            except:
                continue


    return data,target,filenames,embedding_matrix, word_index
Example #8
def tokenizeAndGenerateIndex(texts):
    tokenizer = Tokenizer(nb_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=maxlen,padding='post')
    return data
Example #9
    def handle(self, *args, **options):
        ptt = PTT.objects.all()
        ptt_json = PTTSerializer(ptt, many=True).data
        user_comments_times = dict()
        labels_index = 2
        labels = []
        texts = []
        for article in ptt_json:
            pointer = 1 if article['score'] > 0 else 0
            words = jieba.cut(article['contents'])
            for word in words:
                labels.append(pointer)
                texts.append(word)
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
        labels = to_categorical(np.asarray(labels))
        print('Shape of data tensor:', data.shape)
        print('Shape of label tensor:', labels.shape)
        print('Token word index:', tokenizer.word_index)
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])

        x_train = data[:-nb_validation_samples]
        y_train = labels[:-nb_validation_samples]
        x_val = data[-nb_validation_samples:]
        y_val = labels[-nb_validation_samples:]


        print('Training model.')

        # train a 1D convnet with global maxpooling
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
        x = Embedding(output_dim=100, input_dim=len(tokenizer.word_index) + 1, input_length=self.MAX_SEQUENCE_LENGTH)(sequence_input)  # +1: Tokenizer indices start at 1
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(35)(x)
        x = Flatten()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(labels_index, activation='softmax')(x)
        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])

        # happy learning!
        model.fit(x_train, y_train, validation_data=(x_val, y_val),
                  nb_epoch=2, batch_size=64)
        score = model.evaluate(x_val, y_val, verbose=0)
        print('Test score:', score[0])
        print('Test accuracy:', score[1])
def get_tokenizer(train_comments, nwords):
    print("getting tokenizer..")
    
    t = Tokenizer(num_words=nwords)
    texts = train_comments
    t.fit_on_texts(texts)
    sequences = t.texts_to_sequences(texts)
    
    return (t,sequences)
Example #11
def question_to_input(df_q1,df_q2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_q1 + df_q2)
    encoded_1 = tokenizer.texts_to_sequences(df_q1)
    encoded_2 = tokenizer.texts_to_sequences(df_q2)
    question_input_train = sequence.pad_sequences(encoded_1, maxlen=15)
    question_input_test = sequence.pad_sequences(encoded_2, maxlen=15)

    return question_input_train,question_input_test
Example #12
def test_tokenizer_oov_flag():
    """
    Test of Out of Vocabulary (OOV) flag in Tokenizer
    """
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 4  # discards 2 OOVs

    # With OOV feature
    tokenizer = Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 6  # OOVs marked in place
def keras_classify(df):
    # preprocessing: convert the words in text into integer indices
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing import sequence
    from keras.callbacks import EarlyStopping
    from sklearn.cross_validation import train_test_split

    print "----- Classification by Keras -----"
    max_features = 50000  # keep only the most important (most frequent) words
    # the Tokenizer can only handle str, not unicode
    textraw = map(lambda x: x.encode('utf-8'), df.seg_word.values.tolist())
    token = Tokenizer(nb_words=max_features)
    # since df.seg_word is space-separated, the Tokenizer can split it the same way as English text
    token.fit_on_texts(textraw)
    # token records each word's index and frequency; word indices replace the word text from textraw
    # e.g. textraw = ['a b c', 'c d e f']  ==> text_seq = [[1, 2, 3], [3, 4, 5, 6]]
    text_seq = token.texts_to_sequences(textraw)
    nb_classes = len(np.unique(df.label.values))
    print "num of features(vocabulary): ", len(token.word_counts)
    print "num of labels: ", nb_classes
    max_sent_len = np.max([len(s) for s in text_seq])
    print "max length or document is: ", max_sent_len
    median_sent_len = np.median([len(s) for s in text_seq])
    print "median length or document is: ", median_sent_len

    # .values must not be dropped from df.label.values, otherwise np_utils.to_categorical will fail later
    train_X, test_X, train_y, test_y = train_test_split(text_seq, df.label.values, train_size=0.7, random_state=1)
    # the rows of train_X & test_X (one document each) still have different lengths; they must be padded into an equal-length matrix before training
    seqlen = int(max_sent_len / 2 + median_sent_len / 2)
    X_train = sequence.pad_sequences(train_X, maxlen=seqlen, padding='post', truncating='post')
    X_test = sequence.pad_sequences(test_X, maxlen=seqlen, padding='post', truncating='post')
    # expand y into one-hot form so that the network can end with a softmax
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    model = build_cnn_model(max_features, seqlen, nb_classes)
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    # train for 10 epochs with mini-batches of 32; the earlystop callback checks when training is good enough
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=10, validation_split=0.1, callbacks=[earlystop])
    evaluate(earlystop.model, X_test, Y_test, test_y)

    model = build_lstm_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    model = build_mixed_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    graph = build_graph_model(max_features, seqlen, nb_classes)
    graph.fit({'input': X_train, 'output': Y_train}, nb_epoch=3, batch_size=32, validation_split=0.1)
    predict = graph.predict({'input': X_test}, batch_size=32)
    predict = predict['output']
    classes = predict.argmax(axis=1)
    acc = np_utils.accuracy(classes, test_y)
    print('Test accuracy: ', acc)
Example #14
def df2seq(df, nb_words):

    textraw = df.EssayText.values.tolist()
    textraw = [line.encode('utf-8') for line in textraw]  # keras needs str

    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    text_seq = token.texts_to_sequences(textraw)
    return(text_seq, df.Score1.values)
Example #15
    def word_to_index(self, text, tok=None):
        real_text = [' '.join(z) for z in text]
        if tok is None:
            tokenizer = Tokenizer(lower=False, filters=" ")
            tokenizer.fit_on_texts(real_text)
        else:
            tokenizer = tok
        # no loop is needed here; just pass the list of sentences (str) as input
        sequences = tokenizer.texts_to_sequences(real_text)
        # tokenizer.word_docs.items()
        return sequences, tokenizer
Example #16
def load_mr(nb_words=20000, maxlen=64, embd_type='self'):
    """
    :param embd_type: self vs. w2v
    :return:
    """

    train_size = 0.8

    df = pickled2df('data/mr.p')
    print(df.head())

    train_X, test_X, train_y, test_y = train_test_split(df.text.values.tolist(),
                                                        df.label.values,
                                                        train_size=train_size, random_state=1)
    train_X_wds = train_X
    test_X_wds = test_X

    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test  = np_utils.to_categorical(test_y, nb_classes)

    # tokenization should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)

    textraw = [line.encode('utf-8') for line in train_X+test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print('nb_words: ',len(token.word_counts))
    print('mean len: ',np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if(embd_type == 'self'):
        X_train = xcol_nninput_embd(train_X, nb_words, maxlen)
        X_test  = xcol_nninput_embd(test_X,  nb_words, maxlen)
    elif(embd_type == 'w2v'):
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test  = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')

    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
Example #17
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type):

    train_df = pd.read_csv(traincsv)
    test_df = pd.read_csv(testcsv)
    print(train_df.head())

    train_X = train_df.text.values.tolist()
    test_X = test_df.text.values.tolist()

    # save for w2v embd
    train_X_wds = train_X
    test_X_wds = test_X

    train_y = train_df.label.values
    test_y  = test_df.label.values
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    # tokenization should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)

    textraw = [line.encode('utf-8') for line in train_X+test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print('nb_words: ', len(token.word_counts))
    print('mean len: ', np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if(embd_type == 'self'):
        X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding='post', truncating='post')
        X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding='post', truncating='post')
    elif(embd_type == 'w2v'):
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')

    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return(X_train, Y_train, X_test, Y_test, nb_classes)
Example #18
    def trans_to_indeces(self, train_data, test_data):
        total_data = train_data + test_data
        tokenizer = Tokenizer(char_level=True)
        tokenizer.fit_on_texts(total_data)
        word_index = tokenizer.word_index

        embed_matrix = np.zeros([len(word_index) + 1, self.embed_size])
        if self.embed_path:
            for word, embeds in self._embeddings_generator(self.embed_path):
                if word in word_index:
                    embed_matrix[word_index[word]] = embeds


        train_sequences = tokenizer.texts_to_sequences(train_data)
        test_sequences = tokenizer.texts_to_sequences(test_data)
        lengths = [len(sequence) for sequence in train_sequences] + [len(sequence) for sequence in test_sequences]
        max_len = np.max(lengths)
        train_sequences = pad_sequences(train_sequences, maxlen=max_len)
        test_sequences = pad_sequences(test_sequences, maxlen=max_len)

        self.time_step = max_len

        return train_sequences, test_sequences, embed_matrix, word_index
Example #19
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type, w2v):

    train_df = pd.read_csv(traincsv)
    test_df = pd.read_csv(testcsv)
    print(train_df.head())

    train_X = train_df.text.values.tolist()
    test_X = test_df.text.values.tolist()

    # save for w2v embd
    train_X_wds = train_X
    test_X_wds = test_X

    train_y = train_df.label.values
    test_y = test_df.label.values
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    # tokenization should be applied on train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print("train len vs. test len", n_ta, n_ts)

    textraw = [line.encode("utf-8") for line in train_X + test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print("nb_words: ", len(token.word_counts))
    print("mean len: ", np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if embd_type == "self":
        X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding="post", truncating="post")
        X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding="post", truncating="post")
    elif embd_type == "w2v":
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print("wrong embd_type")

    print("X tensor shape: ", X_train.shape)
    print("Y tensor shape: ", Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
Example #20
def tokenize(texts, max_nb_words, max_sequence_length):
    '''converts preprocessed texts into a list with one entry per text; each entry is a list
    whose i-th element is the index of the i-th word in that text, as given by word_index'''
    tokenizer = Tokenizer(nb_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    print()

    print('Padding sequences')
    # pads the start of the sequences with zero, up to a length of 1000
    data = pad_sequences(sequences, maxlen=max_sequence_length)
    print()
    return data, word_index, tokenizer
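A brief, hypothetical follow-up showing why the fitted tokenizer is returned by tokenize() above: it lets new documents be encoded with the same word index and padded to the same length (texts stands for whatever preprocessed corpus was prepared earlier):

data, word_index, tokenizer = tokenize(texts, max_nb_words=20000, max_sequence_length=1000)
new_seqs = tokenizer.texts_to_sequences(['some previously unseen document'])
new_data = pad_sequences(new_seqs, maxlen=1000)  # same padding as the training data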
def prepare_tokenizer(words):
    t = Tokenizer()
    t.fit_on_texts(words)
    field_dict = dict()
    rev_field_dict = dict()
    for key, value in t.word_index.items():
        field_dict[value] = key
        rev_field_dict[key] = value
    vocab_size = len(t.word_index) + 1
    #print (vocab_size)
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(words)
    # pad documents to a fixed length (here 1 word)
    padded_docs = pad_sequences(encoded_docs, maxlen=1, padding='post')
    #print(padded_docs)
    return padded_docs, field_dict, rev_field_dict, vocab_size
Example #22
def load_labeled_data(datadir, tokenizer=None):
    print('Processing text dataset in {}'.format(datadir))
    texts = []  # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids
    for name in sorted(os.listdir(datadir)):
        if name == 'unsup':
            continue
        path = os.path.join(datadir, name)
        if os.path.isdir(path): # each label corresponds to a separate directory
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                if fname[0].isdigit():
                    fpath = os.path.join(path, fname)
                    if sys.version_info < (3,):
                        f = open(fpath)
                    else:
                        f = open(fpath, encoding='latin-1')
                    texts.append(f.read())
                    f.close()
                    labels.append(label_id)

    print('Found {} texts in {}.'.format(len(texts), datadir))

    # finally, vectorize the text samples into a 2D integer tensor
    if not tokenizer:
        tokenizer = Tokenizer(nb_words=MAX_FEATURES)
        tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels =  np.asarray(labels) # to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # randomize order
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    return data, labels, tokenizer
Example #23
def build_dataset(data):
    tokenizer = Tokenizer(nb_words=1000)
    all_review_user = ""
    for single_example in data:
        all_review_user += single_example['rev'].encode('utf-8')
    tokenizer.fit_on_texts(all_review_user)
    X = []
    y = []
    for single_example in data:
        rating = int(float(single_example['rat']))
        review_seq = tokenizer.texts_to_sequences(single_example['rev'].encode('utf-8'))
        # print review_seq
        x = list(itertools.chain(*review_seq))
        X.append(x)
        y.append(rating)
        # break
    # X = sequence.pad_sequences(X, maxlen=max_len)
    X = np.asarray(X)
    y = np.asarray(y)

    return X, y
Example #24
    def prodPadData(self, totalTextList, nb_words):
        '''
        produce padded word-sequence data
        
        the order of the total word sequence must correspond to the embedding matrix
        (in this function, totalTextList must be the same as the one used in
        prodPreWordEmbedingMat)
        '''
        
        MAX_NB_WORDS = int(nb_words / 1000) * 1000
        MAX_SEQUENCE_LENGTH = 20
        print('MAX_NB_WORDS: ' + str(MAX_NB_WORDS) + ' MAX_SEQUENCE_LENGTH: ' + str(MAX_SEQUENCE_LENGTH))
        
        # vectorize the text samples into a 2D integer tensor
        tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, lower=False)
#         for text in totalTextList:
#             print(text)
        tokenizer.fit_on_texts(totalTextList)
        totalSequences = tokenizer.texts_to_sequences(totalTextList)
        pad_data = pad_sequences(totalSequences, maxlen=MAX_SEQUENCE_LENGTH)
        
        return MAX_SEQUENCE_LENGTH, pad_data
def tokenize_and_process(text, vocab_size=10000):
    # Will hold clean text
    text_clean = []

    # List of stop words/ unwanted words
    stop = stopwords.words('english') + list(string.punctuation)

    for t in text:
        text_clean.append(" ".join([i for i in word_tokenize(t.lower()) if i not in stop and i[0] != "'"]))

    # Instantiate tokenizer
    T = Tokenizer(num_words=vocab_size)

    # Fit the tokenizer with text
    T.fit_on_texts(text_clean)

    # Turn our input text into sequences of index integers
    data = T.texts_to_sequences(text_clean)

    word_to_idx = T.word_index
    idx_to_word = {v: k for k, v in word_to_idx.items()}

    return data, word_to_idx, idx_to_word, T
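A hypothetical round trip through tokenize_and_process() above (it assumes the NLTK stopwords and punkt data are available): idx_to_word inverts the word index, so an encoded document can be decoded back to its cleaned tokens.

docs = ["The cat sat on the mat!", "Dogs bark loudly."]
data, word_to_idx, idx_to_word, T = tokenize_and_process(docs)
print(data[0])                            # e.g. [1, 2, 3] - indices of 'cat', 'sat', 'mat'
print([idx_to_word[i] for i in data[0]])  # back to the cleaned, stopword-free tokens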
Example #26
def prepare_word_tokenizer(texts):
    if not os.path.exists('data/tokenizer.pkl'): # check if a prepared tokenizer is available
        tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)  # if not, create a new Tokenizer
        tokenizer.fit_on_texts(texts)  # prepare the word index map

        with open('data/tokenizer.pkl', 'wb') as f:
            pickle.dump(tokenizer, f)  # save the prepared tokenizer for fast access next time

        print('Saved tokenizer.pkl')
    else:
        with open('data/tokenizer.pkl', 'rb') as f:  # simply load the prepared tokenizer
            tokenizer = pickle.load(f)
            print('Loaded tokenizer.pkl')

    sequences = tokenizer.texts_to_sequences(texts)  # transform text into integer indices lists
    word_index = tokenizer.word_index  # obtain the word index map

    print('Average sequence length: {}'.format(np.mean(list(map(len, sequences)), dtype=int)))  # compute average sequence length
    print('Max sequence length: {}'.format(np.max(list(map(len, sequences)))))  # compute maximum sequence length

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)  # pad the sequence to the user defined max length

    return (data, word_index)
Example #27
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text, result = out_word, result + ' ' + out_word
    return result


# source text
f = open('xaf', 'r')
data = f.read()

# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# create word -> word sequences
sequences = list()
for i in range(1, len(encoded)):
    sequence = encoded[i - 1:i + 1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))
# split into X and y elements
sequences = np.array(sequences)
X, y = sequences[:, 0], sequences[:, 1]
# one hot encode outputs
y = to_categorical(y, num_classes=vocab_size)
print("X_train len: {}".format(len(X_train)))
print("y_train len: {}".format(len(y_train)))
print("X_test len: {}".format(len(X_test)))
print("y_test len: {}".format(len(y_test)))
print("Split train and test data.")

# truncate and pad input sequences
max_text_length = 700 #7356 #from calculations

t = Tokenizer()
t.fit_on_texts(X_train)
vocab_size = len(t.word_index) + 1 #vocab_size; size of vocab of training text = 256994

# integer encode the documents
encoded_train = t.texts_to_sequences(X_train)
encoded_test = t.texts_to_sequences(X_test)

# pad documents to a max length
X_train = sequence.pad_sequences(encoded_train, maxlen=max_text_length)
X_test = sequence.pad_sequences(encoded_test, maxlen=max_text_length)
print("Padded data.")
#WORD EMBEDDINGS
embeddings_index = dict()
f = open('word_vectors/glove.6B/glove.6B.300d.txt', encoding='utf-8') #Glove data
#f = open('word_vectors/pub.50.vec/pub.50.vec', encoding='latin-1') #pubmed data
embedding_vector_length = 300

for line in f:
	values = line.split()
	word = values[0]
published_post = use_data['retweet'] == 1
published_post.sum()

# +
maxlen = 50
train = 0.7
validation = 0.1
max_words = 35000

# shuffle the data randomly
use_data_s = use_data.sample(frac=1, random_state=1)

# build the word index
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(use_data_s['tweet2'])
sequences = tokenizer.texts_to_sequences(use_data_s['tweet2'])

word_index = tokenizer.word_index
print("Found {} unique tokens.".format(len(word_index)))

data = pad_sequences(sequences, maxlen=maxlen)

# convert the labels to a binary (one-hot) matrix
categorical_labels = to_categorical(use_data_s['retweet'])
labels = np.asarray(categorical_labels)

print("Shape of data tensor:{}".format(data.shape))
print("Shape of label tensor:{}".format(labels.shape))

indices = [int(len(labels) * n) for n in [train, train + validation]]
x_train, x_validation, x_test = np.split(data, indices)
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=2500, split=' ')
tokenizer.fit_on_texts(x)

# Keras has a built-in API that makes it easier to prepare text for computation. The Tokenizer class has four attributes that can be used for feature preparation. Look at the example below to see what the tokenizer actually does.

## CODE
tokenizer = Tokenizer()
texts = [
    "The sun is shining in June!", "September is grey.",
    "Life is beautiful in August.", "I like it", "This and other things?"
]
tokenizer.fit_on_texts(texts)
print(tokenizer.word_index)
tokenizer.texts_to_sequences(["June is beautiful and I like it!"])
## OUTPUT
# {'sun': 3, 'september': 4, 'june': 5, 'other': 6, 'the': 7, 'and': 8, 'like': 9, 'in': 2, 'beautiful': 11, 'grey': 12, 'life': 17, 'it': 16, 'i': 14, 'is': 1, 'august': 15, 'things': 10, 'shining': 13, 'this': 18}
# [[5, 1, 11, 8, 14, 9, 16]]
# The tokenizer assigns an index value to every word in the sentences, and new sentences can be represented using those index values. Because the text corpus we use contains a large number of distinct words, we set an upper limit and only use the 2500 most frequent words.

from keras.preprocessing.sequence import pad_sequences

X = tokenizer.texts_to_sequences(x)
X = pad_sequences(X)
# Now we convert the text into numeric sequences as shown above and pad those sequences. Since sentences can have different lengths, their sequences differ in length as well, so pad_sequences finds the longest sentence and pads the shorter ones with 0 to match that length.

## Pad Sequences Example
pad_sequences([[1, 2, 3], [3, 4, 5, 6], [7, 8]])
# array([[0, 1, 2, 3],
#        [3, 4, 5, 6],
#        [0, 0, 7, 8]])
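A further small illustration (not from the original page) of how maxlen, padding and truncating interact:

pad_sequences([[1, 2, 3], [3, 4, 5, 6], [7, 8]], maxlen=3, padding='post')
# array([[1, 2, 3],
#        [4, 5, 6],
#        [7, 8, 0]])
# truncating defaults to 'pre', so the over-long second sequence loses its first element,
# while padding='post' appends zeros to the short one instead of prepending them.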
def data():
    start_train = '2008-08-08'
    end_train = '2014-12-31'
    start_val = '2015-01-02'
    end_val = '2016-07-01'
    max_sequence_length = 110
    vocab_size = 3000
    # read csv file
    DJIA = pd.read_csv("Combined_News_DJIA.csv",
                       usecols=[
                           'Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4',
                           'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10',
                           'Top11', 'Top12', 'Top13', 'Top14', 'Top15',
                           'Top16', 'Top17', 'Top18', 'Top19', 'Top20',
                           'Top21', 'Top22', 'Top23', 'Top24', 'Top25'
                       ])

    # create training and testing dataframe on 80 % and 20 % respectively
    Training_dataframe = DJIA[(DJIA['Date'] >= start_train)
                              & (DJIA['Date'] <= end_train)]
    Testing_dataframe = DJIA[(DJIA['Date'] >= start_val)
                             & (DJIA['Date'] <= end_val)]

    attrib = DJIA.columns.values

    x_train = Training_dataframe.loc[:, attrib[2:len(attrib)]]
    y_train = Training_dataframe.iloc[:, 1]

    x_test = Testing_dataframe.loc[:, attrib[2:len(attrib)]]
    y_test = Testing_dataframe.iloc[:, 1]

    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # merge the 25 news together to form a single signal
    merged_x_train = x_train.apply(lambda x: ''.join(str(x.values)), axis=1)
    merged_x_test = x_test.apply(lambda x: ''.join(str(x.values)), axis=1)

    # ===============
    # pre-process
    # ===============
    merged_x_train = merged_x_train.apply(lambda x: pp.process(x))
    merged_x_test = merged_x_test.apply(lambda x: pp.process(x))

    #merged_x_train = merged_x_train.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x))))
    #merged_x_test = merged_x_test.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x))))

    #merged_x_train = merged_x_train.apply(lambda x: pp.stemmer(x))
    #merged_x_test = merged_x_test.apply(lambda x: pp.stemmer(x))

    # remove stopwords in the training and testing set
    train_without_sw = []
    test_without_sw = []
    train_temporary = list(merged_x_train)
    test_temporary = list(merged_x_test)
    s = pp.stop_words
    for i in train_temporary:
        f = i.split(' ')
        for j in f:
            if j in s:
                f.remove(j)
        s1 = ""
        for k in f:
            s1 += k + " "
        train_without_sw.append(s1)
    merged_x_train = train_without_sw

    for i in test_temporary:
        f = i.split(' ')
        for j in f:
            if j in s:
                f.remove(j)
        s1 = ""
        for k in f:
            s1 += k + " "
        test_without_sw.append(s1)
    merged_x_test = test_without_sw

    # tokenize and create sequences
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(merged_x_train)
    x_train_sequence = tokenizer.texts_to_sequences(merged_x_train)
    x_test_sequence = tokenizer.texts_to_sequences(merged_x_test)

    word_index = tokenizer.word_index
    input_dim = len(word_index) + 1
    print('Found %s unique tokens.' % len(word_index))

    x_train_sequence = pad_sequences(x_train_sequence,
                                     maxlen=max_sequence_length)
    x_test_sequence = pad_sequences(x_test_sequence,
                                    maxlen=max_sequence_length)

    print('Shape of training tensor:', x_train_sequence.shape)
    print(x_train_sequence)
    print('Shape of testing tensor:', x_test_sequence.shape)
    print(x_test_sequence)
    """
    Data providing function:
    This function is separated from create_model() so that hyperopt
    won't reload data for each evaluation run.
    """
    return x_train_sequence, y_train, x_test_sequence, y_test
Example #32
class SummaryGeneratorClass:
    def __init__(self):
        self.news = []
        self.summaries = []

        bNews = "./BBC News Summary/News Articles/business/"
        eNews = "./BBC News Summary/News Articles/entertainment/"
        pNews = "./BBC News Summary/News Articles/politics/"
        sNews = "./BBC News Summary/News Articles/sport/"
        tNews = "./BBC News Summary/News Articles/tech/"
        self.readNews(bNews)
        self.readNews(eNews)
        self.readNews(pNews)
        self.readNews(sNews)
        self.readNews(tNews)

        bSumm = "./BBC News Summary/Summaries/business/"
        eSumm = "./BBC News Summary/Summaries/entertainment/"
        pSumm = "./BBC News Summary/Summaries/politics/"
        sSumms = "./BBC News Summary/Summaries/sport/"
        tSumm = "./BBC News Summary/Summaries/tech/"
        self.readSummaries(bSumm)
        self.readSummaries(eSumm)
        self.readSummaries(pSumm)
        self.readSummaries(sSumms)
        self.readSummaries(tSumm)

        self.contractionMapping = contractionMapping

        #         for i in range(len(self.news)):
        #             self.news[i] = self.textCleaner(self.news[i])
        #         for i in range(len(self.summaries)):
        #             self.summaries[i] = '_START_ '+ self.textCleaner(self.summaries[i]) + ' _END_'
        self.a = []
        self.b = []
        for i in range(len(self.news)):
            self.a.append(self.textCleaner(self.news[i]))
        for i in range(len(self.summaries)):
            self.b.append('beginmush ' + self.textCleaner(self.summaries[i]) +
                          ' endmush')
        self.df = pd.DataFrame({'Text': self.a, 'Summary': self.b})

    def readNews(self, directory):
        for filename in os.listdir(directory):
            with open(directory + filename, errors='replace') as infile:
                i = 1
                s = ""
                try:
                    for line in infile.readlines():
                        if i != 0:
                            if (line.isspace() == False):
                                s += str(line)
                        i += 1
#                     s = re.sub('\n', '', s)

                    s = re.sub('\'', '', s)
                    self.news.append(s)
                except:
                    print(filename + ' is throwing an error')

    def readSummaries(self, directory):
        for filename in os.listdir(directory):
            with open(directory + filename, errors='replace') as infile:
                i = 0
                s = ""
                try:
                    for line in infile.readlines():
                        if (line.isspace() == False):
                            s += str(line)


#                     s = re.sub('\n', '', s)
                    s = re.sub('\'', '', s)

                    self.summaries.append(s)
                except:
                    print(filename + ' is throwing an error')

    def textCleaner(self, string):
        stopWords = set(stopwords.words('english'))
        string = string.lower()

        string = ' '.join([
            self.contractionMapping[t] if t in self.contractionMapping else t
            for t in string.split(" ")
        ])

        #remove escape characters
        string = re.sub("(\\t)", ' ', str(string))
        string = re.sub("(\\r)", ' ', str(string))
        string = re.sub("(\\n)", ' ', str(string))

        #remove 's
        string = re.sub(r"'s\b", "", str(string))

        #remove extra spaces
        string = ' '.join(string.split())

        #remove punctuations
        string = re.sub("[^a-zA-Z]", " ", str(string))

        #remove short words
        tokens = [w for w in string.split() if not w in stopWords]
        long_words = []
        for i in tokens:
            if len(i) >= 3:  #removing short word
                long_words.append(i)
        string = (" ".join(long_words)).strip()

        return string

    def textCount(self):
        tCount = []
        summaryCount = []
        for string in self.df['Text']:
            tCount.append(len(string.split()))

        for sent in self.df['Summary']:
            summaryCount.append(len(sent.split()))

        graph = pd.DataFrame()
        graph['Text'] = tCount
        graph['Summary'] = summaryCount

        graph.hist(bins=100)
        plt.show()

        self.maxTextLen = 400
        self.maxSummaryLen = 200

        cnt = 0
        for i in self.df['Summary']:
            if (len(i.split()) <= self.maxSummaryLen):
                cnt = cnt + 1
        print(cnt / len(self.df['Summary']))

        cnt = 0
        for i in self.df['Text']:
            print(len(i.split()))
            print((self.maxTextLen))
            if (len(i.split()) <= self.maxTextLen):
                cnt = cnt + 1
        print(cnt / len(self.df['Text']))

    def filterDataFrameUsingMaxTextCountAndMaxSummaryCount(self):
        textArray = np.array(self.df['Text'])
        summaryArray = np.array(self.df['Summary'])

        shorterTextArray = []
        shorterSummaryArray = []

        for i in range(len(textArray)):
            if (len(summaryArray[i].split()) <= self.maxSummaryLen
                    and len(textArray[i].split()) <= self.maxTextLen):
                shorterTextArray.append(textArray[i])
                shorterSummaryArray.append(summaryArray[i])

        self.shorterDf = pd.DataFrame({
            'Text': shorterTextArray,
            'Summary': shorterSummaryArray
        })

    def splitData(self):
        self.xTrain, self.xVal, self.yTrain, self.yVal = train_test_split(
            np.array(self.shorterDf['Text']),
            np.array(self.shorterDf['Summary']),
            test_size=0.2,
            random_state=0,
            shuffle=True)

    def tokenizeTrainingData(self):
        tokenizerX = Tokenizer()
        tokenizerX.fit_on_texts(list(self.xTrain))
        thr = 4
        count = 0
        totalCount = 0
        freq = 0
        totalFreq = 0

        for key, value in tokenizerX.word_counts.items():
            totalCount += 1
            totalFreq += 1
            if (value < thr):
                count += 1
                freq += value

        print("% of rare words in vocabulary:", (count / totalCount) * 100)
        print("Total Coverage of rare words:", (freq / totalFreq) * 100)
        #Text Tokenizer
        self.tokenizerX = Tokenizer(num_words=totalCount - count)
        self.tokenizerX.fit_on_texts(list(self.xTrain))
        self.xTrainSeq = self.tokenizerX.texts_to_sequences(self.xTrain)
        self.xValSeq = self.tokenizerX.texts_to_sequences(self.xVal)
        self.xTrainSeq = pad_sequences(self.xTrainSeq,
                                       maxlen=self.maxTextLen,
                                       padding='post')
        self.xValSeq = pad_sequences(self.xValSeq,
                                     maxlen=self.maxTextLen,
                                     padding='post')
        self.xVocabularySize = self.tokenizerX.num_words + 1

        #Summary Tokenizer
        tokenizerY = Tokenizer()
        tokenizerY.fit_on_texts(list(self.yTrain))
        # reset the rare-word counters before scanning the summary vocabulary
        count = 0
        totalCount = 0
        freq = 0
        totalFreq = 0
        for key, value in tokenizerY.word_counts.items():
            totalCount += 1
            totalFreq += 1
            if (value < thr):
                count += 1
                freq += value

        print("% of rare words in vocabulary:", (count / totalCount) * 100)
        print("Total Coverage of rare words:", (freq / totalFreq) * 100)
        self.tokenizerY = Tokenizer(num_words=totalCount - count)

        self.tokenizerY.fit_on_texts(list(self.yTrain))

        self.yTrainSeq = self.tokenizerY.texts_to_sequences(self.yTrain)
        self.yValSeq = self.tokenizerY.texts_to_sequences(self.yVal)
        self.yTrainSeq = pad_sequences(self.yTrainSeq,
                                       maxlen=self.maxSummaryLen,
                                       padding='post')
        self.yValSeq = pad_sequences(self.yValSeq,
                                     maxlen=self.maxSummaryLen,
                                     padding='post')
        self.yVocabularySize = self.tokenizerY.num_words + 1

    def getSummaries(self):
        return self.summaries

    def getNews(self):
        return self.news

    def getDF(self):
        return self.df
train_df = pd.read_csv("data/train.csv")
print("Train shape : ", train_df.shape)
# fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values

train_X = train_df['question_text']
train_X = train_X.tolist()
qid_train = train_df['qid']
qid_train = qid_train.tolist()

# Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'’“”')

tokenizer.fit_on_texts(train_X)
train_X = tokenizer.texts_to_sequences(train_X)

# Pad the sentences
trunc = 'pre'
train_X = pad_sequences(train_X, maxlen=maxlen, truncating=trunc)

# Get the target values
train_y = train_df['target'].values
test_X = train_X[1000000:]
train_X = train_X[:1000000]
test_y = train_y[1000000:]
train_y = train_y[:1000000]

from scipy.io import loadmat
embedding_matrix = loadmat('data/embedding_matrix2.mat')["embedding_matrix"]
print(embedding_matrix.shape)
Example #34
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])
        is_duplicate.append(row['is_duplicate'])

print('Question pairs: %d' % len(question1))

# ## Build tokenized word index

# In[4]:

questions = question1 + question2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

# ## Download and process GloVe embeddings

# In[5]:

if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE):
    zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

print("Processing", GLOVE_FILE)
Example #35
def trainEmbeddingLayers(imgData):
    # define documents
    docs = imgData.de

    # define class labels
    labels = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    # prepare tokenizer
    t = Tokenizer()
    t.fit_on_texts(docs)
    vocab_size = len(t.word_index) + 1

    # integer encode the documents
    encoded_docs = t.texts_to_sequences(docs)
    print(encoded_docs)

    # pad documents to a max length of 4 words
    max_length = 4
    padded_docs = pad_sequences(encoded_docs,
                                maxlen=max_length,
                                padding='post')
    print(padded_docs)

    # load the whole embedding into memory
    embeddings_index = dict()
    f = open('../glove_data/glove.6B/glove.6B.100d.txt')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, 100))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # define model
    model = Sequential()
    e = Embedding(vocab_size,
                  100,
                  weights=[embedding_matrix],
                  input_length=4,
                  trainable=False)
    model.add(e)
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # summarize the model
    print(model.summary())

    # fit the model
    model.fit(padded_docs, labels, epochs=50, verbose=0)

    # evaluate the model
    loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
    print('Accuracy: %f' % (accuracy * 100))
def main():
    news = pd.read_csv('data/data_seged_monpa.csv')
    news_tag = news[['text', 'replyType', 'seg_text']]
    news_tag = news_tag[news_tag['replyType'] != 'NOT_ARTICLE']
    types = news_tag.replyType.unique()
    dic = {}
    for i, types in enumerate(types):
        dic[types] = i
    print(dic)
    news_tag['type_id'] = news_tag.replyType.apply(lambda x: dic[x])
    labels = news_tag.replyType.apply(lambda x: dic[x])
    news_tag = find_null(news_tag)
    X = news_tag.seg_text
    y = news_tag.type_id
    print(y.value_counts())
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    print(X_train.shape, 'training data ')
    print(X_test.shape, 'testing data')
    X_train = transfer_lsit(X_train)
    X_test = transfer_lsit(X_test)
    all_data = pd.concat([X_train, X_test])

    # embedding setting
    EMBEDDING_DIM = 100
    NUM_WORDS = 2764036
    vocabulary_size = NUM_WORDS
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    word_vectors = word2vec.Word2Vec.load("output/word2vec.model")
    embedding_matrix = to_embedding(EMBEDDING_DIM, NUM_WORDS, vocabulary_size,
                                    embedding_matrix, word_vectors, X_train,
                                    X_test)
    del (word_vectors)

    embedding_layer = Embedding(vocabulary_size,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                trainable=True)

    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'')
    tokenizer.fit_on_texts(all_data.values)

    train_text = X_train.values
    train_index = X_train.index

    sequences_train = tokenizer.texts_to_sequences(train_text)

    X_train = pad_sequences(sequences_train, maxlen=600)

    y_train = to_categorical(np.asarray(labels[train_index]))

    print('Shape of X train:', X_train.shape)
    print('Shape of label train:', y_train.shape)

    test_text = X_test.values
    test_index = X_test.index
    sequences_test = tokenizer.texts_to_sequences(test_text)
    X_test = pad_sequences(sequences_test, maxlen=X_train.shape[1])
    y_test = to_categorical(np.asarray(labels[test_index]))

    sequence_length = X_train.shape[1]
    filter_sizes = [2, 3, 4]
    num_filters = 128
    drop = 0.2
    penalty = 0.0001

    inputs = Input(shape=(sequence_length, ))
    embedding = embedding_layer(inputs)
    reshape = Reshape((sequence_length, EMBEDDING_DIM, 1))(embedding)

    conv_0 = Conv2D(num_filters, (filter_sizes[1], EMBEDDING_DIM),
                    activation='softmax',
                    kernel_regularizer=regularizers.l2(penalty))(reshape)
    conv_1 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM),
                    activation='relu',
                    kernel_regularizer=regularizers.l2(penalty))(reshape)
    conv_2 = Conv2D(num_filters, (filter_sizes[2], EMBEDDING_DIM),
                    activation='relu',
                    kernel_regularizer=regularizers.l2(penalty))(reshape)

    maxpool_0 = MaxPooling2D((sequence_length - filter_sizes[1] + 1, 1),
                             strides=(1, 1))(conv_0)

    maxpool_1 = MaxPooling2D((sequence_length - filter_sizes[2] + 1, 1),
                             strides=(1, 1))(conv_1)
    maxpool_2 = MaxPooling2D((sequence_length - filter_sizes[2] + 1, 1),
                             strides=(1, 1))(conv_2)

    merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
    dropout = Dropout(drop)(merged_tensor)
    flatten = Flatten()(dropout)
    reshape = Reshape((3 * num_filters, ))(flatten)
    output = Dense(units=2,
                   activation='softmax',
                   kernel_regularizer=regularizers.l2(penalty))(reshape)

    # this creates a model that includes the input and all the layers defined above
    model = Model(inputs, output)
    model.summary()

    adam = Adam(lr=1e-3)

    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc'])
    callbacks = [EarlyStopping(monitor='val_loss')]
    history = model.fit(X_train,
                        y_train,
                        batch_size=64,
                        epochs=50,
                        verbose=1,
                        validation_split=0.1,
                        callbacks=callbacks)

    predictions = model.predict(X_test)
    matrix = confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1))
    print(matrix)

    # Plot training & validation accuracy values
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'val'], loc='upper left')
    plt.savefig("output/acc.png")
    score, acc = model.evaluate(X_test, y_test)
    print('Test accuracy:', acc)

    plot_model(model,
               to_file='output/model.png',
               show_shapes=False,
               show_layer_names=False)
    question2 = []
    is_duplicate = []
    with open(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            question1.append(row['question1'])
            question2.append(row['question2'])
            is_duplicate.append(row['is_duplicate'])

    print('Question pairs: %d' % len(question1))

    # Build tokenized word index
    questions = question1 + question2
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(questions)
    question1_word_sequences = tokenizer.texts_to_sequences(question1)
    question2_word_sequences = tokenizer.texts_to_sequences(question2)
    word_index = tokenizer.word_index

    print("Words in index: %d" % len(word_index))

    # Download and process GloVe embeddings
    if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE):
        zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
        zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

    print("Processing", GLOVE_FILE)

    embeddings_index = {}
    with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
        for line in f:
Example #38
word_counter = collections.Counter([
    word for sentence in tqdm(X_train, total=len(X_train))
    for word in sentence.split()
])

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=None,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True,
                      split=' ',
                      char_level=False)

tokenizer.fit_on_texts(list(X_train))
tokenized_train = tokenizer.texts_to_sequences(X_train)
tokenized_test = tokenizer.texts_to_sequences(X_test)
word_index = tokenizer.word_index

vocab_size = len(word_index)
longest = max(len(seq) for seq in tokenized_train)
average = np.mean([len(seq) for seq in tokenized_train])
stdev = np.std([len(seq) for seq in tokenized_train])
max_len = int(average + stdev * 3)

processed_X_train = pad_sequences(
    tokenized_train, maxlen=max_len, padding='post',
    truncating='post')  # pad all sequences to the same length
processed_X_test = pad_sequences(tokenized_test,
                                 maxlen=max_len,
                                 padding='post',
print('nb_classes = %s' % nb_classes)

y_train = labeler.transform(raw_train_labels)
Y_train = np_utils.to_categorical(y_train, nb_classes)

y_valid = labeler.transform(raw_valid_labels)
Y_valid = np_utils.to_categorical(y_valid, nb_classes)

y_test = labeler.transform(raw_test_labels)
Y_test = np_utils.to_categorical(y_test, nb_classes)

print('Tokenizing X_train')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_train['NARRATIVE'])
tokenizer.word_index['BLANK_WORD'] = 0
X_train = tokenizer.texts_to_sequences(raw_train['NARRATIVE'])
X_valid = tokenizer.texts_to_sequences(raw_valid['NARRATIVE'])
X_test = tokenizer.texts_to_sequences(raw_test['NARRATIVE'])
X_train = pad_sequences(X_train, maxlen=max_len)
X_valid = pad_sequences(X_valid, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
print('X_train shape:', X_train.shape)
print('X_valid shape:', X_valid.shape)
print('X_test shape:', X_test.shape)


vocab_size = len(tokenizer.word_index)


model = Sequential()
model.add(LSTM(50, dropout_W=0.5, dropout_U=0.5, return_sequences=True, input_shape=(max_len, vocab_size)))

# %%
test_x.head()

# %% [markdown]
# ## Data Preprocessing

# %%
max_words = 20000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(list(train_x))


# %%
tokenized_train_x = tokenizer.texts_to_sequences(train_x)
tokenized_valid_x = tokenizer.texts_to_sequences(valid_x)
tokenized_test_x = tokenizer.texts_to_sequences(test_x)


# %%
len(tokenized_train_x), len(tokenized_valid_x), len(tokenized_test_x)


# %%
maxlen = 200
X_train = pad_sequences(tokenized_train_x, maxlen=maxlen)
X_valid = pad_sequences(tokenized_valid_x, maxlen=maxlen)
X_test = pad_sequences(tokenized_test_x, maxlen=maxlen)

    additional_features.append(feature_getter(i))

additional_features = np.asarray(additional_features)
for i in sentences:
    temp1 = np.zeros((1, EMBEDDING_DIM))
    for w in i:
        if (w in glove_emb):
            temp1 += glove_emb[w]
    temp1 /= len(i)
    doctovec.append(temp1.reshape(300, ))

doctovec = np.asarray(doctovec)

tokenizer = Tokenizer()  #num_words=MAX_NB_WORDS) #limits vocabulary size
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  #returns list of sequences
word_index = tokenizer.word_index  #dictionary mapping
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print(additional_features.shape, data.shape)

print('Shape of data tensor:', data.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
doctovec = doctovec[indices]
additional_features = additional_features[indices]
Beispiel #42
0
train = pd.read_csv(TRAIN_DATA)
test = pd.read_csv(TEST_DATA)
submission = pd.read_csv(SAMPLE_SUB)

# Replace missing values in training and test set
list_train = train["comment_text"].fillna("_na_").values
classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
y = train[classes].values
list_test = test["comment_text"].fillna("_na_").values

# Use Keras preprocessing tools
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(list_train))
tokenized_train = tok.texts_to_sequences(list_train)
tokenized_test = tok.texts_to_sequences(list_test)

# Pad vectors with 0s for sentences shorter than maxlen
X_t = pad_sequences(tokenized_train, maxlen=maxlen)
X_te = pad_sequences(tokenized_test, maxlen=maxlen)
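
# Standalone toy illustration (not part of the original snippet): pad_sequences
# left-pads with zeros by default and truncates from the left when a sequence
# is longer than maxlen.
from keras.preprocessing.sequence import pad_sequences
print(pad_sequences([[1, 2, 3], [4, 5]], maxlen=4))
# [[0 1 2 3]
#  [0 0 4 5]]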


# Read word vectors into a dictionary
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


embeddings_index = dict(
    get_coefs(*o.strip().split(" ")) for o in open(EMBEDDING))
Beispiel #43
0
    r_len.append(l)

MAX_REVIEW_LEN = np.max(r_len)
MAX_REVIEW_LEN


max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 10
num_classes=5


tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_text))
X_train = tokenizer.texts_to_sequences(X_train_text)
X_val = tokenizer.texts_to_sequences(X_val_text)
X_test = tokenizer.texts_to_sequences(test_text)


X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
print(X_train.shape,X_val.shape,X_test.shape)


model2= Sequential()
model2.add(Embedding(max_features,100,input_length=max_words))
model2.add(Dropout(0.2))

model2.add(Conv1D(64,kernel_size=3,padding='same',activation='relu',strides=1))
                    'encoding': 'latin-1'
                }
                with open(fpath, **args) as f:
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                labels.append(label_id)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
Beispiel #45
0
#wants a 23, but gets a 24

#first_text = list(csv.reader(szoveg, skipinitialspace=True))
#print (first_text[0:20]) # this way it returns individual characters
#print(list(csv.reader(szoveg, skipinitialspace=True))[0:4])

#
# VECTORIZING THE TEXT SAMPLES
#

# Tokenization - words are replaced by indices; wherever the same word occurs, the same number is used...
# the number assigned to each word is not random(?)!
tokenizer = Tokenizer(
    nb_words=MAX_NB_WORDS)  ## Tokenizes, keeping at most MAX_NB_WORDS words
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(
    texts)  # Each row holds the word indices of one text, one after another, as a list.

word_index = tokenizer.word_index  # word -> index mapping (number of distinct tokens)
print('Number of distinct words across all texts: ', len(word_index))
print('tokens: \n')
#print(word_index)

# ----------------------------------
import json

# as requested in comment
#word_index = {'word_index': word_index}

with open('target.txt', 'w') as file:
    txt = ""
    for key in word_index:
class ToxModel():
  """Toxicity model."""

  def __init__(self,
               model_name=None,
               model_dir=DEFAULT_MODEL_DIR,
               embeddings_path=DEFAULT_EMBEDDINGS_PATH,
               hparams=None):
    self.model_dir = model_dir
    self.embeddings_path = embeddings_path
    self.model_name = model_name
    self.model = None
    self.tokenizer = None
    self.hparams = DEFAULT_HPARAMS.copy()
    if hparams:
      self.update_hparams(hparams)
    if model_name:
      self.load_model_from_name(model_name)
    self.print_hparams()

  def print_hparams(self):
    print('Hyperparameters')
    print('---------------')
    for k, v in six.iteritems(self.hparams):
      print('{}: {}'.format(k, v))
    print('')

  def update_hparams(self, new_hparams):
    self.hparams.update(new_hparams)

  def get_model_name(self):
    return self.model_name

  def save_hparams(self, model_name):
    self.hparams['model_name'] = model_name
    with open(
        os.path.join(self.model_dir, '%s_hparams.json' % self.model_name),
        'w') as f:
      json.dump(self.hparams, f, sort_keys=True)

  def load_model_from_name(self, model_name):
    self.model = load_model(
        os.path.join(self.model_dir, '%s_model.h5' % model_name))
    self.tokenizer = six.moves.cPickle.load(
        open(
            os.path.join(self.model_dir, '%s_tokenizer.pkl' % model_name),
            'rb'))
    with open(
        os.path.join(self.model_dir, '%s_hparams.json' % self.model_name),
        'r') as f:
      self.hparams = json.load(f)

  def fit_and_save_tokenizer(self, texts):
    """Fits tokenizer on texts and pickles the tokenizer state."""
    self.tokenizer = Tokenizer(num_words=self.hparams['max_num_words'])
    self.tokenizer.fit_on_texts(texts)
    six.moves.cPickle.dump(
        self.tokenizer,
        open(
            os.path.join(self.model_dir, '%s_tokenizer.pkl' % self.model_name),
            'wb'))

  def prep_text(self, texts):
    """Turns text into into padded sequences.

    The tokenizer must be initialized before calling this method.

    Args:
      texts: Sequence of text strings.

    Returns:
      A tokenized and padded text sequence as a model input.
    """
    text_sequences = self.tokenizer.texts_to_sequences(texts)
    return pad_sequences(
        text_sequences, maxlen=self.hparams['max_sequence_length'])

  def load_embeddings(self):
    """Loads word embeddings."""
    embeddings_index = {}
    with open(self.embeddings_path) as f:
      for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

    self.embedding_matrix = np.zeros((len(self.tokenizer.word_index) + 1,
                                      self.hparams['embedding_dim']))
    num_words_in_embedding = 0
    for word, i in self.tokenizer.word_index.items():
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
        num_words_in_embedding += 1
        # words not found in embedding index will be all-zeros.
        self.embedding_matrix[i] = embedding_vector

  def train(self, training_data_path, validation_data_path, text_column,
            label_column, model_name):
    """Trains the model."""
    self.model_name = model_name
    self.save_hparams(model_name)

    train_data = pd.read_csv(training_data_path)
    valid_data = pd.read_csv(validation_data_path)

    print('Fitting tokenizer...')
    self.fit_and_save_tokenizer(train_data[text_column])
    print('Tokenizer fitted!')

    print('Preparing data...')
    train_text, train_labels = (self.prep_text(train_data[text_column]),
                                to_categorical(train_data[label_column]))
    valid_text, valid_labels = (self.prep_text(valid_data[text_column]),
                                to_categorical(valid_data[label_column]))
    print('Data prepared!')

    print('Loading embeddings...')
    self.load_embeddings()
    print('Embeddings loaded!')

    print('Building model graph...')
    self.build_model()
    print('Training model...')

    save_path = os.path.join(self.model_dir, '%s_model.h5' % self.model_name)
    callbacks = [
        ModelCheckpoint(
            save_path, save_best_only=True, verbose=self.hparams['verbose'])
    ]

    if self.hparams['stop_early']:
      callbacks.append(
          EarlyStopping(
              min_delta=self.hparams['es_min_delta'],
              monitor='val_loss',
              patience=self.hparams['es_patience'],
              verbose=self.hparams['verbose'],
              mode='auto'))

    self.model.fit(
        train_text,
        train_labels,
        batch_size=self.hparams['batch_size'],
        epochs=self.hparams['epochs'],
        validation_data=(valid_text, valid_labels),
        callbacks=callbacks,
        verbose=2)
    print('Model trained!')
    print('Best model saved to {}'.format(save_path))
    print('Loading best model from checkpoint...')
    self.model = load_model(save_path)
    print('Model loaded!')

  def build_model(self):
    """Builds model graph."""
    sequence_input = Input(
        shape=(self.hparams['max_sequence_length'],), dtype='int32')
    embedding_layer = Embedding(
        len(self.tokenizer.word_index) + 1,
        self.hparams['embedding_dim'],
        weights=[self.embedding_matrix],
        input_length=self.hparams['max_sequence_length'],
        trainable=self.hparams['embedding_trainable'])

    embedded_sequences = embedding_layer(sequence_input)
    x = embedded_sequences
    for filter_size, kernel_size, pool_size in zip(
        self.hparams['cnn_filter_sizes'], self.hparams['cnn_kernel_sizes'],
        self.hparams['cnn_pooling_sizes']):
      x = self.build_conv_layer(x, filter_size, kernel_size, pool_size)

    x = Flatten()(x)
    x = Dropout(self.hparams['dropout_rate'])(x)
    # TODO(nthain): Parametrize the number and size of fully connected layers
    x = Dense(128, activation='relu')(x)
    preds = Dense(2, activation='softmax')(x)

    rmsprop = RMSprop(lr=self.hparams['learning_rate'])
    self.model = Model(sequence_input, preds)
    self.model.compile(
        loss='categorical_crossentropy', optimizer=rmsprop, metrics=['acc'])

  def build_conv_layer(self, input_tensor, filter_size, kernel_size, pool_size):
    output = Conv1D(
        filter_size, kernel_size, activation='relu', padding='same')(
            input_tensor)
    if pool_size:
      output = MaxPooling1D(pool_size, padding='same')(output)
    else:
      # TODO(nthain): This seems broken. Fix.
      output = GlobalMaxPooling1D()(output)
    return output

  def predict(self, texts):
    """Returns model predictions on texts."""
    data = self.prep_text(texts)
    return self.model.predict(data)[:, 1]

  def score_auc(self, texts, labels):
    preds = self.predict(texts)
    return compute_auc(labels, preds)

  def summary(self):
    return self.model.summary()
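
# A minimal usage sketch for the ToxModel class above. The file paths, column
# names, and hparams overrides are hypothetical placeholders; only the
# constructor and the train/predict methods come from the class itself.
tox = ToxModel(hparams={'epochs': 3, 'batch_size': 128})
tox.train(
    training_data_path='data/tox_train.csv',    # hypothetical path
    validation_data_path='data/tox_valid.csv',  # hypothetical path
    text_column='comment_text',                 # hypothetical column name
    label_column='toxic',                       # hypothetical column name
    model_name='cnn_tox_v1')
print(tox.predict(['thanks for the helpful answer', 'this is awful and toxic']))
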
def tokenize(max_features,
             max_len,
             on='train',
             train_path='f:/avito/train.csv',
             test_path=None,
             tokenizer=None,
             clean_text=False,
             return_tokenizer=False,
             return_full_train=False):
    """
    Tokenize text.

    Read train and test data, process description feature, tokenize it.
    Parameters:
    - on: fit tokenizer on train or train + test;
    - train_path: path to train file;
    - test_path: past to test file;
    - max_features: tokenizer parameter;
    - max_len: tokenizer parameter;
    - tokenizer: can pass tokenizer with different parameters or use a default one;
    - clean_text: apply text cleaning or not;
    """
    # check that "on" has a correct value.
    assert on in ['train', 'all']

    print('Reading train data.')
    train = pd.read_csv(train_path, index_col=0)
    labels = train['deal_probability'].values
    train = train['description'].astype(str).fillna('')
    text = train

    # define tokenizer
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_features)

    if on == 'all':
        print('Reading test data.')
        test = pd.read_csv(test_path, index_col=0)
        test = test['description'].astype(str).fillna('')
        text = text.append(test)

    # clean text
    if clean_text:
        pass
        # print('Cleaning.')

    print('Fitting.')
    tokenizer.fit_on_texts(text)

    # split data
    X_train, X_valid, y_train, y_valid = train_test_split(train,
                                                          labels,
                                                          test_size=0.1,
                                                          random_state=23)
    print('Converting to sequences.')
    X_train = tokenizer.texts_to_sequences(X_train)
    X_valid = tokenizer.texts_to_sequences(X_valid)
    if test_path:
        test = tokenizer.texts_to_sequences(test)

    print('Padding.')
    X_train = sequence.pad_sequences(X_train, maxlen=max_len)
    X_valid = sequence.pad_sequences(X_valid, maxlen=max_len)
    if test_path:
        test = sequence.pad_sequences(test, maxlen=max_len)

    data = {}
    data['X_train'] = X_train
    data['X_valid'] = X_valid
    data['y_train'] = y_train
    data['y_valid'] = y_valid
    if test_path:
        data['test'] = test

    if return_tokenizer:
        data['tokenizer'] = tokenizer

    if return_full_train:
        X = np.concatenate([X_train, X_valid])
        y = np.concatenate([y_train, y_valid])
        data['X'] = X
        data['y'] = y

    return data
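
# A minimal usage sketch for the tokenize() helper above; max_features and
# max_len are just example values, and train_path is the default shown in the
# signature.
data = tokenize(max_features=100000, max_len=100, on='train',
                train_path='f:/avito/train.csv', return_tokenizer=True)
X_train, y_train = data['X_train'], data['y_train']
X_valid, y_valid = data['X_valid'], data['y_valid']
fitted_tokenizer = data['tokenizer']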
Beispiel #48
0
# Paths
tweets_path = "D:/tweets.csv"
w_emb_path = 'D:/GoogleNews-vectors-negative300.bin'

# Read data and preprocess
tweet_data = read_data(tweets_path)
tweet_data["Tweet"] = tweet_data["Tweet"].apply(preprocess_tweet)

tweet_data = Filter_tweets(tweet_data, True)

# Transform tweets to list of integers and add pad
number_of_features = 5000
tokenizer = Tokenizer(num_words=number_of_features, split=' ')
tokenizer.fit_on_texts(tweet_data['filtered_text'].values)
X = tokenizer.texts_to_sequences(tweet_data['filtered_text'].values)
X = pad_sequences(X)

word_index = tokenizer.word_index

embedding_dims = 300

# Load embeddings
model = gensim.models.KeyedVectors.load_word2vec_format(w_emb_path,
                                                        binary=True)

# Create embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dims))
for word, i in word_index.items():
    if word in model.wv.vocab:
        embedding_vector = model.wv[word]
        embedding_matrix[i] = embedding_vector
Beispiel #49
0
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('spam.csv', encoding='latin-1')
# Keeping only the necessary columns
data = data[['v2', 'v1']]

data['v2'] = data['v2'].apply(lambda x: x.lower())
data['v2'] = data['v2'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

print(data[data['v1'] == 'ham'].size)
print(data[data['v1'] == 'spam'].size)

max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['v2'].values)
X = tokenizer.texts_to_sequences(data['v2'].values)
print(X)
X = pad_sequences(X)
print(X)
embed_dim = 128
lstm_out = 196


def createmodel():
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
Beispiel #50
0
X=[]
sentences = list(tweets['text'])
for sen in sentences:
    X.append(preprocess_text(sen))
y = tweets['sentiment']

# convert sentiment labels: positive (4) -> 1, negative -> 0
y = np.array(list(map(lambda x: 1 if x == 4 else 0, y)))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

#Preparing the Embedding Layer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

maxlen = 69

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


new_model = keras.models.load_model('./lstm모델/DB2048_twitter100D_69_30_lstm_model.h5', compile=False)
new_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
new_model.summary()
loss, acc = new_model.evaluate(X_test, y_test, verbose=1)
    print('embedding_matrix shape', embedding_matrix.shape)
    # print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix


df = pd.read_csv(input_file, encoding="utf-8")

question1 = df['question1'].values
question2 = df['question2'].values
y = df['label'].values
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(question1) + list(question2))
list_tokenized_question1 = tokenizer.texts_to_sequences(question1)
list_tokenized_question2 = tokenizer.texts_to_sequences(question2)
X_train_q1 = pad_sequences(list_tokenized_question1, maxlen=MAX_TEXT_LENGTH)
X_train_q2 = pad_sequences(list_tokenized_question2, maxlen=MAX_TEXT_LENGTH)

inpath="test1.txt"
test_data1 = []
test_data2 = []
linenos=[]
import jieba
jieba.add_word('花呗')
jieba.add_word('借呗')
jieba.add_word('余额宝')

def seg(text):
    seg_list = jieba.cut(text)
Beispiel #52
0
dataset_df = read_csv(DATASET_PATH)
dataset_df = dataset_df.dropna()
dataset_df = dataset_df.sample(frac=1)

dataset_df.info()

print("Label Distributions: ", dataset_df['Label'].value_counts())

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

tokenizer.fit_on_texts(dataset_df['Log'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = tokenizer.texts_to_sequences(dataset_df['Log'].values)
X = sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(dataset_df['Label']).values
print('Shape of label tensor:', Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=42)

epochs = 5
batch_size = 1000

model = Sequential()
Beispiel #53
0
print('Size of the vocabulary:', total_cnt)
print('Number of rare words appearing at most %s times: %s' % (threshold - 1, rare_cnt))
print("Proportion of rare words in the vocabulary:", (rare_cnt / total_cnt) * 100)
print("Proportion of rare-word occurrences in the total word frequency:", (rare_freq / total_freq) * 100)

# In[ ]:

print('Maximum length:', max(len(l) for l in x_train))
print('Average length:', sum(map(len, x_train)) / len(x_train))

# In[ ]:

tokenizer = Tokenizer(vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# In[ ]:

max_len = 58

# In[ ]:

x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

# In[ ]:

import re
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
Beispiel #54
0
        continue

    input_line = '<sos> ' + line
    target_line = line + ' <eos>'

    input_texts.append(input_line)
    target_texts.append(target_line)

all_lines = input_texts + target_texts

# convert the sentences (strings) into integers
tokenizer = Tokenizer(
    num_words=MAX_VOCAB_SIZE, filters=''
)  # filters='' ensures that special characters like <sos> are not filtered out
tokenizer.fit_on_texts(all_lines)
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

# find max seq length
max_sequence_length_from_data = max(len(s) for s in input_sequences)
print('Max sequence length:', max_sequence_length_from_data)

# get word -> integer mapping (dictionary)
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))
assert ('<sos>' in word2idx)
assert ('<eos>' in word2idx)

# pad sequences so that we get a N x T matrix
max_sequence_length = min(max_sequence_length_from_data, MAX_SEQUENCE_LENGTH)
input_sequences = pad_sequences(input_sequences,
                                maxlen=max_sequence_length)
def model_word2vec(suffix="",suffix_fre=""):
    rtest=xlrd.open_workbook(filename="切割"+suffix+"test"+suffix_fre+".xls")
    rtrain=xlrd.open_workbook(filename="切割"+suffix+"train"+suffix_fre+".xls")
    r_vocall1=xlrd.open_workbook(filename="pre处理"+suffix+"test"+suffix_fre+".xls")
    r_vocall2=xlrd.open_workbook(filename="pre处理"+suffix+"train"+suffix_fre+".xls")
    sheet_test=rtest.sheet_by_index(0)
    sheet_train=rtrain.sheet_by_index(0)
    sheet1_vocall=r_vocall1.sheet_by_index(0)
    sheet2_vocall=r_vocall2.sheet_by_index(0)
    invocal1=sheet1_vocall.col_values(4)
    invocal2=sheet2_vocall.col_values(4)
    for i in range(0,len(invocal1)):
        if len(invocal1[i])==0:
            invocall=invocal1[:i]
            print("1")
            break
        
    for i in range(0,len(invocal2)):
        if len(invocal2[i])==0:
                print("1")
                invocal2=invocal2[:i]
                break
    for i in invocal2:
        if i not in invocall:
            invocall.append(i)
    print(len(invocall))
    vocall_size=len(invocall)
    class_num=sheet2_vocall.col_values(10)
    allclass=[]
    for i in range(1,len(class_num)):
        if class_num[i]!="":
            if class_num[i] not in allclass:
                allclass.append(class_num[i])
    print(allclass)

    for all_round in range(0,len(allclass)):
        for round in range(0,2):

            if round==1:
                ex_tag=sheet_test.col_values(6)
            xtrain=sheet_train.col_values(2+round*3)
            ztrain=sheet_train.col_values(0+round*3)
            ytrain=sheet_train.col_values(1+round*3)
            xtest=sheet_test.col_values(2+round*3)
            ztest=sheet_test.col_values(0+round*3)
            ytest=sheet_test.col_values(1+round*3)
        
            for i in range(0,len(xtrain)):
                if len(xtrain[i])==0:
                    xtrain=xtrain[:i]
                    ztrain=ztrain[:i]
                    ytrain=ytrain[:i]
                    break
            for i in range(0,len(xtest)):
                if len(xtest[i])==0:
                    xtest=xtest[:i]
                    ytest=ytest[:i]
                    ztest=ztest[:i]
                    break
            print(round*3)
            print(len(xtrain),"xtrain")
            print(len(ztrain),"ztrain")
            print(len(xtest),"xtest")
            print(len(ztest),"ztest")
            if round==1:
                other=sheet_train.cell(0,13).value
                other=int(other)
                print(other)
                if other==1:
                    xtrain=xtrain+sheet_train.col_values(9)
                    ytrain=ytrain+sheet_train.col_values(8)
                    ztrain=ztrain+sheet_train.col_values(7)
                    for i in range(0,len(xtrain)):
                        if len(xtrain[i])==0:
                            xtrain=xtrain[:i]
                            ztrain=ztrain[:i]
                            ytrain=ytrain[:i]
                            break

            tokenizer=Tokenizer(num_words=vocall_size)
            tokenizer.fit_on_texts(invocall)
            xtrain=tokenizer.texts_to_sequences(xtrain)
            xtest=tokenizer.texts_to_sequences(xtest)
            maxlen=0
            for i in xtrain:
                if len(i)>maxlen:
                    maxlen=len(i)
            for i in xtest:
                if len(i)>maxlen:
                    maxlen=len(i)
            print(maxlen,"maxlen")
            for i in range(0,len(ztest)):
                for n1 in range(0,len(allclass)):
                    if ztest[i]==allclass[n1]:
                        ztest[i]=n1
            for i in range(0,len(ztrain)):
                for n1 in range(0,len(allclass)):
                    if ztrain[i]==allclass[n1]:
                        ztrain[i]=n1
            xtrain=pad_sequences(xtrain,padding='post',maxlen=maxlen)
            xtest=pad_sequences(xtest,padding='post',maxlen=maxlen)
            print(len(ztrain),len(xtrain))
            print(len(ztest),len(xtest))
            for i in range(0,len(ztrain)):
                ztrain[i]=int(ztrain[i])
            for i in range(0,len(ztest)):
                ztest[i]=int(ztest[i])
            modelw2v = gensim.models.KeyedVectors.load("word2vec_150_lstm.model")
            embedding_matrix = np.zeros(shape=(vocall_size ,150))
            for i,word in enumerate(invocall):
                try:
                    embedding_vector = modelw2v[word]
                    embedding_matrix[i,:] = embedding_vector
                except KeyError:
                    pass
            embedding_size=150
            hidden_layer_size=64
            batch_size=128
            num_epochs=3
            model=Sequential()
            model.add(Embedding(vocall_size,embedding_size,weights=[embedding_matrix],input_length=maxlen))     
            model.add(SpatialDropout1D(0.2))
            model.add(LSTM(hidden_layer_size,dropout=0.2,recurrent_dropout=0.2))
            model.add(Dense(1))
            model.add(Activation("sigmoid"))
            model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
            model.summary()
        
            history=model.fit(xtrain,ztrain,epochs=1,batch_size=64)
            loss,accuracy=model.evaluate(xtest,ztest)
            print(loss,accuracy)
            """
            plt.subplot(211)
            plt.title("Accuracy"+suffix)
            plt.plot(history.history['acc'],color="g",label="Train")
      
            plt.legend(loc="best")

            plt.subplot(212)
            plt.title("Loss")
            plt.plot(history.history['loss'],color="g",label="Train")
       
            plt.legend(loc="best")

            plt.tight_layout()
            plt.show()
            """

            w=xlwt.Workbook()
            sheet2=w.add_sheet("准备文件",cell_overwrite_ok=True)
            sheet2.write(0,8,"predict")
            sheet2.write(0,9,"ztest")
            sheet2.write(0,10,"xtest")
            sheet2.write(0,11,"ex_tag")
            sheet2.write(0,4,"loss")
            sheet2.write(1,4,loss)
            sheet2.write(0,5,"acc")
            sheet2.write(1,5,accuracy)
            ypred=model.predict_classes(xtest,1)
            xtest=tokenizer.sequences_to_texts(xtest)
            for index in range(0,len(ypred)):
                sheet2.write(index,0,int(ypred[index][0]))
                sheet2.write(index,1,ztest[index])
                sheet2.write(index,2,xtest[index])
                if round==1:
                    sheet2.write(index,3,ex_tag[index])
            if round==0:
                w.save("result切割"+suffix+allclass[all_round]+suffix_fre+"w2v.xls")
            else:
                w.save("result扩充"+suffix+allclass[all_round]+suffix_fre+"w2v.xls")
    return allclass
Beispiel #56
0
    def __init__(
            self,
            embed_files,
            seq_length=320,  # 320
            embed_flag='crawl',
            sent_flag=False):
        self.train_file = TRAIN_DATA_FILE
        self.process_files = TRAIN_PROCESS_FILES
        self.test_file = TEST_DATA_FILE
        self.embed_file = embed_files[embed_flag]
        self.seq_length = seq_length

        print(f'read train data: {self.train_file} '
              f'and test data: {self.test_file}')
        self.train_df = pd.read_csv(self.train_file)
        self.test_df = pd.read_csv(self.test_file)

        self.train_df["comment_text"].fillna(NAN_WORD, inplace=True)
        self.test_df["comment_text"].fillna(NAN_WORD, inplace=True)

        sentences_train = self.train_df["comment_text"].values
        sentences_test = self.test_df["comment_text"].values
        self.y_train = self.train_df[CLASSES].values

        print(f'train sentences shape: {sentences_train.shape}')
        print(f'test sentences shape: {sentences_test.shape}')
        print(f'y train shape: {self.y_train.shape}')

        sentences_all = list(sentences_train)
        sentences_procs = []
        sentences_df_procs = []
        for train_pro in self.process_files:
            df = pd.read_csv(train_pro)
            df["comment_text"].fillna(NAN_WORD, inplace=True)
            sent = df["comment_text"].values
            sentences_procs.append(sent)
            sentences_all.extend(list(sent))
            sentences_df_procs.append(df)

        print('Tokenize sentences in train set and test set...')
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(sentences_all)

        if not sent_flag:
            tokenized_train = tokenizer.texts_to_sequences(sentences_train)
            self.x_train = pad_sequences(tokenized_train, maxlen=seq_length)

            tokenized_test = tokenizer.texts_to_sequences(sentences_test)
            self.x_test = pad_sequences(tokenized_test, maxlen=seq_length)

            self.x_procs = []
            for sent in sentences_procs:
                tokenized_procs = tokenizer.texts_to_sequences(sent)
                tokenized_procs = pad_sequences(tokenized_procs,
                                                maxlen=seq_length)
                self.x_procs.append(tokenized_procs)
        else:
            sentences_train = self.train_df["comment_text"].apply(
                lambda x: tokenize.sent_tokenize(x))
            sentences_test = self.test_df["comment_text"].apply(
                lambda x: tokenize.sent_tokenize(x))

            max_sent = 5
            self.x_train = self.sentenize(tokenizer, sentences_train, max_sent,
                                          seq_length)
            self.x_test = self.sentenize(tokenizer, sentences_test, max_sent,
                                         seq_length)

            self.x_procs = []
            for sent_df in sentences_df_procs:
                sentences_df = sent_df["comment_text"].apply(
                    lambda x: tokenize.sent_tokenize(x))
                tokenized_procs = self.sentenize(tokenizer, sentences_df,
                                                 max_sent, seq_length)
                self.x_procs.append(tokenized_procs)

        words_dict = tokenizer.word_index
        self.max_feature = len(words_dict) + 1

        print(f'Loading {embed_flag} embeddings...')
        if embed_flag == 'wiki':
            ft_model = load_model(self.embed_file)
            self.embed_dim = ft_model.get_dimension()
            self.embed_matrix = self.get_wiki_embed_matrix(
                words_dict, ft_model)
        elif embed_flag == 'crawl':
            embed_index = self.load_crawl_embed_index(self.embed_file)
            self.embed_dim = list(embed_index.values())[0].shape[0]  # 300
            self.embed_matrix = self.get_crawl_or_glove_embed_matrix(
                words_dict, embed_index)
        else:
            embed_index = self.load_glove_embed_index(self.embed_file)
            self.embed_dim = list(embed_index.values())[0].shape[0]  # 300
            self.embed_matrix = self.get_crawl_or_glove_embed_matrix(
                words_dict, embed_index)
Beispiel #57
0
full_df.subcat_1 = le.transform(full_df.subcat_1)

le.fit(full_df.subcat_2)
full_df.subcat_2 = le.transform(full_df.subcat_2)

del le

print("Transforming text data to sequences...")
raw_text = np.hstack([full_df.item_description.str.lower(), full_df.name.str.lower(), full_df.category_name.str.lower()])

print("   Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

print("   Transforming text to sequences...")
full_df['seq_item_description'] = tok_raw.texts_to_sequences(full_df.item_description.str.lower())
full_df['seq_name'] = tok_raw.texts_to_sequences(full_df.name.str.lower())
# full_df['seq_category'] = tok_raw.texts_to_sequences(full_df.category_name.str.lower())

del tok_raw


MAX_NAME_SEQ = 10 #17
MAX_ITEM_DESC_SEQ = 75 #269
MAX_CATEGORY_SEQ = 8 #8
MAX_TEXT = np.max([
    np.max(full_df.seq_name.max()),
    np.max(full_df.seq_item_description.max()),
#     np.max(full_df.seq_category.max()),
]) + 100
MAX_CATEGORY = np.max(full_df.category.max()) + 1
Beispiel #58
0
df["question_text"] = df["question_text"].progress_apply(preprocess)

n_words = len(vocab) + 1
del vocab
emb_file = "glove.840B.300d/glove.840B.300d.txt"
glove_dic = {}
for line in tqdm_notebook(open(emb_file)):
    temp = line.split(" ")
    glove_dic[temp[0]] = np.asarray(temp[1:], dtype="float32")

train, val = train_test_split(df, test_size=0.1)

tokenizer = Tokenizer(num_words=n_words)
tokenizer.fit_on_texts(list(train.question_text))

q_train = tokenizer.texts_to_sequences(train.question_text)
q_val = tokenizer.texts_to_sequences(val.question_text)

max_len = 65
q_train = pad_sequences(q_train, maxlen=max_len)
q_val = pad_sequences(q_val, maxlen=max_len)

y_train = train.target
y_val = val.target

del train, val
word_index = tokenizer.word_index
emb_size = glove_dic["."].shape[0]
emb_matrix = np.zeros((n_words, emb_size))
for w, index in word_index.items():
    if index >= n_words:
    data = normal_data + botnet_data
    data = [x[3:] for x in data if len(x) > 3]
    print ("normal", len(normal_data))
    print ("botnet", len(botnet_data))

    # Split sequences with spaces every 5 characters to convert them to words
    n = 5
    text = []
    for x in data:
        text.append(" ".join([x[i:i+n] for i in range(0, len(x), n)]))

    assert len(text) == len(data)
    tokenizer = Tokenizer(filters=stf_dataset.text_filter(), lower=False)
    tokenizer.fit_on_texts(text)
    seq = tokenizer.texts_to_sequences(text)
    print("text - seq", len(text), len(seq))
    print (tokenizer.word_index)
    print (len(max(seq, key=len)))
    mat = sequence.pad_sequences(seq, maxlen=500)
    _, max_word_index = max(tokenizer.word_index.items(), key=lambda x: x[1])
    print("max word index", max_word_index)
    input("..")
    assert len(data) == len(seq)
    data = list(zip(mat, y_data))

    # shuffle
    seed(1)
    shuffle(data)

    # split into training and testing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding

df = pd.read_csv('/home/charan/imdb_master.csv', encoding='latin-1')
print(df.head())
sentences = df['review'].values
y = df['label'].values

# tokenizing data
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(sentences)
max_review_len = max([len(s.split()) for s in sentences])
vocab_size = len(tokenizer.word_index) + 1
sentences = tokenizer.texts_to_sequences(sentences)
padded_docs = pad_sequences(sentences, maxlen=max_review_len)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(padded_docs,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=1000)

print(len(X_train))
# Number of features
# print(input_dim)
model = Sequential()

model.add(Embedding(vocab_size, 50, input_length=max_review_len))