def get_data_1(train_sents, maxlen):
    word_list = []
    for i in range(len(train_sents)):
        for words in train_sents[i]:
            word_list.append(words)
    
    sequence=[]
    stride=1
    # applying windowing for sequence generation

    for i in range(0,len(word_list)-maxlen,stride):
        line=word_list[i:i+maxlen]
        sequence.append(line)
    
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(sequence)
    seq=tokenizer.texts_to_sequences(sequence)
    vocab_len=len(tokenizer.word_index.items())+1
    
    seq=np.array(seq)
    x_train=seq[:,:-1]
    y_train=np.zeros((x_train.shape[0],x_train.shape[1],1))
    for i in range(x_train.shape[0]):
        for j in range(x_train.shape[1]):
            y_train[i,j,0]=seq[i,j+1]
        
    return x_train,y_train,vocab_len,tokenizer
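A minimal usage sketch for get_data_1 above (the toy corpus is hypothetical; it assumes a Keras/keras-preprocessing version whose Tokenizer accepts lists of tokens, which the function itself already relies on):

# Hypothetical toy example: train_sents is a list of tokenized sentences.
train_sents = [["the", "cat", "sat", "on", "the", "mat"],
               ["the", "dog", "sat", "on", "the", "rug"]]
x_train, y_train, vocab_len, tokenizer = get_data_1(train_sents, maxlen=4)
# Each row of x_train holds the first maxlen-1 word ids of a window,
# and y_train holds the same window shifted by one position (next-word targets).
print(x_train.shape, y_train.shape, vocab_len)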
Example No. 2
 def read_copus_generator(self, batch_size=64):
     """ return a generator with the specified batch_size
     """
     logger.info("Beigin read copus {0}".format(file_name))
     data = []
     index = 0
     with open(file_name, 'r') as fread:
         while True:
             line = fread.readline()
             if not line:
                 logger.info("Read End")
                 break
             data.append(line)
             index += 1
             if index % 100000 == 0:
                 logger.info("The program has processed {0} lines".format(index))
     tokenizer = Tokenizer(nb_words=30000)
     tokenizer.fit_on_texts(data)
     logger.info("word num: {0}".format(len(tokenizer.word_counts)))
     sorted_word_counts = sorted(
         tokenizer.word_counts.items(),
         key=operator.itemgetter(1),
         reverse=True)
     # save the word_counts to the meta
     with open(file_name.replace("train.", "meta."), "w") as fwrite:
         for word_cnt in sorted_word_counts:
             key = word_cnt[0]
             val = word_cnt[1]
             line = key + ":" + str(val) + "\n"
             fwrite.write(line)
     vectorize_data = tokenizer.texts_to_matrix(data)
     return vectorize_data
Example No. 3
def train(dataReader, oneHot, oneHotAveraged, contextHashes):
	n = (Epochs + 1) * SamplesPerEpoch  # TODO + 1 should not be needed

	tokeniser = Tokenizer(nb_words=MaxWords)
	tokeniser.fit_on_texts((row[0] for row in dataReader.trainingData(n)))

	# `word_index` maps each word to its unique index
	dictionarySize = len(tokeniser.word_index) + 1

	oneHotDimension        = (1 if oneHotAveraged else SequenceLength) * dictionarySize if oneHot else 0
	contextHashesDimension = dictionarySize * 2 if contextHashes else 0

	model = Sequential()
	model.add(Dense(EmbeddingDim, input_dim=(oneHotDimension + contextHashesDimension)))
	model.add(Dense(Labels, activation='softmax'))
	model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

	trainingGenerator   = mapGenerator(dataReader.trainingData(n),   tokeniser, dictionarySize, oneHot, oneHotAveraged, contextHashes)
	validationGenerator = mapGenerator(dataReader.validationData(n), tokeniser, dictionarySize, oneHot, oneHotAveraged, contextHashes)

	model.fit_generator(trainingGenerator,
		nb_epoch=Epochs,
		samples_per_epoch=SamplesPerEpoch,
		validation_data=validationGenerator,
		nb_val_samples=SamplesPerEpoch)

	model2 = Sequential()
	model2.add(Dense(EmbeddingDim, input_dim=(oneHotDimension + contextHashesDimension), weights=model.layers[0].get_weights()))

	return model, model2, tokeniser, dictionarySize
def get_fitted_tokenizer(df_train, df_test):
    comments_train = df_train[COMMENT_COL].values.tolist()
    comments_test = df_test[COMMENT_COL].values.tolist()
    tokenizer = Tokenizer()
    # tokenizer.num_words = MAX_NUM_WORDS
    tokenizer.fit_on_texts(comments_train + comments_test)
    return tokenizer
Example No. 5
def get_train_val_matrix(texts, labels, max_features=10000, max_len=100):
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(texts)
    sequens = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print(f'Found {len(word_index)} unique tokens')

    data = pad_sequences(sequens, maxlen=max_len)

    labels = np.asarray(labels)

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)

    data = data[indices]
    labels = labels[indices]

    train_sample_n = 20000
    validation_sample_n = 5000

    x_train = data[:train_sample_n]
    x_val = data[train_sample_n:validation_sample_n+train_sample_n]
    y_train = labels[:train_sample_n]
    y_val = labels[train_sample_n:validation_sample_n+train_sample_n]

    return (x_train, y_train), (x_val, y_val), word_index
def prepare_tokenizer(words):
    '''
        function to generate the vocabulary of the given list of words
        implemented by Anindya
        @param
        words => the list of words to be tokenized
    '''
    # obtain a tokenizer
    t = Tokenizer(filters = '') # don't let keras ignore any words
    t.fit_on_texts(words)
    field_dict = dict(); rev_field_dict = dict()

    for key,value in t.word_index.items():
        field_dict[value] = key
        rev_field_dict[key] = value

    vocab_size = len(t.word_index) + 1

    ''' Small modification from Animesh
        # also add the '<unk>' token to the dictionary at 0th position
    '''
    field_dict[0] = '<unk>'; rev_field_dict['<unk>'] = 0

    #print (vocab_size)
	# integer encode the documents
    encoded_docs = t.texts_to_sequences(words)

    # print "debug: " + str(encoded_docs)

	#print(padded_docs)
    return np.array(encoded_docs), field_dict, rev_field_dict, vocab_size
Example No. 7
def LoadSMILESData(duplicateProb = 0,seed=7):
    dataComp = dataset.LoadData('data',0)
    smiles = list(map(lambda x: x._SMILE, dataComp))
    tokenizer = Tokenizer(num_words=None, char_level=True)
    tokenizer.fit_on_texts(smiles)
    print(smiles[0])
    dictionary = {}
    i=0
    k=0
    for smile in smiles:
        i+=1
        for c in list(smile):
            k+=1
            if c in dictionary:
                dictionary[c]+=1
            else:
                dictionary[c]=1
    print(len(dictionary))
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(smiles)
    # pad sequences
    max_length = max([len(s) for s in smiles])
    vocab = {'C': 1, 'c': 2, '(': 3, ')': 4, 'O': 5, '=': 6, '1': 7, 'N': 8, '2': 9, '3': 10, '[': 11, ']': 12, 'F': 13, '4': 14, 'l': 15, 'n': 16, 'S': 17, '@': 18, 'H': 19, '5': 20, '+': 21, '-': 22, 'B': 23, 'r': 24, '\\': 25, '#': 26, '6': 27, '.': 28, '/': 29, 's': 30, 'P': 31, '7': 32, 'i': 33, 'o': 34, '8': 35, 'I': 36, 'a': 37, '%': 38, '9': 39, '0': 40, 'K': 41, 'e': 42, 'A': 43, 'g': 44, 'p': 45, 'M': 46, 'T': 47, 'b': 48, 'd': 49, 'V': 50, 'Z': 51, 'G': 52, 'L': 53}
    Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define vocabulary size (largest integer value)
    labels = list(map(lambda x: 1 if x.mutagen==True else 0,dataComp))
    return Xtrain, labels,vocab,max_length
Example No. 8
class SequenceTransformer(BaseEstimator, TransformerMixin):
    " Transforms np array of strings into sequences"

    def __init__(self, analyzer='word', max_features=10000, max_len=100):
        self.max_len = max_len
        self.analyzer = analyzer
        self.max_features = max_features
    

    def transform(self, X, y=None):

        try:
            getattr(self, "transformer_")
        except AttributeError:
            raise RuntimeError("You must fit transformer before using it!")

        X_seq = self.transformer_.texts_to_sequences(list(X))
        X_seq = sequence.pad_sequences(X_seq, maxlen=self.max_len)
        return X_seq


    def fit(self, X, y=None):

        if self.analyzer == 'char':
            char_level = True
        elif self.analyzer == 'word':
            char_level = False
        else:
            print("invalid analyzer")
            return

        self.transformer_ = Tokenizer(nb_words=self.max_features, lower=True, char_level = char_level)
        self.transformer_.fit_on_texts(X)

        return self
Example No. 9
class Featurizer:

    max_words = None
    tokenizer = None

    def __init__(self, max_words=1000):
        self.max_words = max_words
        self.tokenizer = Tokenizer(num_words=max_words)

    def fit_transform(self, data):
        texts = [l['text'] for l in data]
        self.tokenizer.fit_on_texts(texts)
        # remove words whose index exceeds the max_words limit
        self.tokenizer.word_index = {k: v for k, v in self.tokenizer.word_index.items() if v <= self.max_words}
        return self.transform(data)

    def transform(self, data):
        texts = [l['text'] for l in data]
        return self.tokenizer.texts_to_matrix(texts, mode='binary')

    def transform_inv(self, m):
        index = {v: k for k, v in self.tokenizer.word_index.items()} # word index by id
        return [[index.get(i) for i in np.nonzero(line)[0] if i in index] for line in m]

    def save(self, filepath):
        with open(filepath + '_word_index.json', 'w') as f:
            f.write(json.dumps(self.tokenizer.word_index))

    @classmethod
    def load(cls, filepath):
        with open(filepath + '_word_index.json', 'r') as f:
            word_index = json.load(f)
            c = cls(max_words=len(word_index))
            c.tokenizer.word_index = word_index
            return c
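A brief, hypothetical usage sketch for the Featurizer class above (the example documents are invented; it assumes the imports the class already relies on):

# Hypothetical example data in the expected format: a list of dicts with a 'text' key.
docs = [{'text': 'keras makes tokenizing easy'},
        {'text': 'tokenizing text with keras'}]
feat = Featurizer(max_words=1000)
matrix = feat.fit_transform(docs)     # binary document-term matrix
print(matrix.shape)                   # (2, 1000)
print(feat.transform_inv(matrix))     # the words present in each document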
def preprocess_embedding():
    corpus_train, target, filenames = get_corpus()
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus_train)
    sequences = tokenizer.texts_to_sequences(corpus_train)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    MAX_SEQUENCE_LENGTH = 50
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('/home/flippped/Desktop/xiangmu/baseline/GoogleNews-vectors-negative300.bin.gz', binary=True)
    word2vec_model.init_sims(replace=True)

    # create one matrix for documents words
    EMBEDDING_DIM = 300
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    print(embedding_matrix.shape)
    for word, i in word_index.items():
        try:
            embedding_vector = word2vec_model[str(word)]
            if embedding_vector is not None:
                # words not found in the embedding index will stay all-zeros.
                embedding_matrix[i] = embedding_vector
        except KeyError:
            continue


    return data,target,filenames,embedding_matrix, word_index
Example No. 11
def get_fitted_tokenizer(df_train, df_test):
    comments_train = df_train[COMMENT_COL].values.tolist()
    comments_test = df_test[COMMENT_COL].values.tolist()
    # keep '!' and '?'
    tokenizer = Tokenizer(filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(comments_train + comments_test)
    return tokenizer
Example No. 12
def test_tokenizer_unicode():
    texts = [u'ali veli kırk dokuz elli',
             u'ali veli kırk dokuz elli veli kırk dokuz']
    tokenizer = Tokenizer(num_words=5)
    tokenizer.fit_on_texts(texts)

    assert len(tokenizer.word_counts) == 5
Example No. 13
def tokenizeAndGenerateIndex(texts):
    tokenizer = Tokenizer(nb_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=maxlen,padding='post')
    return data
Example No. 14
def tokenaize(train_path, dev_path):
    with open(train_path) as fd:
        data = fd.read()
    with open(dev_path) as fd:
        data += fd.read()
    tokenizer = Tokenizer(split='\t', oov_token='<UNK>')
    tokenizer.fit_on_texts([data])
    return tokenizer
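A short, hypothetical sketch of how the oov_token passed above behaves (assuming a Keras version whose Tokenizer supports oov_token; the sample strings are invented):

# Hypothetical demonstration: tokens unseen during fitting map to the '<UNK>' index.
tok = Tokenizer(split='\t', oov_token='<UNK>')
tok.fit_on_texts(['hello\tworld'])
print(tok.word_index.get('<UNK>'))                      # usually 1
print(tok.texts_to_sequences(['hello\tunseen\tword']))  # unseen tokens become the <UNK> id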
Example No. 15
    def handle(self, *args, **options):
        ptt = PTT.objects.all()
        ptt_json = PTTSerializer(ptt, many=True).data
        user_comments_times = dict()
        labels_index = 2
        labels = []
        texts = []
        for article in ptt_json:
            pointer = 1 if article['score'] > 0 else 0
            words = jieba.cut(article['contents'])
            for word in words:
                labels.append(pointer)
                texts.append(word)
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
        labels = to_categorical(np.asarray(labels))
        print('Shape of data tensor:', data.shape)
        print('Shape of label tensor:', labels.shape)
        print('Token word index:', tokenizer.word_index)
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])

        x_train = data[:-nb_validation_samples]
        y_train = labels[:-nb_validation_samples]
        x_val = data[-nb_validation_samples:]
        y_val = labels[-nb_validation_samples:]


        print('Training model.')

        # train a 1D convnet with global maxpooling
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
        x = Embedding(output_dim=100, input_dim=len(tokenizer.word_index) + 1, input_length=self.MAX_SEQUENCE_LENGTH)(sequence_input)  # +1: Tokenizer indices start at 1
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(35)(x)
        x = Flatten()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(labels_index, activation='softmax')(x)
        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])

        # happy learning!
        model.fit(x_train, y_train, validation_data=(x_val, y_val),
                  nb_epoch=2, batch_size=64)
        score = model.evaluate(x_val, y_val, verbose=0)
        print('Test score:', score[0])
        print('Test accuracy:', score[1])
Example No. 16
def question_to_input(df_q1,df_q2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_q1 + df_q2)
    encoded_1 = tokenizer.texts_to_sequences(df_q1)
    encoded_2 = tokenizer.texts_to_sequences(df_q2)
    question_input_train = sequence.pad_sequences(encoded_1, maxlen=15)
    question_input_test = sequence.pad_sequences(encoded_2, maxlen=15)

    return question_input_train,question_input_test
Example No. 17
def tokenize(texts, texts_train, texts_test):
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    sequences_train = tokenizer.texts_to_sequences(texts_train)
    sequences_test = tokenizer.texts_to_sequences(texts_test)
    return word_index, sequences_train, sequences_test
Example No. 18
def get_tokenizer(train_comments, nwords):
    print("getting tokenizer..")
    
    t = Tokenizer(num_words=nwords)
    texts = train_comments
    t.fit_on_texts(texts)
    sequences = t.texts_to_sequences(texts)
    
    return (t,sequences)
def fit_tokenizer(fname, open_encoding='utf-8'):
    file = open(fname, 'r', encoding=open_encoding)
    text = file.read()
    file.close()
    texts = [text]
    # do not filter out low-frequency words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer
Example No. 20
def keras_classify(df):
    # preprocessing: convert the words in text into integer ids
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing import sequence
    from keras.callbacks import EarlyStopping
    from sklearn.cross_validation import train_test_split

    print "----- Classification by Keras -----"
    max_features = 50000  # 只选最重要的词
    # Tokenizer 只能处理 str,不能处理 unicode
    textraw = map(lambda x: x.encode('utf-8'), df.seg_word.values.tolist())
    token = Tokenizer(nb_words=max_features)
    # 由于 df.seg_word 以空格相隔,故此这里 Tokenizer 直接按英文方式处理 str 即可完成分词
    token.fit_on_texts(textraw)
    # token 中记录了每个词的编号和出现次数,这里使用词编号来代替 textraw 中的词文本
    # 如 textraw = ['a b c', 'c d e f']  ==> text_seq = [[1, 2, 3], [3, 4, 5, 6]]
    text_seq = token.texts_to_sequences(textraw)
    nb_classes = len(np.unique(df.label.values))
    print "num of features(vocabulary): ", len(token.word_counts)
    print "num of labels: ", nb_classes
    max_sent_len = np.max([len(s) for s in text_seq])
    print "max length or document is: ", max_sent_len
    median_sent_len = np.median([len(s) for s in text_seq])
    print "median length or document is: ", median_sent_len

    # the .values on df.label must not be omitted, otherwise np_utils.to_categorical below will fail
    train_X, test_X, train_y, test_y = train_test_split(text_seq, df.label.values, train_size=0.7, random_state=1)
    # train_X & test_X are still ragged (each row is one document); they must be padded to an equal-length matrix before training
    seqlen = int(max_sent_len / 2 + median_sent_len / 2)
    X_train = sequence.pad_sequences(train_X, maxlen=seqlen, padding='post', truncating='post')
    X_test = sequence.pad_sequences(test_X, maxlen=seqlen, padding='post', truncating='post')
    # expand y to one-hot so the network can end with a softmax
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    model = build_cnn_model(max_features, seqlen, nb_classes)
    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    # train for 10 epochs with mini-batches of 32; the earlystop callback checks whether validation loss has stopped improving
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=10, validation_split=0.1, callbacks=[earlystop])
    evaluate(earlystop.model, X_test, Y_test, test_y)

    model = build_lstm_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    model = build_mixed_model(max_features, seqlen, nb_classes)
    model.fit(X_train, Y_train, batch_size=32, nb_epoch=1, validation_split=0.1)
    evaluate(model, X_test, Y_test, test_y)

    graph = build_graph_model(max_features, seqlen, nb_classes)
    graph.fit({'input': X_train, 'output': Y_train}, nb_epoch=3, batch_size=32, validation_split=0.1)
    predict = graph.predict({'input': X_test}, batch_size=32)
    predict = predict['output']
    classes = predict.argmax(axis=1)
    acc = np_utils.accuracy(classes, test_y)
    print('Test accuracy: ', acc)
Example No. 21
def df2seq(df, nb_words):

    textraw = df.EssayText.values.tolist()
    textraw = [line.encode('utf-8') for line in textraw]  # keras needs str

    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    text_seq = token.texts_to_sequences(textraw)
    return(text_seq, df.Score1.values)
Example No. 22
def tokenizeAndGenerateIndex(train, test, maxFeatures, maxLength):
  merged = np.concatenate([train, test])
  tokenizer = Tokenizer(nb_words=maxFeatures)
  tokenizer.fit_on_texts(merged)
  sequences_train = tokenizer.texts_to_sequences(train)
  sequences_test = tokenizer.texts_to_sequences(test)
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))
  data_train = pad_sequences(sequences_train, maxlen=maxLength)
  data_test = pad_sequences(sequences_test, maxlen=maxLength)
  return data_train, data_test, word_index
def prepare_tokenizer(words, max_word_length = None):
    '''
        function to generate the vocabulary of the given list of words
        implemented by Anindya
        @param
        words => the list of words to be tokenized
    '''
    # flatten the words list:
    print("flattening the words into a single sequence ... ")
    flat_words = []; # initialize to empty list
    for i in range(len(words)):
        flat_words += words[i]
        if(i % 10000 == 0):
            print("joined", i, "examples")

    # obtain a tokenizer
    print("\nmaximum words to work with: ", max_word_length)
    t = Tokenizer(num_words = max_word_length, filters = '') # don't let keras ignore any words
    print("\nKeras's tokenizer kicks off ... ")
    t.fit_on_texts(flat_words)
    field_dict = dict(); rev_field_dict = dict()

    print("\nbuilding the dict and the rev_dict ... ")
    if(max_word_length is not None):
        vals = t.word_index.items()
        vals = sorted(vals, key=lambda x: x[1])
        for key,value in vals[:max_word_length - 1]:
            field_dict[value] = key
            rev_field_dict[key] = value
    else:
        for key,value in t.word_index.items():
            field_dict[value] = key
            rev_field_dict[key] = value


    ''' Small modification from Animesh
        # also add the '<unk>' token to the dictionary at 0th position
    '''
    field_dict[0] = '<unk>'; rev_field_dict['<unk>'] = 0


    print("\nencoding the words using the dictionary ... ")
    for i in range(len(words)):
        for j in range(len(words[i])):
            if(words[i][j] in rev_field_dict):
                words[i][j] = rev_field_dict[words[i][j]]
            else:
                words[i][j] = rev_field_dict['<unk>']

        if(i % 10000 == 0):
            print("encoded", i, "examples")

    vocab_size = len(field_dict)
    return words, field_dict, rev_field_dict, vocab_size
Example No. 24
 def word_to_index(self, text, tok=None):
     real_text = [' '.join(z) for z in text]
     if tok is None:
         tokenizer = Tokenizer(lower=False, filters=" ")
         tokenizer.fit_on_texts(real_text)
     else:
         tokenizer = tok
     # here do not need the loop, just put the list of sentences (str) as input
     sequences = tokenizer.texts_to_sequences(real_text)
     # tokenizer.word_docs.items()
     return sequences, tokenizer
Example No. 25
def save_tokenizer(question1, question2):
    questions = question1 + question2
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(questions)
    word_index = tokenizer.word_index
    print("Words in index: %d" % len(word_index))

    # save tokenizer
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return word_index
Example No. 26
def load_mr(nb_words=20000, maxlen=64, embd_type='self'):
    """
    :param embd_type: self vs. w2v
    :return:
    """

    train_size = 0.8

    df = pickled2df('data/mr.p')
    print(df.head())

    train_X, test_X, train_y, test_y = train_test_split(df.text.values.tolist(),
                                                        df.label.values,
                                                        train_size=train_size, random_state=1)
    train_X_wds = train_X
    test_X_wds = test_X

    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test  = np_utils.to_categorical(test_y, nb_classes)

    # tokenization should be applied to train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)

    textraw = [line.encode('utf-8') for line in train_X+test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print('nb_words: ',len(token.word_counts))
    print('mean len: ',np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if(embd_type == 'self'):
        X_train = xcol_nninput_embd(train_X, nb_words, maxlen)
        X_test  = xcol_nninput_embd(test_X,  nb_words, maxlen)
    elif(embd_type == 'w2v'):
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test  = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')

    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
Example No. 27
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type):

    train_df = pd.read_csv(traincsv)
    test_df = pd.read_csv(testcsv)
    print(train_df.head())

    train_X = train_df.text.values.tolist()
    test_X = test_df.text.values.tolist()

    # save for w2v embd
    train_X_wds = train_X
    test_X_wds = test_X

    train_y = train_df.label.values
    test_y  = test_df.label.values
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    # tokenization should be applied to train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)

    textraw = [line.encode('utf-8') for line in train_X+test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print('nb_words: ', len(token.word_counts))
    print('mean len: ', np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if(embd_type == 'self'):
        X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding='post', truncating='post')
        X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding='post', truncating='post')
    elif(embd_type == 'w2v'):
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print('wrong embd_type')

    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return(X_train, Y_train, X_test, Y_test, nb_classes)
Example No. 28
 def ch_to_index(self, text, tok=None):
     sequences = []
     if tok is None:
         tokenizer = Tokenizer(lower=False, char_level=True)
         all_of_them = [' '.join(z) for z in text]
         tokenizer.fit_on_texts(all_of_them)
     else:
         tokenizer = tok
     for words in text:
         characters = []
         for ch in tokenizer.texts_to_sequences_generator(words):
             characters.append(ch)
         sequences.append(characters)
     return sequences, tokenizer
Example No. 29
def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type, w2v):

    train_df = pd.read_csv(traincsv)
    test_df = pd.read_csv(testcsv)
    print(train_df.head())

    train_X = train_df.text.values.tolist()
    test_X = test_df.text.values.tolist()

    # save for w2v embd
    train_X_wds = train_X
    test_X_wds = test_X

    train_y = train_df.label.values
    test_y = test_df.label.values
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    # tokenization should be applied to train+test jointly
    n_ta = len(train_X)
    n_ts = len(test_X)
    print("train len vs. test len", n_ta, n_ts)

    textraw = [line.encode("utf-8") for line in train_X + test_X]  # keras needs str
    # keras deals with tokens
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print("nb_words: ", len(token.word_counts))
    print("mean len: ", np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if embd_type == "self":
        X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding="post", truncating="post")
        X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding="post", truncating="post")
    elif embd_type == "w2v":
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        print("wrong embd_type")

    print("X tensor shape: ", X_train.shape)
    print("Y tensor shape: ", Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
def word_freq(lines):
    """ Return a DataFrame sorted by word frequency in descending order.
        This word-frequency count is actually unused; the character-frequency
        function below is used instead.
    """
    # default filter is base_filter(), which is '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    # so e.g. a-b-c is not treated as one word but as the three words a, b, c
    # note also that no nb_words cap is set
    token = Tokenizer(filters='')
    # token can only accept str, not unicode
    token.fit_on_texts(map(lambda x: x.encode('utf-8'), lines))
    wc = token.word_counts
    df = pd.DataFrame({'word': map(lambda x: x.decode('utf-8'), wc.keys()), 'freq': wc.values()})
    df.sort('freq', ascending=False, inplace=True)
    df['idx'] = np.arange(len(wc))
    return df
Example No. 31
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 1))
vectorizer.fit(datas_word_train)
x_train_tfidf = vectorizer.transform(datas_word_train)
x_dev_tfidf = vectorizer.transform(datas_word_dev)
x_test_tfidf = vectorizer.transform(datas_word_test)
print("x_train_tfidf\t\tshape=(%s, %s)" %
      (x_train_tfidf.shape[0], x_train_tfidf.shape[1]))
print("x_dev_tfidf\t\tshape=(%s, %s)" %
      (x_dev_tfidf.shape[0], x_dev_tfidf.shape[1]))
print("x_test_tfidf\t\tshape=(%s, %s)" %
      (x_test_tfidf.shape[0], x_test_tfidf.shape[1]))
print()

# keras extract feature
tokenizer = Tokenizer()
tokenizer.fit_on_texts(datas_word_train)
# feature1: count
x_train_count = tokenizer.texts_to_matrix(datas_word_train, mode='count')
x_dev_count = tokenizer.texts_to_matrix(datas_word_dev, mode='count')
x_test_count = tokenizer.texts_to_matrix(datas_word_test, mode='count')
print("x_train_count\t\tshape=(%s, %s)" %
      (x_train_count.shape[0], x_train_count.shape[1]))
print("x_dev_count\t\tshape=(%s, %s)" %
      (x_dev_count.shape[0], x_dev_count.shape[1]))
print("x_test_count\t\tshape=(%s, %s)" %
      (x_test_count.shape[0], x_test_count.shape[1]))
print()

# feature2: binary
x_train_binary = tokenizer.texts_to_matrix(datas_word_train, mode='binary')
x_dev_binary = tokenizer.texts_to_matrix(datas_word_dev, mode='binary')
BATCH_SIZE = 128
NUM_EPOCHS = 20

lines = []
fin = open("../data/alice_in_wonderland.txt", "rb")
for line in fin:
    line = line.strip().decode("ascii", "ignore").encode("utf-8")
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()

sents = nltk.sent_tokenize(" ".join(lines))

tokenizer = Tokenizer(5000)  # use top 5000 words only
tokenizer.fit_on_texts(sents)  # fit_on_texts returns None; it updates the tokenizer in place
vocab_size = len(tokenizer.word_index) + 1

w_lefts, w_centers, w_rights = [], [], []
for sent in sents:
    embedding = one_hot(sent, vocab_size)
    triples = list(nltk.trigrams(embedding))
    w_lefts.extend([x[0] for x in triples])
    w_centers.extend([x[1] for x in triples])
    w_rights.extend([x[2] for x in triples])

ohe = OneHotEncoder(n_values=vocab_size)
Xleft = ohe.fit_transform(np.array(w_lefts).reshape(-1, 1)).todense()
Xright = ohe.fit_transform(np.array(w_rights).reshape(-1, 1)).todense()
X = (Xleft + Xright) / 2.0
Y = ohe.fit_transform(np.array(w_centers).reshape(-1, 1)).todense()
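The block above builds CBOW-style training data (averaged one-hot context vectors as X, the centre word as Y). A hypothetical model sketch that could consume it; the layer sizes are illustrative assumptions and not part of the original:

from keras.models import Sequential
from keras.layers import Dense

# Hypothetical CBOW-style model: project the averaged context one-hots into a
# small dense "embedding" space, then predict the centre word with a softmax.
model = Sequential()
model.add(Dense(300, input_dim=vocab_size, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")
model.fit(np.asarray(X), np.asarray(Y), batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)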
Example No. 33
def load_data(debug=False):
    if (os.path.exists(TRAIN_PICKLE) and os.path.exists(TEST_PICKLE)
            and os.path.exists(DEV_PICKLE)):

        with open(TRAIN_PICKLE, 'rb') as fp:
            X_train_1, X_train_2, Y_train = pickle.load(fp)
        with open(TEST_PICKLE, 'rb') as fp:
            X_test_1, X_test_2, Y_test = pickle.load(fp)
        with open(DEV_PICKLE, 'rb') as fp:
            X_dev_1, X_dev_2, Y_dev = pickle.load(fp)

    else:
        x_train_1, x_train_2, y_train = [], [], []
        x_test_1, x_test_2, y_test = [], [], []
        x_dev_1, x_dev_2, y_dev = [], [], []

        with open("snli_1.0_train.jsonl", encoding='utf8') as fp:
            for line in fp:
                try:
                    x_1, x_2, y = _formatting(line)
                    x_train_1.append(x_1)
                    x_train_2.append(x_2)
                    y_train.append(y)
                except KeyError:
                    continue

        with open("snli_1.0_test.jsonl", encoding='utf8') as fp:
            for line in fp:
                try:
                    x_1, x_2, y = _formatting(line)
                    x_test_1.append(x_1)
                    x_test_2.append(x_2)
                    y_test.append(y)
                except KeyError:
                    continue

        with open("snli_1.0_dev.jsonl", encoding='utf8') as fp:
            for line in fp:
                try:
                    x_1, x_2, y = _formatting(line)
                    x_dev_1.append(x_1)
                    x_dev_2.append(x_2)
                    y_dev.append(y)
                except KeyError:
                    continue

        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(x_train_1)
        tokenizer.fit_on_texts(x_train_2)
        tokenizer.fit_on_texts(x_test_1)
        tokenizer.fit_on_texts(x_test_2)
        tokenizer.fit_on_texts(x_dev_1)
        tokenizer.fit_on_texts(x_dev_2)

        X_train_1 = tokenizer.texts_to_sequences(x_train_1)
        X_train_2 = tokenizer.texts_to_sequences(x_train_2)
        X_test_1 = tokenizer.texts_to_sequences(x_test_1)
        X_test_2 = tokenizer.texts_to_sequences(x_test_2)
        X_dev_1 = tokenizer.texts_to_sequences(x_dev_1)
        X_dev_2 = tokenizer.texts_to_sequences(x_dev_2)

        MAX_SEQUENCE_LENGTH = max([
            len(seq) for seq in X_train_1 + X_train_2 + X_test_1 + X_test_2 +
            X_dev_1 + X_dev_2
        ])
        # print(X_train_1 + X_train_2 + X_test_1 + X_test_2 + X_dev_1 + X_dev_2)
        MAX_NB_WORDS = len(tokenizer.word_index) + 1

        if debug:
            print("MAX_SEQUENCE_LENGTH: {}".format(MAX_SEQUENCE_LENGTH))
            print("MAX_NB_WORDS: {}".format(MAX_NB_WORDS))

        X_train_1 = pad_sequences(X_train_1, maxlen=MAX_SEQUENCE_LENGTH)
        X_train_2 = pad_sequences(X_train_2, maxlen=MAX_SEQUENCE_LENGTH)
        X_test_1 = pad_sequences(X_test_1, maxlen=MAX_SEQUENCE_LENGTH)
        X_test_2 = pad_sequences(X_test_2, maxlen=MAX_SEQUENCE_LENGTH)
        X_dev_1 = pad_sequences(X_dev_1, maxlen=MAX_SEQUENCE_LENGTH)
        X_dev_2 = pad_sequences(X_dev_2, maxlen=MAX_SEQUENCE_LENGTH)

        Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
        Y_test = np_utils.to_categorical(y_test, NB_CLASSES)
        Y_dev = np_utils.to_categorical(y_dev, NB_CLASSES)

        with open(TRAIN_PICKLE, 'wb') as fp:
            pickle.dump((X_train_1, X_train_2, Y_train), fp)
        with open(TEST_PICKLE, 'wb') as fp:
            pickle.dump((X_test_1, X_test_2, Y_test), fp)
        with open(DEV_PICKLE, 'wb') as fp:
            pickle.dump((X_dev_1, X_dev_2, Y_dev), fp)

        with open(TOKENIZER_PICKLE, 'wb') as fp:
            pickle.dump(tokenizer, fp)

    return (X_train_1, X_train_2, Y_train, X_test_1, X_test_2, Y_test, X_dev_1,
            X_dev_2, Y_dev)
Example No. 34
text_list = []
count = 1
my_dict = {}
POS_TAG_SIZE = 14
Epoch = [3, 5]
Batch_size = [16, 32]
max_length = 40  #length of longest sentence
seed = 7
Embedding_Dim = 100
NUM_WORDS = 50000

# the file "g" is for reading the labels where as "f" have texts(tweets)
with open("File_name.txt", "r") as f:
    texts = f.readlines()
tokenizer = Tokenizer(NUM_WORDS)
tokenizer.fit_on_texts(texts)  # we fit tokenizer on texts we will process
sequences = tokenizer.texts_to_sequences(
    texts)  # here the conversion to tokens happens
word_index = tokenizer.word_index
invert = dict(map(reversed, word_index.items()))
#Text data for training with word embeddings features
data = pad_sequences(sequences, maxlen=max_length, padding='post')
with open("POS_Tag.txt", "r") as k, open("POS_Tag.txt", "r") as h:
    for line in k:
        line = line.lower()
        line = line.split()
        text_list.append(line)
        count += 1
    texts_1 = h.readlines()

tokenizer_POS = Tokenizer(POS_TAG_SIZE)
Example No. 35
 def get_tokenizer(self):
     tokenizer = Tokenizer()
     phoneme_list = get_phoneme_list()
     tokenizer.fit_on_texts(phoneme_list)
     return tokenizer
Example No. 36
from keras.preprocessing.text import Tokenizer

text = '나는 맛있는 밥을 먹었다'

token = Tokenizer()  # splits a sentence into words and assigns each word an index (numeric id)
token.fit_on_texts([text])

print(token.word_index)  # {'나는': 1, '맛있는': 2, '밥을': 3, '먹었다': 4}

x = token.texts_to_sequences([text])
print(x)  # [[1, 2, 3, 4]]
# problem: '나는' and '먹었다' get different numeric values, implying an unintended ordering/weighting

from keras.utils import to_categorical

word_size = len(token.word_index) + 1  # add index 0
x = to_categorical(x, num_classes=word_size)
print(x)
# [[[0. 1. 0. 0. 0.]                  # problem: as the vocabulary grows, the data (number of columns) explodes
#   [0. 0. 1. 0. 0.]
#   [0. 0. 0. 1. 0.]
#   [0. 0. 0. 0. 1.]]]
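To address the problem noted above (one-hot columns explode as the vocabulary grows), a trainable Embedding layer is the usual alternative; a minimal, hypothetical sketch (the embedding size of 8 is an arbitrary choice):

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

# Hypothetical sketch: map each word index to a dense 8-dimensional vector
# instead of a word_size-dimensional one-hot column.
seq = token.texts_to_sequences([text])             # [[1, 2, 3, 4]]
model = Sequential()
model.add(Embedding(input_dim=word_size, output_dim=8, input_length=4))
print(model.predict(np.array(seq)).shape)          # (1, 4, 8)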
Example No. 37
y_shuffle = y_rt[shuffled_rt]
print('X:', x_shuffle)
print('Y:', y_shuffle)
print(
    pairwise2.align.globalxx(x_shuffle[0],
                             x_shuffle[1],
                             one_alignment_only=True))

x_train, x_valid, y_train, y_valid = train_test_split(x_shuffle,
                                                      y_shuffle,
                                                      stratify=y_shuffle,
                                                      test_size=0.2)

print('x shape:', x_train.shape)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(get_vocab('atcgx'))
V = len(tokenizer.word_index) + 1
print('Num Words:', V)

#alignments2vec(x_train, y_train, V, tokenizer) #uncomment to train word2vec representation
'''
model = Sequential()
'''
'''
model.add(Conv1D(filters=64, kernel_size=word_length, input_shape=(None, word_length)))
model.add(Activation('relu'))
model.add(Conv1D(filters=64, kernel_size=3))
model.add(Activation('relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(Conv1D(128, 3, activation='relu'))
xf = xf.sample(frac=1)
train_data = xf[:900]
test_data = xf[900:]
df = train_data

x = len(df['Department'].unique())

MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 559
EMBEDDING_DIM = 300
EMBEDDING_FILE = "../GoogleNews-vectors-negative300.bin"

## Tokenizing and padding the data
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['Description'])
description_sequence = tokenizer.texts_to_sequences(df['Description'])
description_data = pad_sequences(description_sequence,MAX_SEQUENCE_LENGTH)
word_index = tokenizer.word_index

## Encoding the output labels
le = LabelEncoder()
df['target'] = le.fit_transform(df['Department'])
category = to_categorical(df['target'])
data = description_data



VALIDATION_SPLIT = 0.4
indices = np.arange(data.shape[0]) # get sequence of row index
np.random.shuffle(indices) # shuffle the row indexes
    def testEmbeddingLayer20NewsGroup(self):
        """
        Test Keras 'Embedding' layer returned by 'get_embedding_layer' function
        for a smaller version of the 20NewsGroup classification problem.
        """
        MAX_SEQUENCE_LENGTH = 1000

        # Prepare text samples and their labels

        # Processing text dataset
        texts = []  # list of text samples
        texts_w2v = []  # used to train the word embeddings
        labels = []  # list of label ids

        data = fetch_20newsgroups(
            subset='train',
            categories=['alt.atheism', 'comp.graphics', 'sci.space'])
        for index in range(len(data)):
            label_id = data.target[index]
            file_data = data.data[index]
            i = file_data.find('\n\n')  # skip header
            if i > 0:
                file_data = file_data[i:]
            try:
                curr_str = str(file_data)
                sentence_list = curr_str.split('\n')
                for sentence in sentence_list:
                    sentence = (sentence.strip()).lower()
                    texts.append(sentence)
                    texts_w2v.append(sentence.split(' '))
                    labels.append(label_id)
            except Exception:
                pass

        # Vectorize the text samples into a 2D integer tensor
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)

        # word_index = tokenizer.word_index
        data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
        labels = to_categorical(np.asarray(labels))

        x_train = data
        y_train = labels

        # prepare the embedding layer using the wrapper
        keras_w2v = self.model_twenty_ng
        keras_w2v.build_vocab(texts_w2v)
        keras_w2v.train(texts,
                        total_examples=keras_w2v.corpus_count,
                        epochs=keras_w2v.epochs)
        keras_w2v_wv = keras_w2v.wv
        embedding_layer = keras_w2v_wv.get_keras_embedding()

        # create a 1D convnet to solve our classification task
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        x = Conv1D(128, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(35)(x)  # global max pooling
        x = Flatten()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(y_train.shape[1], activation='softmax')(x)

        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])
        fit_ret_val = model.fit(x_train, y_train, epochs=1)

        # verify the type of the object returned after training
        # value returned is a `History` instance.
        # Its `history` attribute contains all information collected during training.
        self.assertTrue(type(fit_ret_val) == keras.callbacks.History)
Example No. 40
    string = re.sub(r"\"", "", string)
    return string.strip().lower()


input_data = pd.read_csv('labeledTrainData.tsv', sep='\t')

for idx in range(input_data.review.shape[0]):
    text = BeautifulSoup(input_data.review[idx], features="html5lib")
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    labels.append(input_data.sentiment[idx])

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), max_sentences, maxlen), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < max_sentences:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < maxlen and tokenizer.word_index[word] < max_words:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))
Example No. 41
                all_texts[filename] = f.read().replace("\n", "").lower()
        except:
            with codecs.open(path.join(text_path, filename),
                             encoding='latin-1') as f:
                all_texts[filename] = f.read().replace("\n", "").lower()
    return all_texts


all_texts = get_all_texts()
pattern = re.compile('[\W_]+')
all_texts_cleaned = {
    host: pattern.sub(' ', text)
    for host, text in all_texts.items()
}
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(all_texts_cleaned.values())

try:
    wget.download(
        "http://embeddings.net/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin",
        'frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin')
except:
    pass

embeds = KeyedVectors.load_word2vec_format(
    'frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin', binary=True)
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, d))

for word, i in word_index.items():
    if i >= max_words:
Example No. 42
from keras import layers
import matplotlib.pyplot as plt
from keras import optimizers
from keras.preprocessing.text import Tokenizer


amount=60000
newsData = pd.read_json('../../_data/News_Category_Dataset_v2.json', lines=True)
newsData=newsData.drop(columns=['date','link'],axis=1)
newsData= newsData.dropna(how='any')


newsData.category=newsData.category.map(lambda x: "WORLDPOST" if x == "THE WORLDPOST" else x)
newsData['text'] = newsData.headline + " " + newsData.short_description
tokenizer = Tokenizer()
tokenizer.fit_on_texts(newsData.text)
newsData['words'] = tokenizer.texts_to_sequences(newsData.text)
# print(newsData.loc[:100,'words'])

def vectorize_sequences(sequences):
    dimension=10000
    results = np.zeros((amount, dimension))
    for i in range(amount):
        for k in sequences[i]:
            if(k<10000):
                results[i,k] = 1.
    return results

inputData=vectorize_sequences(newsData.words)
print(inputData[:4,:10])
print(newsData.words[:4])
Example No. 43
from keras.callbacks import TensorBoard
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('Sentiment.csv')
# Keeping only the neccessary columns
data = data[['text', 'sentiment']]

data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x)))

for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

X = pad_sequences(X)

embed_dim = 128
lstm_out = 196


def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
Example No. 44
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing

samples = ['种植 牙 牙周病 治疗 修复 及口 内 治疗', '乳腺 肿瘤 及 乳房 整形 领域 的 手术 消化系统 等 常见 肿瘤 的 诊治']
tokenizer = Tokenizer(num_words=20)  # create a tokenizer, configured to consider only the num_words most common words
tokenizer.fit_on_texts(samples)  # build the word index
sequences = tokenizer.texts_to_sequences(samples)  # convert the strings into lists of integer indices
print(sequences)
one_hot_results = tokenizer.texts_to_matrix(
    samples, mode='binary')  # directly get the one-hot binary representation
print(one_hot_results.tolist())

# besides one-hot encoding, the tokenizer also supports other vectorization modes
word_index = tokenizer.word_index  # get the word index
print('Found %s unique tokens.' % len(word_index))
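As the comment above says, texts_to_matrix supports vectorization modes beyond 'binary'; a short sketch using the same fitted tokenizer:

# 'count' gives raw term counts, 'freq' normalizes counts by document length,
# and 'tfidf' applies tf-idf weighting.
count_results = tokenizer.texts_to_matrix(samples, mode='count')
freq_results = tokenizer.texts_to_matrix(samples, mode='freq')
tfidf_results = tokenizer.texts_to_matrix(samples, mode='tfidf')
print(count_results.shape, freq_results.shape, tfidf_results.shape)  # each is (2, 20)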
Example No. 45
            new_prob = new_prob.replace(curr_entities[j], repl_entity)
        if (new_prob not in all_problems):
            all_problems.append(new_prob)
            tr_x += [x[i]]
            tr_y += [new_prob]
        if (new_prob1 not in all_problems):
            all_problems.append(new_prob1)
            tr_x += [x[i]]
            tr_y += [new_prob]

    return tr_x, tr_y, all_problems


tr_x, tr_y, all_problems = create_data(data)
tokenizer = Tokenizer(nb_words=100, lower=True, split=' ')
tokenizer.fit_on_texts(all_problems)
#print(tokenizer.word_index)  # To see the dicstionary

TR_X = tokenizer.texts_to_sequences(tr_x)
TR_X = pad_sequences(TR_X, maxlen=40)

TR_Y = tokenizer.texts_to_sequences(tr_y)
TR_Y = pad_sequences(TR_Y, maxlen=40)

encode_data = tokenizer.texts_to_sequences(data)
encode_data = pad_sequences(encode_data, maxlen=40)

word_index = tokenizer.word_index

embeddings_index = {}
f = open('glove.6B.50d.txt')
Example No. 46
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
    for file_name in os.listdir(dir_name):
        if file_name[-4:] == '.txt':
            f = open(os.path.join(dir_name, file_name), encoding='utf8')
            texts.append(f.read())
            f.close()
            labels.append(label_type == 'pos')
# endregion

# region tokenizing the text
max_length = 100
training_samples = 2000
validation_samples = 10000
max_words = 10000

tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(texts)
sequences = tok.texts_to_sequences(texts)

word_index = tok.word_index
print('Found %s unique tokens.' % len(word_index))

# zero padding
data = pad_sequences(sequences, maxlen=max_length)

labels = np.asarray(labels)
print('Shape of data tensor : ', data.shape)
print('Shape of label tensor : ', labels.shape)

# shuffle the data/labels
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
Example No. 48
    df = pd.DataFrame({"Reviews": Train_data_cleaned, "Labels": y_labels})

    # 2. Train your network

    # Training the word2vec model

    word_sentences = [
        nltk.word_tokenize(sentence) for sentence in df["Reviews"]
    ]
    W2v = Word2Vec(word_sentences, size=400, window=10, min_count=10)
    embedding_vectors = W2v.wv.vectors

    # use Keras preprocessing to convert the reviews into integer vectors
    tokens = Tokenizer(num_words=embedding_vectors.shape[0])
    tokens.fit_on_texts(df["Reviews"])
    pkl.dump(tokens, open("models/tokens.pkl", "wb"))

    encoded_docs_train = tokens.texts_to_sequences(df["Reviews"])
    max_length = 450
    padded_docs = pad_sequences(encoded_docs_train,
                                maxlen=max_length,
                                padding='pre')

    y_train = np.array(df["Labels"])

    embedding_layer = Embedding(input_dim=embedding_vectors.shape[0],
                                output_dim=embedding_vectors.shape[1],
                                weights=[embedding_vectors],
                                trainable=True,
                                input_length=450)
Example No. 49
for sentence in df["comment"]:
    seg_list = jieba.cut(sentence.replace(" ", ""), cut_all=False)
    x_word.write(" ".join(seg_list).encode('utf-8'))
    x_word.write(b'\n')

x_word.close()

x_word = list()
f = open(wordlist_path, "r")
for line in f:
    x_word.append(line[:-1])

MAX_SEQUENCE_LENGTH = 100

tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(x_word)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

count_thres = 3
low_count_words = [
    w for w, c in tokenizer.word_counts.items() if c < count_thres
]
for w in low_count_words:
    del tokenizer.word_index[w]
    del tokenizer.word_docs[w]
    del tokenizer.word_counts[w]
sequences = tokenizer.texts_to_sequences(x_word)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
Example No. 50
# built with Conv1D

from keras.preprocessing.text import Tokenizer
import numpy as np

docs = [
    "너무 재밋어요", "참 최고에요", "참 잘 만든 영화에요", "추천하고 싶은 영화입니다", "한 번 더 보고 싶네요", "글쎄요",
    "별로에요", "생각보다 지루해요", "연기가 어색해요", "재미없어요", "너무 재미없다", "참 재밋네요"
]

# positive = 1, negative = 0
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

# tokenization
token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)

x = token.texts_to_sequences(docs)
print('x : ', x)

from keras.preprocessing.sequence import pad_sequences

pad_x = pad_sequences(x, padding='pre')  # e.g. zeros are padded at the front: 0 0 0 3 7
print('pad_x :', pad_x)  # (12, 5)
pad_x = pad_x.reshape(12, 5, 1)
print('pad_x :', pad_x)

word_size = len(token.word_index) + 1
print("전체 토큰 사이즈 : ", word_size)  # 25 전체 단어의 갯수
Example No. 51
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
Example No. 52
batch_size = 64
embedding_dims = 50
epochs = 100

print('Loading data...')

with open("./dialog_seg.pkl", "rb") as f:
    dialog = pickle.load(f)
f.close()

with open("./label_index_onehot.pkl", "rb") as f:
    label = pickle.load(f)
f.close()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(dialog)
sequences = tokenizer.texts_to_sequences(dialog)
# print(sequences)
# exit()
x_train, x_test, y_train, y_test = train_test_split(sequences,
                                                    label,
                                                    test_size=0.3333,
                                                    random_state=42)
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(
    np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(
    np.mean(list(map(len, x_test)), dtype=int)))
Example No. 53
y = np.array(list(y))

# create empty list for X
X = []
reviews = list(data['reviews'])
# copy all the reviews into X
for review in reviews:
    X.append(review)

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
""" Text Processing """
# tokenize words
# create embedded layers - convert sentences to number
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

train_embedded = tokenizer.texts_to_sequences(X_train)
test_embedded = tokenizer.texts_to_sequences(X_test)

# the corpus contains 15163 unique words
vocab_size = len(tokenizer.word_index) + 1

## make all the sentences of uniform size - the length of longest sentences
# find out the longest sentences and get the length - length should be uniformed in training and testing dataset - use longest length
longest_sent = max(X_train, key=lambda sent: len(nltk.word_tokenize(sent)))
len_longest = len(nltk.word_tokenize(longest_sent))

# increase the length of sentences by padding
padding_sent = pad_sequences(train_embedded, len_longest, padding="post")
padding_sent_test = pad_sequences(test_embedded, len_longest, padding="post")
raw_text = open(file, 'r').read()
raw_text = [line.strip() for line in raw_text.split('\n')]
raw_text = ' '.join(raw_text)
clean_text = re.sub("[^a-zA-Z]", " ", raw_text)
clean_text = clean_text.lower()
words = clean_text.split()

text_sequences = []
next_word = []
for i in range(0, len(words) - maxlen, step):
    text_sequences.append(' '.join(words[i:i + maxlen]))
    next_word.append(words[i + maxlen])
print('nb sequences:', len(text_sequences))

tokenizer = Tokenizer(lower=True, split=' ')
tokenizer.fit_on_texts(words)
print(tokenizer.word_counts)
print(tokenizer.word_index)
vocab_size = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(text_sequences)
X_train = np.array(train_sequences)
target = tokenizer.texts_to_sequences(next_word)
y_train = np_utils.to_categorical(target, vocab_size)


def loadGloveWordEmbeddings(glove_file):
    embedding_vectors = {}
    f = open(glove_file, encoding='utf8')
    for line in f:
        values = line.split()
Example No. 55
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.15
batch_size = 100
n_epoch = 1

path = r'C:\Users\zhyzhang\Desktop\News Samples\trainingandtestdata\training.1600000.processed.noemoticon.csv'
df = pd.read_csv(path,
                 index_col=None,
                 header=None,
                 engine='python',
                 encoding=None)
df = df.sample(frac=1).reset_index(drop=True)
news = list(df.iloc[:, 5])
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(news)
sequences = tokenizer.texts_to_sequences(news)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    try:
        embedding_vector = google_model[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        continue
Example No. 56
list_classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
y = train_df[list_classes].values
list_sentences_test = test_df["comment_text"].fillna("NA").values

comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))

test_comments = []
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(comments + test_comments)

sequences = tokenizer.texts_to_sequences(comments)
test_sequences = tokenizer.texts_to_sequences(test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

########################################
    def process_request(self, entityname: str, model_id: int):
        """
        Predicts intent or outcome for the entity
        :param entityname: Entity name
        :param model_id: Primary key ID of record to be inserted
        :return: Output value based on success or failure
        """
        table = str.maketrans('', '', string.punctuation.replace('&', ''))
        entityname = entityname.translate(table)
        print(f'Entity being searched: {entityname}')
        entityList = [entityname]
        entityArr = re.split(r'\W+', entityname)
        if len(entityArr) == 2:
            entityList.append(f'{entityArr[1]} {entityArr[0]}')

        print(entityList)
        er = EventRegistry(apiKey="f4a005ab-a24f-487e-bff4-f39b1b2ba6c2")
        cq = ComplexArticleQuery(query=CombinedQuery.AND([
            BaseQuery(
                keyword=QueryItems.OR(entityList),
                # sourceLocationUri=er.getLocationUri("United States"),
                lang="eng",
                dateStart=date.today() - timedelta(days=365),
                dateEnd=date.today()),
            BaseQuery(keyword=QueryItems.OR([
                "sanction", "bribery", "laundering", "corruption", "blacklist",
                "crime", "scam", "fraud"
            ]))
            # "drugs","trafficking","gambling","illegal","smuggling","terrorism",
            # "extortion","forgery","tax evasion","SDN","burglary","robbery","murder"]))
        ]))
        q = QueryArticles.initWithComplexQuery(cq)
        q.setRequestedResult(
            RequestArticlesInfo(page=1,
                                count=self.news_fetch_count,
                                sortBy="date",
                                sortByAsc=False,
                                returnInfo=ReturnInfo()))
        res = er.execQuery(q)

        # sql_db_path = cfg.read_config('sql_db_path')

        # Remove similar redundant news articles
        # article_list = []
        # match_list = []
        # for article1 in res['articles']['results']:
        #     similarity_flag = False
        #     for article2 in res['articles']['results']:
        #         val = SequenceMatcher(a=article1['body'], b=article2['body']).ratio()
        #         match_list.append(val)
        #         if article1 != article2 and val > 0.8:
        #             similarity_flag = True
        #             print(val)
        #     print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
        #     if not similarity_flag:
        #         article_list.append(article1)
        #
        # print(match_list)
        articles = []
        print(f"Number of articles found: {len(res['articles']['results'])}")
        for article in res['articles']['results']:
            content = article["body"]
            # print(content.encode("utf-8"))
            title = article["title"]
            url = article["url"]
            articleDateTime = article["dateTime"].replace('T', ' ', 1).replace(
                'Z', '', 1)
            # Replace name of the entity with word ENTITY
            if entityname.lower() in content.lower():
                re_replace = re.compile(re.escape(entityname), re.IGNORECASE)
                content = re_replace.sub('ENTITY', content)
                replaced_entityname = entityname
            if len(entityArr) == 2:
                # Check for the reversed "Last First" form as well
                entity_temp = f'{entityArr[1]} {entityArr[0]}'
                if entity_temp.lower() in content.lower():
                    re_replace = re.compile(re.escape(entity_temp),
                                            re.IGNORECASE)
                    content = re_replace.sub('ENTITY', content)
                    replaced_entityname = entity_temp
            if 'ENTITY' in content:
                lst = [title, content, articleDateTime, url]
                articles.append(lst)

        # Keep only the newest articles (up to self.news_fetch_count) that mention the entity
        X = []
        print(f'Articles remaining after entity filtering: {len(articles)}')
        for article in articles:
            content = article[1]
            articleDateTime = article[2]
            url = article[3]
            # print(url)
            # Insert article into database
            training_model = TrainingModel(ArticleText=content,
                                           TrainingDate=articleDateTime,
                                           SearchModel_id=str(model_id),
                                           IsTrained=0,
                                           Url=url)
            training_model.save()

            # print('---------------------------------Article Body---------------------------------')
            # print(content.encode("utf-8"))
            # print('---------------------------------Tokens---------------------------------')
            # tokens = self.unique_list(self.clean_article(content))
            tokens = self.clean_article(content)

            # Keep at most max_length tokens from the article
            if len(tokens) > self.max_length:
                tokens = tokens[:self.max_length]

            # Keep the sentences within `sentence_buffer` sentences of any sentence containing ENTITY
            sentences = sent_tokenize(" ".join(tokens))
            indices = [
                idx for idx, sent in enumerate(sentences) if 'ENTITY' in sent
            ]
            # print(indices)
            # Indices of sentences that lie within the buffer of an ENTITY sentence
            extended_indices = [
                i for i in range(len(sentences))
                if any(abs(index - i) <= self.sentence_buffer for index in indices)
            ]
            # print(extended_indices)
            # print(len(sentences))

            desired_list = [sentences[i] for i in extended_indices]

            token_sentence = " ".join(desired_list)
            # print(token_sentence)
            X.append(token_sentence)

        # Before prediction
        K.clear_session()

        if path.exists(f'{self.model_path}trained_model.h5'):
            if len(X) > 0:
                # Load the model
                model = load_model(f'{self.model_path}trained_model.h5')

                # prepare tokenizer
                t = Tokenizer()
                t.fit_on_texts(X)

                # integer encode the documents
                encoded_docs = t.texts_to_sequences(X)
                # print(encoded_docs)

                # pad documents to a max length of words
                # max_length = max([len(word.split()) for word in X])
                padded_docs = pad_sequences(encoded_docs,
                                            maxlen=self.max_length,
                                            padding='post')
                # print(padded_docs)

                # Predict on searched articles
                probabilities = model.predict(x=padded_docs,
                                              batch_size=5,
                                              verbose=2)
                # classes = model.predict_classes(x=padded_docs, batch_size=5, verbose=2)
                classes = binarize(probabilities, threshold=0.6)
                # print(classes)
                print(probabilities)

                prediction = []
                for idx in range(len(classes)):
                    prediction.append((int(classes[idx][0]),
                                       round(probabilities[idx][0] * 100, 2)))

                # Sort in descending order of probability
                prediction = sorted(prediction,
                                    key=lambda x: x[1],
                                    reverse=True)

                print(prediction)
                # After prediction
                K.clear_session()

                # Replace ENTITY word with original entity name
                for article in articles:
                    re_replace = re.compile(re.escape('ENTITY'), re.IGNORECASE)
                    article[1] = re_replace.sub(replaced_entityname,
                                                article[1])
                # print(articles)

                return self.OutputParams(True, prediction, articles)
            else:
                return self.OutputParams(False, "", "")
        else:
            raise Exception('Model is not yet ready to predict')
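
# The ±sentence_buffer extraction inside process_request can be expressed as a
# standalone helper; this is a sketch of the same idea, not part of the original class:
from nltk.tokenize import sent_tokenize

def entity_context(text, buffer=2):
    """Return the sentences that lie within `buffer` sentences of any 'ENTITY' mention."""
    sentences = sent_tokenize(text)
    hits = [i for i, s in enumerate(sentences) if 'ENTITY' in s]
    keep = [i for i in range(len(sentences))
            if any(abs(i - h) <= buffer for h in hits)]
    return " ".join(sentences[i] for i in keep)

# entity_context("ENTITY was fined. The case is ongoing. Unrelated news follows.", buffer=1)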
Ejemplo n.º 58
0
    df = pd.read_pickle("combined_data.pkl")
    # df = df[:5000]
    # df["text"] = df.text.str.join(" ")

    # Unpack column by column into a num_reviews-by-num_metacategories matrix again
    target_vecs = np.vstack(
        [df["cat_{}".format(i)] for i in range(num_metacategories)]).T

    max_words = 2000
    max_len = 10
    X = df.text
    Y = target_vecs
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.40)
    tok = Tokenizer(num_words=max_words)
    tok.fit_on_texts(X_train)
    sequences = tok.texts_to_sequences(X_train)
    sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)
    print(sorted(tok.word_counts.items(), key=lambda x: -x[1])[:max_words])

    model = get_model(max_words, max_len)

    model.fit(sequences_matrix,
              Y_train,
              batch_size=512,
              epochs=15,
              validation_split=0.2,
              callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])
    test_sequences = tok.texts_to_sequences(X_test)
    test_sequences_matrix = sequence.pad_sequences(test_sequences,
                                                   maxlen=max_len)
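
    # The snippet stops after building the test matrix; a natural next step (not in
    # the original code) is to score the held-out split with whatever metrics
    # get_model compiled:
    scores = model.evaluate(test_sequences_matrix, Y_test, batch_size=512, verbose=0)
    print('Test metrics:', scores)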
Ejemplo n.º 59
0
print("모델이 비속어 처리중...")
list_sentences_train = train["comment_text"].fillna(
    "_na_").values  # comment_text만 가져와서 fillna를 통해 nan를 거른다.
# Just import comment_text and filter nan through fillna.
list_classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]  # target label columns
y = train[list_classes].values  # labels of comment_text
list_sentences_test = test["comment_text"].fillna(
    "_na_").values  # Do the same for the test data

tokenizer = Tokenizer(
    num_words=max_features)  # Tokenizer restricted to the max_features most frequent words
tokenizer.fit_on_texts(list(list_sentences_train))  # fit the word index on the training comments
list_tokenized_train = tokenizer.texts_to_sequences(
    list_sentences_train)  # transform words into integer indices
list_tokenized_test = tokenizer.texts_to_sequences(
    list_sentences_test)  # transform words into integer indices

X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)  # pad the test sequences to a fixed length

from keras.models import load_model

model = load_model('toxic_model.h5')

# ## Predict result

y_test = model.predict([X_te], batch_size=1024,
                       verbose=1)  # run the test data through the model to get predictions
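
# A sketch of packaging the predictions; pandas and the 'id' column of `test`
# are assumptions not shown in the snippet above:
import pandas as pd

submission = pd.DataFrame(y_test, columns=list_classes)
if 'id' in test.columns:
    submission.insert(0, 'id', test['id'].values)
submission.to_csv('toxic_predictions.csv', index=False)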
Ejemplo n.º 60
0
# -*- coding: utf-8 -*-
"""
6.3 - Using Keras for word-level one-hot encoding

@author: migue
"""

from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Creates a tokenizer, configured to only take into account the 1,000 most
# common words
tokenizer = Tokenizer(num_words=1000)

# Builds the word index
tokenizer.fit_on_texts(samples)

# Turns strings into lists of integer indices
sequences = tokenizer.texts_to_sequences(samples)

# You could also directly get the one-hot binary representations.
# Vectorization modes other than one-hot encoding are supported by this
# tokenizer.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

# How you can recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
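
# The same fitted tokenizer also supports count-, frequency- and tf-idf-based
# document vectors; a quick look at the other modes mentioned above:
count_matrix = tokenizer.texts_to_matrix(samples, mode='count')
freq_matrix = tokenizer.texts_to_matrix(samples, mode='freq')
tfidf_matrix = tokenizer.texts_to_matrix(samples, mode='tfidf')
print(count_matrix.shape)  # (2, 1000): one row per sample, one column per word index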