Example #1
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from keras.preprocessing.text import Tokenizer


def embed_words(df, embed_dim, embedding_index):
    tokenizer = RegexpTokenizer(r'\w+')
    processed_comments = []
    cachedStopWords = stopwords.words("english")
    for comment in df['commenttext']:
        tokens = tokenizer.tokenize(comment)
        text = ' '.join(
            [word for word in tokens if word not in cachedStopWords])
        #text = tokens
        processed_comments.append(text)

    tokenizer = Tokenizer(num_words=None,
                          filters='!##$%&()*+',
                          lower=True,
                          split=' ')
    tokenizer.fit_on_texts(processed_comments)
    word_index = tokenizer.word_index

    # Prepare embedding matrix
    words_not_found = []
    nb_words = len(word_index) + 1
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embedding_index.get(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            embedding_matrix[i] = embedding_vector[:embed_dim]
        else:
            words_not_found.append(word)
    #print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return (embedding_matrix, nb_words, tokenizer)
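# Usage sketch (assumption, not part of the original snippet): the returned matrix is
# typically wired into a frozen Keras Embedding layer, as the CNN example further below does.
from keras.models import Sequential
from keras.layers import Embedding

def build_embedding_model(embedding_matrix, nb_words, embed_dim, max_seq_len):
    model = Sequential()
    model.add(Embedding(nb_words, embed_dim,
                        weights=[embedding_matrix],
                        input_length=max_seq_len,
                        trainable=False))  # keep the pretrained vectors fixed
    return model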
Example #2
def english_tokenizer(docs, MAX_NB_WORDS, max_seq_len):
    # set stop words
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

    # pre-processing train data
    print("pre-processing train data...")
    processed_docs = []
    for doc in docs:
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs.append(" ".join(filtered))

    # tokenizing input data
    print("tokenizing input data...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs)  # leaky
    word_seq = tokenizer.texts_to_sequences(processed_docs)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)

    return word_seq, word_index
def tokenize_input_data(processed_docs_train, processed_docs_test):
    print("tokenizing input data...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)  #leaky
    word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
    word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    return word_index, word_seq_train, word_seq_test
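# Non-leaky variant (sketch, not in the original): the "#leaky" comment above refers to
# fitting the Tokenizer on the train and test texts together; fitting on the training
# texts only, as mymain() further below does, avoids that leakage.
def tokenize_input_data_no_leak(processed_docs_train, processed_docs_test, max_nb_words):
    tokenizer = Tokenizer(num_words=max_nb_words, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train)  # fit on the training split only
    word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
    word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
    return tokenizer.word_index, word_seq_train, word_seq_test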
def preprocess(dataset, args, train=True):
    """

    :param dataset: pandas dataframe
    :param args: config vars dict
    :param train: if True serializes the tokenizer to use in test data, else deserializes
    :return: tuple of (x, y) array | x = (text,  set)
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

    X = dataset['text']
    X_set = dataset['set']
    y = dataset['score']

    #  preprocess - filter stopwords
    print("pre-processing input data...")

    raw_docs = X.tolist()
    processed_docs = []
    for doc in tqdm(raw_docs):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs.append(" ".join(filtered))

    # tokenize for keras
    print("Tokenizing input data...")
    path = args['save_folder'] + 'tokenizer.joblib'
    if train:
        tokenizer = Tokenizer(num_words=args['nb_words'], lower=True, char_level=False)
        tokenizer.fit_on_texts(processed_docs)
        word_index = tokenizer.word_index
        print("vocabulary size: ", len(word_index))

        # save tokenizer
        joblib.dump(tokenizer, path)
        print('Saved tokenizer')
    else:
        tokenizer = joblib.load(path)
        print('Restored tokenizer')


    word_seq = tokenizer.texts_to_sequences(processed_docs)
    word_seq = sequence.pad_sequences(word_seq, maxlen=args['max_seq_len'], padding='post',
                                      truncating='post')


    return [word_seq, X_set], y
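# Usage sketch (assumptions, not part of the original snippet): a hypothetical args dict
# and a tiny pandas DataFrame with 'text', 'set' and 'score' columns.
import pandas as pd

demo_args = {'save_folder': './', 'nb_words': 100000, 'max_seq_len': 40}
demo_df = pd.DataFrame({'text': ['great essay', 'weak essay'],
                        'set': [1, 1], 'score': [3, 1]})
x_train, y_train = preprocess(demo_df, demo_args, train=True)   # fits and saves the tokenizer
x_test, y_test = preprocess(demo_df, demo_args, train=False)    # reloads the saved tokenizer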
Example #5
def preprocess_data(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    MAX_NB_WORDS = 100000
    max_seq_len = 40
    print("pre-processing train data...")
    processed_docs = []
    print("tokenizing input data...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs)  # leaky
    word_seq = tokenizer.texts_to_sequences(processed_docs)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    # pad sequences
    word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)
    return word_seq, word_index
def tweetAnalysis(tweets, stop_words, tokenizer, embeddings_index,
                  MAX_NB_WORDS):
    test_df = tweets
    test_df = test_df.fillna('_NA_')
    label_names = [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]
    raw_docs_test = test_df['cleaned_tweet'].tolist()
    #     raw_docs_test = [tweets,]
    num_classes = len(label_names)
    tokenizer = RegexpTokenizer(r'\w+')
    processed_docs_test = []
    for doc in tqdm(raw_docs_test):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs_test.append(" ".join(filtered))

#end for

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_test)  #leaky
    word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
    word_index = tokenizer.word_index

    #pad sequences
    word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=168)

    #embedding matrix
    words_not_found = []
    embed_dim = 300  # assumed embedding dimensionality, matching the 300-d vectors used in the other examples
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    print('number of null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return word_seq_test
Example #7
def tokenize_input(df):
    # Feed a comment into a trained CNN to obtain its feature vector X in R^(m x 1)
    # at the fully connected layer; each feature x_i in X corresponds to one filter.
    tokenizer = RegexpTokenizer(r'\w+')
    processed_comments = []
    cachedStopWords = stopwords.words("english")
    for comment in df['commenttext']:
        tokens = tokenizer.tokenize(comment)
        text = ' '.join(
            [word for word in tokens if word not in cachedStopWords])
        processed_comments.append(text)
    tokenizer = Tokenizer(num_words=None,
                          filters='!##$%&()*+',
                          lower=True,
                          split=' ')
    tokenizer.fit_on_texts(processed_comments)
    X = tokenizer.texts_to_sequences(processed_comments)
    X = pad_sequences(X, 1000)
    # Create reverse word map
    word_index = tokenizer.word_index
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    return X, reverse_word_map
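# Hedged sketch (assumptions: a trained Keras CNN `cnn_model` whose fully connected
# layer is named 'fc'); one way to read out the feature vector X described in the
# comment above, with one activation per filter.
from keras.models import Model

def get_fc_features(cnn_model, X):
    feature_extractor = Model(inputs=cnn_model.input,
                              outputs=cnn_model.get_layer('fc').output)
    return feature_extractor.predict(X)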
Example #8
def my_form_post():
    text = request.form['text']
    processed_text = text.upper()
    # relies on module-level processed_docs_train, processed_docs_test, MAX_NB_WORDS, max_seq_len and model
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)
    s = processed_text
    arr = []
    arr.append(s)
    s = tokenizer.texts_to_sequences(arr)
    s = sequence.pad_sequences(s, maxlen=max_seq_len)
    res = model.predict(s)
    final = []
    for i in range(len(res)):
        for j in range(len(res[i])):
            final.append(str(int(res[i][j] * 100)) + "%")
    l = [[
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]]

    ans = pandas.DataFrame(final, l)
    var1 = ans.var()
    return str(ans)
processed_train_data = []
for doc in tqdm(raw_docs_train):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_train_data.append(" ".join(filtered))

print('pre-processing test data')
processed_test_data = []
for doc in tqdm(raw_docs_test):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_test_data.append(" ".join(filtered))

print("Tokenizing input data....")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_train_data + processed_test_data)
word_seq_train = tokenizer.texts_to_sequences(processed_train_data)
word_seq_test = tokenizer.texts_to_sequences(processed_test_data)
word_index = tokenizer.word_index
print("Dictionary Size: " + str(len(word_index)))

# Pad Sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)
batch_size = 256
num_epochs = 8

num_filters = 64
embed_dim = 300
weight_decay = 1e-4
words_not_found = []
Example #10
# generate the training and testing data
embeddings = get_word2vec_embeddings(word2vec, clean_text)

list_corpus = clean_text["text"].tolist()
list_labels = clean_text["term_selected"].tolist()

X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(
    embeddings, list_labels, test_size=0.2, random_state=40)

EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 46
VOCAB_SIZE = len(VOCAB)

VALIDATION_SPLIT = 0.2
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(clean_text["text"].tolist())
sequences = tokenizer.texts_to_sequences(clean_text["text"].tolist())

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

cnn_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = (np.asarray(clean_text["term_selected"]))

indices = np.arange(cnn_data.shape[0])
np.random.shuffle(indices)
cnn_data = cnn_data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * cnn_data.shape[0])

embedding_weights = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
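# Hedged continuation (not in the original snippet): fill embedding_weights from the
# word2vec vectors, falling back to random vectors for out-of-vocabulary words, in the
# same way the train_embedding_weights loop in a later example does.
for word, i in word_index.items():
    embedding_weights[i, :] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)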
Example #11
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string)  
    return string.strip() 

myTexts=[]
for each in X:
	myEach=clean_string(each)
	myTexts +=[myEach]
	
# tokenize texts into tokens 
tokenizer = Tokenizer(num_words=800)
tokenizer.fit_on_texts(myTexts)
sequences = tokenizer.texts_to_sequences(myTexts)
word_index = tokenizer.word_index
# trim the length of each sequence to the same length, I set 300.
data = pad_sequences(sequences, maxlen=300)
y = np.zeros((len(myTexts), 1))

for i in range(len(myTexts)):
	if i < 1000:
		y[i]=[True] # positive
	else:
		y[i]=[False] # negative

embedding_matrix = np.zeros((len(word_index) + 1, 50))
for word, i in word_index.items():
	embedding_vector = myDictionary.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector
Example #12
 maxlen = 150  # max number of words in a comment to use
 
 stop_words = set(stopwords.words('english'))     
  
 tokenizer = RegexpTokenizer(r'[a-zA-Z]+')   
 
 ####################################################
 # DATA PREPARATION
 ####################################################  
 
 X_train = train["comment_text"].fillna("fillna").values
 y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
 X_test = test["comment_text"].fillna("fillna").values
 
 tokenizer = text.Tokenizer(num_words=max_features)
 tokenizer.fit_on_texts(list(X_train) + list(X_test))
 X_train = tokenizer.texts_to_sequences(X_train)
 X_test = tokenizer.texts_to_sequences(X_test)
 x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
 x_test = sequence.pad_sequences(X_test, maxlen=maxlen)
 '''
 # TRAIN
 train["comment_text"].fillna('_NA_')
 train = standardize_text(train, "comment_text")
 train["tokens"] = train["comment_text"].apply(tokenizer.tokenize)
 # delete Stop Words
 train["tokens"] = train["tokens"].apply(lambda vec: [word for word in vec if word not in stop_words])
 # Normalize Bad Words    
 train["tokens"] = train["tokens"].apply(lambda vec: normalize_bad_word(vec, bad_words))
 #train.to_csv(base_path_output + 'train_normalized.csv', index=False)
 
Example #13
	print("Max sentence length is %s" % max(test_sentence_lengths))
	print("Min sentence length is %s" % min(test_sentence_lengths))
	print("Mean sentence length is %s" % mean(test_sentence_lengths))
	
	test["tokens"] = test["tokens"].apply(lambda vec :' '.join(vec))
	print("num test: ", test.shape[0])
	print(test.head())
	
	# Turn each comment into a list of word indexes of equal length (with truncation or padding as needed)
	list_sentences_train = train["tokens"].values
	list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
	y = train[list_classes].values
	list_sentences_test = test["tokens"].values
	
	tokenizer = Tokenizer(num_words=max_features)
	tokenizer.fit_on_texts(list(list_sentences_train) + list(list_sentences_test))
	list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
	list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
	X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
	X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)


	print(X_t.shape) #(159571, 150)
	
	# BUILD EMBEDDING MATRIX    
	print('Preparing embedding matrix...')
	# Read the FastText word vectors (space delimited strings) into a dictionary from word->vector
	embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))
	print("embeddings_index size: ", len(embeddings_index))

	
bottom_scores = sorted_contributions['Good']['detractors'][:10].tolist()

plot_important_words(top_scores, top_words, bottom_scores, bottom_words,
                     "Most important words for relevance")

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 150
VOCAB_SIZE = len(VOCAB)

VALIDATION_SPLIT = .3
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(data2["review-text"].tolist())
sequences = tokenizer.texts_to_sequences(data2["review-text"].tolist())

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

cnn_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(data2["IsGood"]))

indices = np.arange(cnn_data.shape[0])
np.random.shuffle(indices)
cnn_data = cnn_data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * cnn_data.shape[0])

embedding_weights = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
Example #15
print("pre-processing statuses...")
processed_data_train = []
processed_data_test = []

for data in tqdm(raw_data_train):
    tokens = tokenizer.tokenize(data)
    filtered = [word for word in tokens if word not in stop_words]
    processed_data_train.append(" ".join(filtered))
for data in tqdm(raw_data_test):
    tokens = tokenizer.tokenize(data)
    filtered = [word for word in tokens if word not in stop_words]
    processed_data_test.append(" ".join(filtered))

print("tokenizing input data...")
tokenizer = Tokenizer(num_words=Max_No_Words, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_data_train + processed_data_test)
word_seq_train = tokenizer.texts_to_sequences(processed_data_train)
word_seq_test = tokenizer.texts_to_sequences(processed_data_test)
word_index = tokenizer.word_index
print("Dictionary size = ", len(word_index))

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=Max_Sent_Len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=Max_Sent_Len)

print("Done !!")
''' Embedding Words '''
embed_dim = 300

print('loading and processing word embeddings...')
EMBEDDING_FILE = 'wiki-news-300d-1M.vec'
# word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = max(sentence_lengths) + 1
VOCAB_SIZE = len(VOCAB)
NUM_CLASSES = 2

label2emotion = {0: 'humor', 1: 'NotHumor'}
emotion2label = {'humor': 0, 'NotHumor': 1}

# label2emotion = {0:"sad", 1:"disgust", 2: "fear", 3:"angry", 4:"surprise", 5:"joy"}
# emotion2label = {"sad":0, "disgust":1, "fear":2, "angry":3, "surprise":4, "joy":5}

VALIDATION_SPLIT = .2
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(clean_questions["text"].astype(str).tolist())
# tokenizer.fit_on_texts(trial_data["comment_text"].astype(str).tolist())
# tokenizer.fit_on_texts(test_data["comment_text"].astype(str).tolist())
sequences_train = tokenizer.texts_to_sequences(
    clean_questions["text"].astype(str).tolist())  #转换成列表
# sequences_trial = tokenizer.texts_to_sequences(trial_data["comment_text"].astype(str).tolist())
# sequences_test = tokenizer.texts_to_sequences(test_data["comment_text"].astype(str).tolist())

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))  # vocabulary size found by the tokenizer

cnn_data = pad_sequences(sequences_train,
                         maxlen=MAX_SEQUENCE_LENGTH)  # Keras only accepts input sequences of equal length,
# so when the sequences have uneven lengths, pad_sequences() is used:
# it converts them into new, padded sequences of the same length.
# cnn_data_trial = pad_sequences(sequences_trial, maxlen=MAX_SEQUENCE_LENGTH)
def mymain():
    """CONFIGURATION"""

    # consts

    DATA_DIR = "C:\\Users\\T149900\\ml_mercari\\"
    WORD_COUNT_MEAN_PLUSS_STD = 1
    WORD_COUNT_MEAN_THIRD = 2

    num_words = 100000

    batch_size = 256
    num_epochs = 8

    num_splits = 5

    embed_dim = 300

    word_count_strategy = WORD_COUNT_MEAN_PLUSS_STD
    word_database = "toxic\\wiki.simple.vec"

    # CV accuracy is 0.987488975854 +/- 0.00889961690531
    # CV accuracy is 0.988607054794 +/- 0.00992068287633

    sns.set_style("whitegrid")
    np.random.seed(0)

    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stop_words.update(
        ['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

    embeddings_index = get_word_embeddings(DATA_DIR + word_database)

    train_df = pd.read_csv(DATA_DIR + "toxic\\train.csv")

    print("num train: ", train_df.shape[0])

    label_names = [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]

    y_train = train_df[label_names].values

    print(y_train.shape)

    train_df['doc_len'] = train_df['comment_text'].apply(
        lambda words: len(words.split(" ")))

    max_seq_len = 0

    if word_count_strategy == WORD_COUNT_MEAN_PLUSS_STD:
        max_seq_len = np.round(train_df['doc_len'].mean() +
                               train_df['doc_len'].std()).astype(int)
    elif word_count_strategy == WORD_COUNT_MEAN_THIRD:
        max_seq_len = np.round(train_df['doc_len'].mean() / 3.0).astype(int)

    assert (max_seq_len > 0)

    raw_docs_train = train_df['comment_text'].tolist()

    num_classes = len(label_names)

    print("pre-processing train data...")
    processed_docs_train = []
    for doc in tqdm(raw_docs_train):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs_train.append(" ".join(filtered))
    """end for"""

    tokenizer = keras.preprocessing.text.Tokenizer(num_words=num_words,
                                                   lower=True,
                                                   char_level=False)

    tokenizer.fit_on_texts(processed_docs_train)  #non-leaky

    word_index = tokenizer.word_index

    print("dictionary size: ", len(word_index))

    word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)

    word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)

    embedding_matrix = get_embedding_matrix(embeddings_index, num_words)

    model = get_model(num_words, max_seq_len, embedding_matrix, embed_dim,
                      num_classes)

    d = keras_CV(model, word_seq_train, y_train, num_splits, num_epochs,
                 batch_size)

    print("CV accuracy is " + str(d['score']) + " +/- " + str(d['std']))
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, generate_missing=generate_missing))
    return list(embeddings)

training_embeddings = get_word2vec_embeddings(word2vec, clean_train_comments, generate_missing=True)
test_embeddings = get_word2vec_embeddings(word2vec, clean_test_comments, generate_missing=True)

EMBEDDING_DIM = 300 # how big is each word vector
MAX_VOCAB_SIZE = 175303 # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH = 200 # max number of words in a comment to use

#training params
batch_size = 256 
num_epochs = 2

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(clean_train_comments["comment_text"].tolist())
training_sequences = tokenizer.texts_to_sequences(clean_train_comments["comment_text"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

test_sequences = tokenizer.texts_to_sequences(clean_test_comments["comment_text"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
Example #19
def load_data(train_file,test_file):
    clean_questions = pd.read_csv(train_file)
    clean_test = pd.read_csv(test_file)

    tokenizer = RegexpTokenizer(r'\w+')

    clean_questions["tokens"] = clean_questions['comment_text'].astype(str).apply(tokenizer.tokenize)
    clean_test["tokens"] = clean_test['comment_text'].astype(str).apply(tokenizer.tokenize)

    all_words = [word for tokens in clean_questions["tokens"] for word in tokens]
    print(all_words[-1])

    sentence_lengths = [len(tokens) for tokens in clean_questions["tokens"]]
    VOCAB = sorted(list(set(all_words)))
    print("%s words total,with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
    print("Max sentence length is %s" % max(sentence_lengths))
    max_sequence_length = max(sentence_lengths) + 1
    num_words = len(VOCAB)
    VALIDATION_SPLIT = .2

    tokenizer = Tokenizer(num_words)
    tokenizer.fit_on_texts(clean_questions['comment_text'].astype(str).tolist())
    tokenizer.fit_on_texts(clean_test['comment_text'].astype(str).tolist())

    sequences_train = tokenizer.texts_to_sequences(clean_questions['comment_text'].astype(str).tolist())
    sequences_test = tokenizer.texts_to_sequences(clean_test['comment_text'].astype(str).tolist())

    train_data = pad_sequences(sequences_train, maxlen=max_sequence_length)
    test_data = pad_sequences(sequences_test, maxlen=max_sequence_length)

    clean_questions['to_task1'] = [t1[w] for w in clean_questions['task_1']]
    clean_questions['to_task2'] = [t2[w] for w in clean_questions['task_2']]
    train_labelsA = to_categorical(np.array(clean_questions['to_task1']))
    train_labelsB = to_categorical(np.array(clean_questions['to_task2']), 4)
    from collections import Counter
    print('train_labelsB',Counter(clean_questions['to_task1']))
    print('train_labelsB',Counter(clean_questions['to_task2']))

    clean_test['to_task1'] = [t1[w] for w in clean_test['task_1']]
    clean_test['to_task2'] = [t2[w] for w in clean_test['task_2']]
    test_labelsA = to_categorical(np.array(clean_test['to_task1']))
    test_labelsB = to_categorical(np.array(clean_test['to_task2']), 4)



    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    indices = np.arange(train_data.shape[0])
    np.random.shuffle(indices)
    train_data = train_data[indices]
    train_labelsA = train_labelsA[indices]
    train_labelsB = train_labelsB[indices]
    # num_validation_samples = int(VALIDATION_SPLIT * cnn_data.shape[0])


    train_x,train_ya,train_yb=[],[],[]
    val_x,val_ya,val_yb=[],[],[]
    for i in range(len(train_data)):
        if i % split == 0:
            val_x.append(train_data[i])
            val_ya.append(train_labelsA[i])
            val_yb.append(train_labelsB[i])
        else:
            train_x.append(train_data[i])
            train_ya.append(train_labelsA[i])
            train_yb.append(train_labelsB[i])

    data = [[train_data,train_labelsA,train_labelsB], [test_data, test_labelsA, test_labelsB, clean_test['task_3'], clean_test['id']]]
    # data = [[train_x,train_ya,train_yb], [val_x,val_ya,val_yb],[test_data, teat_labelsA, teat_labelsB, clean_test['task_3'], clean_test['id']]]
    return data, word_index,num_words,max_sequence_length
Example #20
print("%s words total,with a vocabulary size of %s" %
      (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

word2vec_path = "/home/baiyang/baiyang/code/crawl-300d-2M.vec"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path,
                                                           binary=False)

EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = max(sentence_lengths) + 1
VOCAB_SIZE = len(VOCAB)

VALIDATION_SPLIT = 0.2
############## tokenization starts here
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(
    (clean_questions["sent0"] + '<eos>' + clean_questions["sent1"]).tolist())
tokenizer.fit_on_texts(
    (test_data["sent0"] + '<eos>' + test_data["sent1"]).tolist())
sequences_train = tokenizer.texts_to_sequences(
    (clean_questions["sent0"] + '<eos>' + clean_questions["sent1"]).tolist())
sequences_test = tokenizer.texts_to_sequences(
    (test_data["sent0"] + '<eos>' + test_data["sent1"]).tolist())

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

cnn_data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
cnn_data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(clean_questions["label"]))
##
test_labels = test_data["label"]
Example #21
## **1. Convert NLTK tokens with the Keras Tokenizer**
"""

import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

X = X.apply(lambda el: " ".join(el))

X = X.values

X = np.array(X, dtype='O')
  
tokenizer = Tokenizer(num_words=5000, oov_token='oov')
tokenizer.fit_on_texts(X) 
  
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=600, padding='post', truncating='post')

print(X[0])

# y = pd.get_dummies(y)
y = np.array(pd.get_dummies(y).values, dtype='O')

# Split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=63)

print(X[0])
df['doc_len'] = df['tweet'].apply(
    lambda words: len(words.split(" ")))  #length of each tweet

max_seq_len = np.round(df['doc_len'].mean() + df['doc_len'].std()).astype(int)

raw_docs = df['tweet'].tolist()

#processed docs generator
processed_docs = []
for doc in tqdm(raw_docs):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs.append(" ".join(filtered))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs)
word_seq = tokenizer.texts_to_sequences(
    processed_docs
)  #each tweet gets tokenized---word_seq is a list of tokenized tweets(length = 3059)

word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)

word_index = tokenizer.word_index  #dictionary of words in tweets and their associated id
inverted_word_index = dict(
    (v, k) for k, v in word_index.items()
)  #used for deriving a word from an index (used to deal with padded sequences)

embed_dim = 300

model_bin = "cc.en.300.bin"  #fasttext model
model = fasttext.load_model(model_bin)  #loading fasttext model
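# Hedged continuation (assumption, not in the original snippet): build an embedding
# matrix from the loaded fastText model; fastText composes sub-word vectors, so it can
# return a vector for any token in word_index.
embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
for word, i in word_index.items():
    embedding_matrix[i] = model.get_word_vector(word)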
Example #23
for doc in tqdm(raw_docs_train):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_train.append(" ".join(filtered))
#end for

processed_docs_test = []
for doc in tqdm(raw_docs_test):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_test.append(" ".join(filtered))
#end for

print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)  #leaky
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

#training params
batch_size = 128
num_epochs = 2
embed_dim = 300

#embedding matrix
Example #24
        lambda words: len(words.split(" ")))
    max_seq_len = np.round(train_df['doc_len'].mean() +
                           train_df['doc_len'].std()).astype(int)

    processed_comments_train = preprocess_df(train_df,
                                             tokenizer=tokenizer,
                                             stop_words=stop_words)
    processed_comments_val = preprocess_df(val_df,
                                           tokenizer=tokenizer,
                                           stop_words=stop_words)
    processed_comments_test = preprocess_df(test_df,
                                            tokenizer=tokenizer,
                                            stop_words=stop_words)

    tokenizer = Tokenizer(num_words=max_nb_words, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_comments_train + processed_comments_val +
                           processed_comments_test)
    X_train = tokenizer.texts_to_sequences(processed_comments_train)
    X_val = tokenizer.texts_to_sequences(processed_comments_val)
    X_test = tokenizer.texts_to_sequences(processed_comments_test)
    word_index = tokenizer.word_index

    #pad sequences
    X_train = sequence.pad_sequences(X_train, maxlen=max_seq_len)
    print(X_train)
    print(X_train.shape)
    X_val = sequence.pad_sequences(X_val, maxlen=max_seq_len)
    X_test = sequence.pad_sequences(X_test, maxlen=max_seq_len)

    # TODO move everything to config file
    #training params
    batch_size = 256
Example #25
def load_data_f(train_file, test_file):
    clean_questions = pd.read_csv(train_file)
    clean_test = pd.read_csv(test_file)

    tokenizer = RegexpTokenizer(r'\w+')

    clean_questions["tokens"] = clean_questions['comment_text'].astype(
        str).apply(tokenizer.tokenize)
    clean_test["tokens"] = clean_test['comment_text'].astype(str).apply(
        tokenizer.tokenize)

    all_words = [
        word for tokens in clean_questions["tokens"] for word in tokens
    ]
    print(all_words[-1])

    sentence_lengths = [len(tokens) for tokens in clean_questions["tokens"]]
    VOCAB = sorted(list(set(all_words)))
    print("%s words total,with a vocabulary size of %s" %
          (len(all_words), len(VOCAB)))
    print("Max sentence length is %s" % max(sentence_lengths))
    max_sequence_length = max(sentence_lengths) + 1
    num_words = len(VOCAB)

    tokenizer = Tokenizer(num_words)
    tokenizer.fit_on_texts(
        clean_questions['comment_text'].astype(str).tolist())
    tokenizer.fit_on_texts(clean_test['comment_text'].astype(str).tolist())

    sequences_train = tokenizer.texts_to_sequences(
        clean_questions['comment_text'].astype(str).tolist())
    sequences_test = tokenizer.texts_to_sequences(
        clean_test['comment_text'].astype(str).tolist())

    train_data = pad_sequences(sequences_train, maxlen=max_sequence_length)
    test_data = pad_sequences(sequences_test, maxlen=max_sequence_length)

    clean_questions['to_task1'] = [t1[w] for w in clean_questions['task_1']]
    clean_questions['to_task2'] = [t2[w] for w in clean_questions['task_2']]
    train_labelsA = to_categorical(np.array(clean_questions['to_task1']))
    train_labelsB = to_categorical(np.array(clean_questions['to_task2']), 4)

    clean_test['to_task1'] = [t1[w] for w in clean_test['task_1']]
    clean_test['to_task2'] = [t2[w] for w in clean_test['task_2']]
    test_labelsA = to_categorical(np.array(clean_test['to_task1']))
    test_labelsB = to_categorical(np.array(clean_test['to_task2']), 4)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    indices = np.arange(train_data.shape[0])
    np.random.shuffle(indices)
    train_data = train_data[indices]
    train_labelsA = train_labelsA[indices]
    train_labelsB = train_labelsB[indices]

    data = [[train_data, train_labelsA, train_labelsB],
            [
                test_data, test_labelsA, test_labelsB, clean_test['task_3'],
                clean_test['id']
            ]]
    return data, word_index, max_sequence_length
"""## Neural Networks"""

# tokenize to sequence, padding
# get list of all words
total_word = []
max_len = 0
for i in X_train["words_s"].tolist():
    max_len = max(max_len, len(i))
    for j in i:
        total_word.append(j)
total_word = list(set(total_word))

# tokenize to sequences
tokenizer = Tokenizer(num_words = len(total_word))
tokenizer.fit_on_texts(X_train["processed_s"])
train_sequences = tokenizer.texts_to_sequences(X_train["processed_s"])
test_sequences = tokenizer.texts_to_sequences(X_test["processed_s"])

# padding
padded_train_s = pad_sequences(train_sequences, maxlen = max_len, padding = 'post', truncating = 'post')
padded_test_s = pad_sequences(test_sequences, maxlen = max_len, padding = 'post', truncating = 'post') 
print("The padded encoding for document\n",X_train["processed_s"][0],"\n is : ",padded_train_s[0])

"""RNN"""

# Sequential Model
model = Sequential()
model.add(Embedding(len(total_word), output_dim = 150))
model.add(Bidirectional(SimpleRNN(150)))
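# Hedged completion (assumptions: a binary target and that Dense is imported alongside
# Sequential, Embedding, Bidirectional and SimpleRNN); the original snippet stops
# before the output layer and compile step.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])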
EMBEDDINGS_PATH = './Models/SBW-vectors-300-min5.bin'
embedding_model = KeyedVectors.load_word2vec_format(EMBEDDINGS_PATH,
                                                    binary=True)

# Obtaining training and test data #
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
max_len = 300
for train_index, test_index in kf.split(texts, labels):
    print("{}-fold".format(fold))
    texts_train = [texts[index] for index in train_index]
    texts_test = [texts[index] for index in test_index]
    y = pd.get_dummies(labels).values
    y_train, y_test = y[train_index], y[test_index]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts_train)
    x_train = tokenizer.texts_to_sequences(texts_train)
    x_test = tokenizer.texts_to_sequences(texts_test)
    x_train = pad_sequences(x_train, maxlen=max_len)
    x_test = pad_sequences(x_test, maxlen=max_len)

    #Saving word2vec weights#
    input_dim = len(tokenizer.word_index)
    embedding_matrix = np.zeros((input_dim + 1, 300))
    for word, i in tokenizer.word_index.items():
        if word in embedding_model:
            embedding_matrix[i] = embedding_model[word]
    np.savez_compressed(
        './Data/K-Folds/{}-fold/embedding_matrix_baseline.npz'.format(fold),
        embedding_matrix,
    )
Example #28
        x, vectors, generate_missing=generate_missing))
    return list(embeddings)


training_embeddings = get_word2vec_embeddings(word2vec,
                                              train_comments,
                                              generate_missing=True)
test_embeddings = get_word2vec_embeddings(word2vec,
                                          test_comments,
                                          generate_missing=True)

MAX_VOCAB_SIZE = 175303
embedding_vecor_length = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(train_comments["comment_message"].tolist())
#Transform each sentence into a sequence of integers
training_sequences = tokenizer.texts_to_sequences(
    train_comments["comment_message"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

from keras.preprocessing.sequence import pad_sequences

#Pad or truncate each sequence to max_review_length
train_cnn_data = pad_sequences(training_sequences, maxlen=max_review_length)

train_embedding_weights = np.zeros(
    (len(train_word_index) + 1, embedding_vecor_length))
for word, index in train_word_index.items():
Example #29
def train(address):

    #DATA_PATH = '/Users/wangergou/Downloads/kaggle/Toxic_Comment_Classification/CNN_Crawl/data/'
    #EMBEDDING_DIR = '/Users/wangergou/Downloads/kaggle/Toxic_Comment_Classification/CNN_Crawl/data/'

    DATA_PATH = address
    EMBEDDING_DIR = address

    MAX_NB_WORDS = 100000
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stop_words.update(
        ['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

    print('loading word embeddings...')

    embeddings_index = {}

    f = codecs.open(EMBEDDING_DIR + 'crawl-300d-2M.vec', encoding='utf-8')
    for line in tqdm(f):
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('found %s word vectors' % len(embeddings_index))

    print("loading data...")

    train_df = pd.read_csv(DATA_PATH + 'train.csv', sep=',', header=0)
    test_df = pd.read_csv(DATA_PATH + 'test.csv', sep=',', header=0)
    test_df = test_df.fillna('_NA_')

    print("num train: ", train_df.shape[0])
    print("num test: ", test_df.shape[0])

    label_names = [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]
    y_train = train_df[label_names].values

    raw_docs_train = train_df['comment_text'].tolist()
    raw_docs_test = test_df['comment_text'].tolist()
    num_classes = len(label_names)

    train_df['doc_len'] = train_df['comment_text'].apply(
        lambda words: len(words.split(" ")))
    max_seq_len = np.round(train_df['doc_len'].mean() +
                           train_df['doc_len'].std()).astype(int)

    print("pre-processing train data...")
    processed_docs_train = []
    for doc in tqdm(raw_docs_train):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs_train.append(" ".join(filtered))
    #end for

    processed_docs_test = []
    for doc in tqdm(raw_docs_test):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs_test.append(" ".join(filtered))
    #end for

    print("tokenizing input data...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)  #leaky
    word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
    word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    #pad sequences
    word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
    word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

    #training params
    batch_size = 256
    num_epochs = 8

    #model parameters
    num_filters = 512
    embed_dim = 300
    weight_decay = 1e-4

    #embedding matrix
    print('preparing embedding matrix...')
    words_not_found = []
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    print('number of null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))

    #CNN architecture
    print("training CNN ...")
    model = Sequential()
    model.add(
        Embedding(nb_words,
                  embed_dim,
                  weights=[embedding_matrix],
                  input_length=max_seq_len,
                  trainable=False))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(
        Dense(32,
              activation='relu',
              kernel_regularizer=regularizers.l2(weight_decay)))
    model.add(Dense(num_classes,
                    activation='sigmoid'))  #multi-label (k-hot encoding)

    adam = optimizers.Adam(lr=0.001,
                           beta_1=0.9,
                           beta_2=0.999,
                           epsilon=1e-08,
                           decay=0.0)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    model.summary()

    #define callbacks
    early_stopping = EarlyStopping(monitor='val_loss',
                                   min_delta=0.01,
                                   patience=4,
                                   verbose=1)
    callbacks_list = [early_stopping]

    #model training
    hist = model.fit(word_seq_train,
                     y_train,
                     batch_size=batch_size,
                     epochs=num_epochs,
                     callbacks=callbacks_list,
                     validation_split=0.1,
                     shuffle=True,
                     verbose=2)

    y_test = model.predict(word_seq_test)

    #create a submission
    submission_df = pd.DataFrame(columns=['id'] + label_names)
    submission_df['id'] = test_df['id'].values
    submission_df[label_names] = y_test
    submission_df.to_csv(address + "cnn_fasttext_submission_512.csv",
                         index=False)
Example #30
MAX_SEQUENCE_LENGTH = 32
MAX_NUM_WORDS = 20000 #max token 19361 something
EMBEDDING_DIM = 300

print("MAXIMUM")
print(max([len(X_train_pre["headline"][i].strip().split(" ")) for i in range(len(X_train_pre))]))
print(max([len(X_valid_pre["headline"][i].strip().split(" ")) for i in range(len(X_valid_pre))]))
print(max([len(X_test_pre["headline"][i].strip().split(" ")) for i in range(len(X_test_pre))]))

#print(X_test_pre["headline"][2])


# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(X_train_pre['headline'].values)
X_train = tokenizer.texts_to_sequences(X_train_pre['headline'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train data tensor:', X_train.shape)
#print("X_train[0] &&&&&&&&&&&&&&&&&&&&&&&& :",X_train[0])
# print(X_train[1])

X_test = tokenizer.texts_to_sequences(X_test_pre['headline'].values) # Every word got a new number
# print(X_test[1])
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test data tensor:', X_test.shape)
# print(X_test[1])

X_valid = tokenizer.texts_to_sequences(X_valid_pre['headline'].values) # Every word got a new number
Example #31
    tupleIndex += 1
    words = tuple[0].split()
    wordsList = []
    for word in words:
        wordNF = tokenizeWord(morph.parse(word)[0].normal_form)
        uniqWords.append(wordNF)
        wordsList.append(wordNF)
    testSentences.append(wordsList)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

print(len(uniqWords))

tokenizer = Tokenizer(num_words=len(uniqWords))
tokenizer.fit_on_texts(sentences)

vocab_size = len(tokenizer.word_index) + 1
maxlen = 1000

X_train = tokenizer.texts_to_sequences(sentences)
X_test = tokenizer.texts_to_sequences(testSentences)

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(vocab_size)

from keras.utils import to_categorical
y_train = df["label"]
y_test = test["label"]