n=0
colors = cm.get_cmap('tab20')

means = []
comps = []


whitegames = []

for game in gamelist:
    #white = ' '.join(i for i in game.split(' ')[:openinglength*2:2])
    white = ' '.join('white' + i.replace('x','') if j%2==0 else 'black' + i.replace('x','') for j,i in enumerate(game.split(' ')[:40]))
  
    whitegames.append(white)
        
tokeniser = text.Tokenizer(filters='!"#$%&()*+,./:;<>?@[\\]^_`{|}~\t\n')
tokeniser.fit_on_texts(whitegames)

labels = y

idx_word = tokeniser.index_word
idx_word[0] = ''

for openinglength in range(8,42,2):
    n+=1
    whitegames = []
    
    for game in gamelist:
    #white = ' '.join(i for i in game.split(' ')[:openinglength*2:2])
        white = ' '.join('white' + i.replace('x','') if j%2==0 else 'black' + i.replace('x','') for j,i in enumerate(game.split(' ')[:openinglength]))
  
Example #2
max_features = 20000
maxlen = 100

train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
train = train.sample(frac=1)

list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)


def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
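    # The snippet is cut off here; a minimal sketch of how such a model is
    # usually finished for the six labels above (the sigmoid output, binary
    # cross-entropy and the Model import are assumptions, not part of the
    # original snippet):
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    return model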
Example #3
def text_preprocess():
    print('read data...')
    train_ori, test_ori = _read_data('train.pd', 'test.pd')

    print('remove patterns...')
    train_ori['text_list'] = train_ori['text_list'].apply(
        lambda list: _remove_pattern_2(list))
    test_ori['text_list'] = test_ori['text_list'].apply(
        lambda list: _remove_pattern_2(list))

    print('shuffle...')
    # train_ori.sample(frac=1).reset_index(drop=True)
    # test_ori.sample(frac=1).reset_index(drop=True)
    train_ori = train_ori.iloc[np.random.permutation(
        len(train_ori))]  # manual shuffle
    test_ori = test_ori.iloc[np.random.permutation(len(test_ori))]

    print('join text list...')
    train_text = train_ori['text_list'].apply(lambda list: " ".join(list))
    test_text = test_ori['text_list'].apply(lambda list: " ".join(list))
    # train_text = train_ori['text_list']
    # test_text = test_ori['text_list']

    print('prepare labels...')
    Y_train = train_ori['label'].apply(lambda gender: 1
                                       if gender == 'male' else 0)
    Y_test = test_ori['label'].apply(lambda gender: 1
                                     if gender == 'male' else 0)

    print('prepare tokenizer')
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)  # maximum vocabulary size
    tokenizer.fit_on_texts(
        list(train_text) + list(test_text)
    )  #Updates internal vocabulary based on a list of texts.

    # train_seq = tokenizer.texts_to_sequences(train_text)#Transforms each text in texts in a sequence of integers
    # test_seq = tokenizer.texts_to_sequences(test_text)
    #
    # X_train = sequence.pad_sequences(train_seq, maxlen=MAXLEN)#Pads each sequence to the same length (length of the longest sequence)
    # X_test = sequence.pad_sequences(test_seq, maxlen=MAXLEN)
    # print(X_train.shape)
    # print(X_test.shape)

    print('texts to sequences...')
    train_ori['seq'] = train_ori['text_list'].apply(
        lambda list: tokenizer.texts_to_sequences(list))
    test_ori['seq'] = test_ori['text_list'].apply(
        lambda list: tokenizer.texts_to_sequences(list))

    print('pad sequences...')
    train_ori['seq'] = train_ori['seq'].apply(
        lambda list: sequence.pad_sequences(list, maxlen=MAXLEN))
    test_ori['seq'] = test_ori['seq'].apply(
        lambda list: sequence.pad_sequences(list, maxlen=MAXLEN))

    # X_train = sequence.pad_sequences(train_ori['seq'], maxlen=MAXLEN)#Pads each sequence to the same length (length of the longest sequence)
    # X_test = sequence.pad_sequences(test_ori['seq'], maxlen=MAXLEN)

    print('fit to numpy...')
    X_train = np.array(list(train_ori['seq']))
    X_test = np.array(list(test_ori['seq']))

    print(X_train.shape)
    print(X_test.shape)

    return X_train, Y_train, X_test, Y_test, tokenizer

Example #4
x_train = generate_new_comments(comments_train)
y_train = traindata['target']
y_train = np.rint(np.array(y_train))
y_aux_train = traindata[[
    'target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult',
    'threat'
]]
x_test = generate_new_comments(comments_test)
y_test = testdata['toxicity']
y_test = np.rint(np.array(y_test))

# Tokenize sentences and index the words, padding each sequence to the same length (220)
MAX_LEN = 220
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(x_train + x_test)

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)
print(len(tokenizer.word_index))


# build Glove dictionary
def load_emb_data(emb_path):
    emb = {}
    with open(emb_path, 'r', encoding='UTF-8') as fin:
        while True:
            line = fin.readline()
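            # The reader is cut off here; a hedged continuation (an assumed
            # sketch, not the original code) parses each line into a word and
            # its vector and stops at end of file:
            if not line:
                break
            values = line.rstrip().split(' ')
            emb[values[0]] = np.asarray(values[1:], dtype='float32')
    return emb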
Example #5
    "review/time", "review/summary", "review/text"
]
df = pd.read_csv("data/finemuged.csv",
                 encoding='latin1',
                 header=None,
                 names=colnames,
                 quotechar="\"").sample(100000)


def one_hot(x, maxi=5):
    arr = [0] * maxi
    arr[x - 1] = 1
    return arr


t = text.Tokenizer(10000)
X = df["review/text"].values
t.fit_on_texts(X)
X = t.texts_to_sequences(X)
X = sequence.pad_sequences(X, value=0, padding='post', maxlen=256)
y = df["review/score"].astype(int).values.reshape(-1) - 1
y = np.eye(5)[y]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)
# exit(0)

sequence_length = 256
vocabulary_size = 10000
embedding_dim = 300
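# The example stops at these hyper-parameters; a minimal sketch of the kind of
# model they would typically feed (the layer choice and imports here are
# assumptions, not the original code):
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense

model = Sequential([
    Embedding(vocabulary_size, embedding_dim, input_length=sequence_length),
    GlobalAveragePooling1D(),
    Dense(5, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])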
Example #6
def preprocess_data(max_features=100000, maxlen=200, embed_size=300):
    #load and clean data
    train = pd.read_csv(TRAIN_FILE)
    train = train.drop(['id'], axis=1)
    train['tags'] = train['tags'].astype(str)
    train['article'] = train['article'].str.replace(
        '</p>|<p>|\r|\n|<br>|</p>|<pre>|</pre>|<code>|</code>', '')
    train['combined'] = train['title'] + ' ' + train['article']
    train.drop(['title', 'article'], axis=1, inplace=True)
    lst = [x.split(',') for x in train['tags'].str.replace('|', ',').tolist()]

    #one hot encode label (multi)
    mlb = MultiLabelBinarizer(sparse_output=True)
    y = mlb.fit_transform(lst)
    with open('multi_label_binarizer.pickle', 'wb') as handle:
        pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("binarizer saved")
    del lst
    test = pd.read_csv(TEST_FILE)

    test = test.drop(['id'], axis=1)
    test['article'] = test['article'].str.replace(
        '</p>|<p>|\r|\n|<br>|</p>|<pre>|</pre>|<code>|</code>', '')

    test['combined'] = test['title'] + ' ' + test['article']

    test.drop(['title', 'article'], axis=1, inplace=True)
    X_train = train["combined"].fillna("fillna").values
    y_train = y  #train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
    del y
    X_test = test["combined"].fillna("fillna").values
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("tokenizer saved")
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    gc.collect()
    del X_train, X_test

    embeddings_index = dict(
        get_coefs(*o.rstrip().rsplit(' '))
        for o in open(EMBEDDING_FILE, encoding="utf8"))

    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    del all_embs
    gc.collect()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std,
                                        (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    print('preprocessing done')

    return embedding_matrix, x_train, y_train, x_test, mlb
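# get_coefs is referenced above but not defined in this snippet; in kernels of
# this style it is usually a one-liner along these lines (an assumption, shown
# for completeness):
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')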
Example #7
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import Merge
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text
from keras.layers import SpatialDropout1D

data = pd.read_csv('train.csv')
y = data.is_duplicate.values

tk = text.Tokenizer(num_words=200000)

max_len = 40
tk.fit_on_texts(
    list(data.question1.values) + list(data.question2.values.astype(str)))
x1 = tk.texts_to_sequences(data.question1.values)
x1 = sequence.pad_sequences(x1, maxlen=max_len)

x2 = tk.texts_to_sequences(data.question2.values.astype(str))
x2 = sequence.pad_sequences(x2, maxlen=max_len)

# Dividing the datasets into train and test splits
from sklearn.model_selection import train_test_split

x1_train, x1_test, y_train, y_test = train_test_split(x1, y, test_size=0.2)
x2_train, x2_test, _, _ = train_test_split(x2, y, test_size=0.2)
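# Note: two independent train_test_split calls shuffle x1 and x2 differently,
# so the question pairs above no longer line up row-for-row. A safer variant
# (a sketch, not the original code) splits both inputs and the labels in a
# single call:
x1_train, x1_test, x2_train, x2_test, y_train, y_test = train_test_split(
    x1, x2, y, test_size=0.2, random_state=42)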
Example #8
def index():
    if request.method == 'GET':
        return render_template("index.html")
    elif request.method == 'POST':
        model = load_model(modelPath, custom_objects={'Attention': Attention})
        search_text = request.form['query']
        try:
            response = es_client.search(index=ES_settings['ES_index'],
                                        body={
                                            "query": {
                                                "match": {
                                                    "doc": search_text
                                                }
                                            },
                                            "size": 100
                                        })

        except:
            jsonresponse = []
            return render_template("index.html")
        if response['timed_out'] == True:
            jsonresponse = []
            return render_template("index.html")
        else:
            search_results = response['hits']['hits']
            jsonresponse = []
            urllist = []
            for X in search_results:
                plaintext = X['_source']['doc']
                url = X['_source']['url']
                tok_sentence = sent_tokenize(plaintext)
                jsonresponse += tok_sentence
                tok_sentence_length = len(tok_sentence)
                urllist += tok_sentence_length * [url]

            max_features = 100000
            maxlen = 150
            embed_size = 300
            tok = text.Tokenizer(num_words=max_features, lower=True)
            total_sentence = len(jsonresponse)
            tok.fit_on_texts(jsonresponse + [search_text] * total_sentence)
            word_index = tok.word_index
            X_test = tok.texts_to_sequences(jsonresponse)
            topic_test = tok.texts_to_sequences([search_text] * total_sentence)
            X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
            topic_test = sequence.pad_sequences(topic_test, maxlen=maxlen)
            del tok
            gc.collect()
            embedding_matrix = build_matrix(word_index, embeddings_index)
            y_pred = model.predict([X_test, topic_test],
                                   verbose=1,
                                   batch_size=512)
            K.clear_session()
            index = np.arange(len(y_pred))
            y_pred = np.c_[index, y_pred]
            ratings = pd.DataFrame({
                'index': y_pred[:, 0],
                'no_Argument': y_pred[:, 1],
                'Argument_for': y_pred[:, 2],
                'Argument_against': y_pred[:, 3]
            })

            ratings['res'] = ratings.apply(lambda x: compare(
                x['no_Argument'], x['Argument_for'], x['Argument_against']),
                                           axis=1)

            sortedforratings = ratings[ratings['res'] ==
                                       'Argument_for'].sort_values(
                                           by='Argument_for', ascending=False)
            sortedforratings['text'] = sortedforratings.apply(
                lambda x: jsonresponse[int(x['index'])], axis=1)
            sortedforratings['link'] = sortedforratings.apply(
                lambda x: urllist[int(x['index'])], axis=1)
            sortedforratings['Argument_for'] = sortedforratings[
                'Argument_for'].apply(lambda x: truncted_float(x))

            sortedAgainstRatings = ratings[ratings['res'] ==
                                           'Argument_against'].sort_values(
                                               by='Argument_against',
                                               ascending=False)
            sortedAgainstRatings['Argument_against'] = sortedAgainstRatings[
                'Argument_against'].apply(lambda x: truncted_float(x))

            sortedAgainstRatings['text'] = sortedAgainstRatings.apply(
                lambda x: jsonresponse[int(x['index'])], axis=1)
            sortedAgainstRatings['link'] = sortedAgainstRatings.apply(
                lambda x: urllist[int(x['index'])], axis=1)
            return render_template("result.html",
                                   for_output=sortedforratings,
                                   against_outout=sortedAgainstRatings)
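# compare() and truncted_float() are called above but not shown; plausible
# helpers (illustrative assumptions only) could look like:
def compare(no_argument, argument_for, argument_against):
    # return the name of the highest-scoring class
    scores = {'no_Argument': no_argument,
              'Argument_for': argument_for,
              'Argument_against': argument_against}
    return max(scores, key=scores.get)


def truncted_float(value):
    # keep two decimal places for display
    return float('{:.2f}'.format(value))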
Example #9
    def create_train_data(self):
        # read inputs and labels
        train_comments = self.train_df["comment_text"].astype(str)
        train_label = self.train_df["target"].values
        train_type_labels = self.train_df[self.toxicity_type_list].values

        # raw identity values
        train_identity_values = self.train_df[self.identity_list].fillna(
            0.).values
        # sum of all raw identity values
        train_identity_sum = train_identity_values.sum(axis=1)
        # cap the identity sum at 1 (sigmoid-like range)
        train_identity_sum_label = np.where(train_identity_sum > 1, 1,
                                            train_identity_sum)
        # binary (0/1) identity values
        train_identity_binary = copy.deepcopy(
            self.train_df[self.identity_list])
        for column in self.identity_list:
            train_identity_binary[column] = np.where(
                train_identity_binary[column] > 0.5, 1, 0)
        # 1 if any binary identity value is set
        train_identity_binary_sum = train_identity_binary.sum(axis=1)
        train_identity_or_binary = np.where(train_identity_binary_sum >= 1, 1,
                                            0)
        # all identity labels
        train_identity_type_labels = train_identity_values
        train_identity_type_binary_lables = train_identity_binary
        train_identity_sum_label = train_identity_sum_label
        train_identity_binary_label = train_identity_or_binary

        test_comments = self.test_df["comment_text"].astype(str)
        # fit the tokenizer
        tokenizer = text.Tokenizer(filters=self.stopwords)
        tokenizer.fit_on_texts(
            list(train_comments) + list(test_comments)
        )  # train_comments is a DataFrame column (a Series); list(train_comments) turns it into a plain list
        # tokenization
        train_tokens = tokenizer.texts_to_sequences(
            train_comments)  # accepts either a Series or a list
        test_tokens = tokenizer.texts_to_sequences(test_comments)
        # pad to a fixed length with the sequence utilities
        train_tokens = sequence.pad_sequences(train_tokens,
                                              maxlen=self.max_len)
        test_tokens = sequence.pad_sequences(test_tokens, maxlen=self.max_len)
        # split into training and validation sets
        valid_tokens = train_tokens[self.train_len:]
        valid_label = train_label[self.train_len:]
        valid_type_labels = train_type_labels[self.train_len:]
        train_tokens = train_tokens[:self.train_len]
        train_label = train_label[:self.train_len]
        train_type_labels = train_type_labels[:self.train_len]

        # split the identity labels
        valid_identity_type_labels = train_identity_type_labels[self.train_len:]
        train_identity_type_labels = train_identity_type_labels[:self.train_len]
        valid_identity_type_binary_lables = train_identity_type_binary_lables[self.train_len:]
        train_identity_type_binary_lables = train_identity_type_binary_lables[:self.train_len]
        valid_identity_sum_label = train_identity_sum_label[self.train_len:]
        train_identity_sum_label = train_identity_sum_label[:self.train_len]
        valid_identity_binary_label = train_identity_binary_label[self.train_len:]
        train_identity_binary_label = train_identity_binary_label[:self.train_len]

        # assembled dataset
        dataset = {
            "train_tokens": train_tokens,
            "train_label": train_label,
            "train_type_labels": train_type_labels,
            "valid_tokens": valid_tokens,
            "valid_label": valid_label,
            "valid_type_labels": valid_type_labels,
            "test_tokens": test_tokens,
            "tokenizer": tokenizer,
            "valid_identity_type_labels": valid_identity_type_labels,
            "train_identity_type_labels": train_identity_type_labels,
            "valid_identity_type_binary_lables":
            valid_identity_type_binary_lables,
            "train_identity_type_binary_lables":
            train_identity_type_binary_lables,
            "valid_identity_sum_label": valid_identity_sum_label,
            "train_identity_sum_label": train_identity_sum_label,
            "valid_identity_binary_label": valid_identity_binary_label,
            "train_identity_binary_label": train_identity_binary_label
        }
        return dataset
Example #10
def trainModel(model,
               inputs,
               vocabSize,
               ansLen=100,
               batch_size=32,
               epochs=200,
               validation_split=0.2,
               fileName="summarizer",
               cv=False,
               es=False,
               halfInpCV=False):
    """
        Trains the model
        * inputs should be a list of answer groups. Each answer group should be
          represented by a list of strings where the first string is the accepted answer

        returns: model and tokenizer
    """
    print("Creating Tokenizer")
    tok = text.Tokenizer(vocabSize - 1, lower=False, oov_token="UNK")
    tok.fit_on_texts(ans for ansGroup in inputs for ans in ansGroup)

    fout = open(fileName + '.tok', 'wb')
    pickle.dump(tok.to_json(), fout)
    fout.close()

    print("Preparing training data")
    inputAns = []
    outputAns = []
    for ansGroup in tqdm(inputs):
        numAnswers = len(ansGroup) - 1
        tokAns = tok.texts_to_sequences(ansGroup)
        #restrict to 100 tokens from each anwer and concatenate input answers together
        inp = [w for seq in tokAns[1:] for w in seq[:ansLen]]
        outp = tokAns[0][:ansLen]

        inputAns.append(inp)
        outputAns.append(outp)

    print("Padding/trimming inputs")
    inputAns = sequence.pad_sequences(inputAns,
                                      maxlen=ansLen * numAnswers,
                                      padding="post",
                                      truncating="post")
    outputAns = sequence.pad_sequences(outputAns,
                                       maxlen=ansLen,
                                       padding="post",
                                       truncating="post")

    def f(i):
        x = [0] * vocabSize
        x[i] = 1
        return x

    print("Finalizing training output")
    outNP = zeros((len(outputAns), ansLen, vocabSize))
    for i, doc in enumerate(tqdm(outputAns)):
        for j, word in enumerate(doc):
            outNP[i][j][word] = 1
    outputAns = outNP

    if (cv):
        print("Performing Cross-Validation")
        factor = 2 if halfInpCV else 1
        scores = crossValidate(model.to_json(),
                               inputAns[:int(len(inputAns) / factor)],
                               outputAns[:int(len(outputAns) / factor)])
        print(scores)
        scoreFile = open(fileName + ".cvscores", "wb")
        pickle.dump(scores, scoreFile)
        scoreFile.close()

    print("Training Model")
    callbacks = [
        ModelCheckpoint(fileName +
                        "{epoch:02d}_{loss:.2f}_{val_loss:.2f}.model",
                        verbose=1,
                        period=5)
    ]
    if (es):
        callbacks.append(
            EarlyStopping(monitor="val_loss",
                          patience=2,
                          verbose=1,
                          mode="min",
                          restore_best_weights=True))

    model.fit(inputAns,
              outputAns,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_split=validation_split,
              callbacks=callbacks)

    return model, tok
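# The tokenizer above is pickled as JSON; a hedged usage sketch for loading it
# back (assumes a Keras version that ships tokenizer_from_json and the default
# fileName "summarizer"):
import pickle
from keras.preprocessing.text import tokenizer_from_json

with open('summarizer.tok', 'rb') as fin:
    tok = tokenizer_from_json(pickle.load(fin))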
Example #11
    return model


train_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMNS].values
x_test = test_df[TEXT_COLUMN].astype(str)

for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = np.where(train_df[column] >= 0.5, True, False)

tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE, lower=False)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

sample_weights = np.ones(len(x_train), dtype=np.float32)
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
sample_weights /= sample_weights.mean()

embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)
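# build_matrix is used above but not defined in this snippet; in this family of
# kernels it typically looks like the sketch below (load_embeddings is a
# hypothetical helper returning a word -> vector dict, and 300 is an assumed
# embedding size):
def build_matrix(word_index, embedding_path):
    embedding_index = load_embeddings(embedding_path)  # hypothetical loader
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            pass
    return embedding_matrix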
Example #12
def create_tokenizer(lines):
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
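# Hedged usage sketch for the helper above ("lines" is any iterable of strings;
# the sample texts are illustrative only):
tokenizer = create_tokenizer(['first example document', 'second example document'])
print(len(tokenizer.word_index))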
Example #13
(X_train, y_train), (X_test, y_test) = imdb.load_imdb()

# set parameters:
vocab_size = 1000
maxlen = 300
batch_size = 32
embedding_dims = 50
filters = 10
kernel_size = 3
hidden_dims = 10
epochs = 10

# Use tokenization i.e to convert to matrix/vector

tokenizer = text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_matrix(X_train)
X_test = tokenizer.texts_to_matrix(X_test)

# Padding

X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# Building model( Embedding+CNN 1D+LSTM)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dims, input_length=maxlen))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu'))
model.add(MaxPooling1D())  # to down-sample an input representation
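# The comment above promises Embedding + CNN 1D + LSTM, but the snippet stops at
# the pooling layer; a hedged completion for binary sentiment (the LSTM size,
# output layer, imports and compile settings are assumptions):
model.add(LSTM(hidden_dims))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])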
Example #14
train_text = []
for t in train['comment_text'].fillna('foobar'):
    t = pre_process(t)
    train_text.append(t)
train_text = pd.Series(train_text).astype(str)

test_text = []
for t in test['comment_text'].fillna('foobar'):
    t = pre_process(t)
    test_text.append(t)
test_text = pd.Series(test_text).astype(str)

dict_size = 50000
max_len = 200

tokenizer = text.Tokenizer(num_words=dict_size)
tokenizer.fit_on_texts(list(train_text) + list(test_text))
train_seq = tokenizer.texts_to_sequences(train_text)
test_seq = tokenizer.texts_to_sequences(test_text)
train_X = sequence.pad_sequences(train_seq, maxlen=max_len)
train_Y = train[list(train)[2:]].values  # list to get column names
test_X = sequence.pad_sequences(test_seq, maxlen=max_len)

# Persistence
pd.DataFrame(train_X).to_csv(path_prefix + 'train_X.csv', header=False, index=False)
pd.DataFrame(test_X).to_csv(path_prefix + 'test_X.csv', header=False, index=False)
print('Finished!')

# The model
print("Building and Training a model ...", end=' ')
embed_size = 100
torch.backends.cudnn.deterministic=True

corpus = []
file = open('OLID/olid-training-v1.0.tsv', 'r', encoding='UTF-8')
y_train = []
for i in file:
    if len(corpus) == 10:
        print(corpus[-1])
    a = i.split('\t')
    corpus.append(a[1].lower())
    if a[2] == "OFF":
        y_train.append(1)
    else:
        y_train.append(0)
file.close()

#print(corpus[1:4])

tokenizer = text.Tokenizer(num_words = 1000)
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
x_train = tokenizer.texts_to_sequences(corpus)
x_train = sequence.pad_sequences(x_train, maxlen = 30)






EMBEDDING_FILE = glove_para
embeddings_index = {}
for i, line in enumerate(open(EMBEDDING_FILE,encoding="utf-8")):
    val = line.split()
    embeddings_index[val[0]] = np.asarray(val[1:], dtype='float32')
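# A typical next step (a sketch, not part of the original snippet) maps the
# tokenizer's word_index onto the loaded GloVe vectors to build an embedding
# matrix:
embedding_dim = len(next(iter(embeddings_index.values())))
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, idx in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[idx] = vec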
Example #16
def generateOOVEmbeddings():
    # read the (DL cleaned) dataset and build the vocabulary
    print('loading dataframes...')
    train_df = pd.read_csv('../data/training/train2.cleaned.dl.csv')
    test_df = pd.read_csv('../data/eval/test2.cleaned.dl.csv')

    # ps: forget memory and runtime, it's python here :D
    list_sentences_train = train_df["comment_text"].values
    list_sentences_test = test_df["comment_text"].values
    list_sentences_all = np.concatenate(
        [list_sentences_train, list_sentences_test])

    tokenizer = text.Tokenizer(num_words=400000)
    tokenizer.fit_on_texts(list(list_sentences_all))
    print('word_index size:', len(tokenizer.word_index), 'words')
    word_index = tokenizer.word_index

    # load fastText - only the words
    print('loading fastText embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/crawl-300d-2M.vec')
    begin = True
    for line in f:
        if begin:
            begin = False
        else:
            values = line.split()
            word = ' '.join(values[:-300])
            voc.add(word)
    f.close()
    print('fastText embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)

    print('fastText embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-fastText.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
    oovFile.close()

    # load gloves - only the words
    print('loading gloves embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/glove.840B.300d.txt')
    for line in f:
        values = line.split()
        word = ' '.join(values[:-300])
        voc.add(word)
    f.close()
    print('gloves embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)

    print('gloves embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-gloves.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
    oovFile.close()

    # load word2vec - only the words
    print('loading word2vec embeddings...')
    voc = set()
    f = open(
        '/mnt/data/wikipedia/embeddings/GoogleNews-vectors-negative300.vec')
    begin = True
    for line in f:
        if begin:
            begin = False
        else:
            values = line.split()
            word = ' '.join(values[:-300])
            voc.add(word)
    f.close()
    print('word2vec embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)

    print('word2vec embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-w2v.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
    oovFile.close()

    # load numberbatch - only the words
    print('loading numberbatch embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/numberbatch-en-17.06.txt')
    begin = True
    for line in f:
        if begin:
            begin = False
        else:
            values = line.split()
            word = ' '.join(values[:-300])
            voc.add(word)
    f.close()
    print('numberbatch embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)

    print('numberbatch embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-numberbatch.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
    oovFile.close()
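# The four blocks above repeat the same scan; a hedged refactor (illustrative
# only, not the original code) of the shared logic:
def report_oov(word_index, emb_path, out_path, skip_header=False):
    # collect the vocabulary of the embedding file (words only)
    voc = set()
    with open(emb_path) as f:
        for i, line in enumerate(f):
            if skip_header and i == 0:
                continue
            values = line.split()
            voc.add(' '.join(values[:-300]))
    # write every tokenizer word that is missing from that vocabulary
    oov = [w for w in word_index if w not in voc]
    with open(out_path, 'w') as out:
        out.write('\n'.join(oov) + '\n')
    return oov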
Example #17
def train():
    all_data = np.load('./temp/train_data.npy')
    all_label = np.load('./temp/label_v.npy')

    ##### generate label #####
    print('label generate start')
    tokenizer = text.Tokenizer(filters='\n')
    tokenizer.fit_on_texts(all_label)

    all_label = tokenizer.texts_to_sequences(all_label)
    all_label = np.array(all_label, dtype='int')
    all_label = all_label - 1
    all_label = np_utils.to_categorical(all_label)
    print('label generate end')
    ##### generate label #####

    ##### shuffle #####
    all_data = all_data.reshape(len(all_data), width * 2, n_fil, 1)

    index = list(range(0, len(all_data)))
    np.random.seed(1024)
    np.random.shuffle(index)
    all_data = all_data[index]
    all_label = all_label[index]
    ##### shuffle #####

    ##### model structure #####

    ##### layer1 #####
    model = Sequential()
    model.add(
        Conv2D(filters=48,
               kernel_size=(3, 3),
               input_shape=(width * 2, n_fil, 1),
               padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(filters=48, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(filters=48, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    ##### layer1 #####

    ##### layer2 #####
    model.add(Conv2D(filters=96, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(filters=96, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Dropout(rate=0.3))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    ##### layer2 #####

    ##### layer3 #####
    model.add(Conv2D(filters=192, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(filters=192, kernel_size=(3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Dropout(rate=0.3))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    ##### layer3 #####

    model.add(Flatten())

    model.add(Dense(1024))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    model.add(Dense(512))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    model.add(Dense(41, activation='softmax'))
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    ##### model structure #####

    history = model.fit(all_data,
                        all_label,
                        batch_size=200,
                        epochs=15,
                        verbose=1,
                        validation_split=0.05,
                        shuffle=True,
                        initial_epoch=0)

    model.save('reproduce.h5')

    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
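    # The plotting is cut off here; a typical finish (an assumption, not the
    # original code):
    plt.legend(['train acc', 'val acc'])
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.show()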
Example #18
    def create_dataloader(self):
        # read inputs and labels
        train_comments = self.train_df["comment_text"].astype(str)
        train_label = self.train_df["target"].values
        train_type_labels = self.train_df[self.toxicity_type_list].values

        # new subgroup (pp/np/pn/nn) tasks
        train_np_labels = np.zeros((len(self.train_df), 4))
        train_np_identity_labels = np.zeros(
            (len(self.train_df), len(self.identity_list) * 4))
        train_df_copy = self.train_df[self.identity_list + ["target"]]
        for column in self.identity_list + ["target"]:
            train_df_copy[column] = np.where(train_df_copy[column] > 0.5, True,
                                             False)
        pp_label_bool = train_df_copy["target"] & np.where(
            train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
        np_label_bool = ~train_df_copy["target"] & np.where(
            train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
        pn_label_bool = train_df_copy["target"] & np.where(
            (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
        nn_label_bool = ~train_df_copy["target"] & np.where(
            (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
        train_np_labels[:, 0] = np.where(pp_label_bool > 0, 1, 0)
        train_np_labels[:, 1] = np.where(np_label_bool > 0, 1, 0)
        train_np_labels[:, 2] = np.where(pn_label_bool > 0, 1, 0)
        train_np_labels[:, 3] = np.where(nn_label_bool > 0, 1, 0)
        for i, column in enumerate(self.identity_list):
            pp_label_bool = train_df_copy["target"] & train_df_copy[column]
            np_label_bool = ~train_df_copy["target"] & train_df_copy[column]
            pn_label_bool = train_df_copy["target"] & (~train_df_copy[column])
            nn_label_bool = ~train_df_copy["target"] & (~train_df_copy[column])
            train_np_identity_labels[:, i * 4 + 0] = np.where(
                pp_label_bool > 0, 1, 0)
            train_np_identity_labels[:, i * 4 + 1] = np.where(
                np_label_bool > 0, 1, 0)
            train_np_identity_labels[:, i * 4 + 2] = np.where(
                pn_label_bool > 0, 1, 0)
            train_np_identity_labels[:, i * 4 + 3] = np.where(
                nn_label_bool > 0, 1, 0)

        # raw identity values
        train_identity_values = self.train_df[self.identity_list].fillna(
            0.).values
        # sum of all raw identity values
        train_identity_sum = train_identity_values.sum(axis=1)
        # cap the identity sum at 1 (sigmoid-like range)
        train_identity_sum_label = np.where(train_identity_sum > 1, 1,
                                            train_identity_sum)
        # binary (0/1) identity values
        train_identity_binary = copy.deepcopy(
            self.train_df[self.identity_list])
        for column in self.identity_list:
            train_identity_binary[column] = np.where(
                train_identity_binary[column] > 0.5, 1, 0)
        # 1 if any binary identity value is set
        train_identity_binary_sum = train_identity_binary.sum(axis=1)
        train_identity_or_binary = np.where(train_identity_binary_sum >= 1, 1,
                                            0)
        # all identity labels
        train_identity_type_labels = train_identity_values
        train_identity_type_binary_lables = train_identity_binary
        train_identity_sum_label = train_identity_sum_label
        train_identity_binary_label = train_identity_or_binary

        # fit the tokenizer
        test_comments = self.test_df["comment_text"].astype(str)
        tokenizer = text.Tokenizer(filters=self.stopwords)
        tokenizer.fit_on_texts(
            list(train_comments) + list(test_comments)
        )  # train_comments is a DataFrame column (a Series); list(train_comments) turns it into a plain list
        # tokenization
        train_tokens = tokenizer.texts_to_sequences(
            train_comments)  # accepts either a Series or a list
        test_tokens = tokenizer.texts_to_sequences(test_comments)
        # pad to a fixed length with the sequence utilities
        train_tokens = sequence.pad_sequences(train_tokens,
                                              maxlen=self.max_len)
        test_tokens = sequence.pad_sequences(test_tokens, maxlen=self.max_len)
        # split into training and validation sets
        valid_tokens = train_tokens[self.train_len:]
        valid_label = train_label[self.train_len:]
        valid_type_labels = train_type_labels[self.train_len:]
        train_tokens = train_tokens[:self.train_len]
        train_label = train_label[:self.train_len]
        train_type_labels = train_type_labels[:self.train_len]
        valid_identity_type_labels = train_identity_type_labels[self.train_len:]
        train_identity_type_labels = train_identity_type_labels[:self.train_len]
        valid_identity_type_binary_lables = train_identity_type_binary_lables[self.train_len:]
        train_identity_type_binary_lables = train_identity_type_binary_lables[:self.train_len]
        valid_identity_sum_label = train_identity_sum_label[self.train_len:]
        train_identity_sum_label = train_identity_sum_label[:self.train_len]
        valid_identity_binary_label = train_identity_binary_label[self.train_len:]
        train_identity_binary_label = train_identity_binary_label[:self.train_len]
        valid_np_labels = train_np_labels[self.train_len:]
        train_np_labels = train_np_labels[:self.train_len]
        valid_np_identity_labels = train_np_identity_labels[self.train_len:]
        train_np_identity_labels = train_np_identity_labels[:self.train_len]

        # compute sample weights
        target_weight, aux_weight, identity_weight, np_weight, np_identity_weight = self.cal_sample_weights(
        )

        #train_np_labels
        #train_np_identity_labels
        # convert the tokenised data to tensors
        train_x_tensor = torch.tensor(train_tokens, dtype=torch.long)
        valid_x_tensor = torch.tensor(valid_tokens, dtype=torch.long)
        train_y_tensor = torch.tensor(np.hstack([
            train_label[:, np.newaxis], train_type_labels,
            train_identity_type_labels, train_np_labels
        ]),
                                      dtype=torch.float32)
        valid_y_tensor = torch.tensor(np.hstack([
            valid_label[:, np.newaxis], valid_type_labels,
            valid_identity_type_labels, valid_np_labels
        ]),
                                      dtype=torch.float32)
        target_weight_tensor = torch.tensor(target_weight, dtype=torch.float32)
        aux_weight_tensor = torch.tensor(aux_weight, dtype=torch.float32)
        identity_weight_tensor = torch.tensor(identity_weight,
                                              dtype=torch.float32)
        np_weight_tensor = torch.tensor(np_weight, dtype=torch.float32)
        np_identity_weight_tensor = torch.tensor(np_identity_weight,
                                                 dtype=torch.float32)
        if torch.cuda.is_available():
            train_x_tensor = train_x_tensor.cuda()
            valid_x_tensor = valid_x_tensor.cuda()
            train_y_tensor = train_y_tensor.cuda()
            valid_y_tensor = valid_y_tensor.cuda()
            target_weight_tensor = target_weight_tensor.cuda()
            aux_weight_tensor = aux_weight_tensor.cuda()
            identity_weight_tensor = identity_weight_tensor.cuda()
            np_weight_tensor = np_weight_tensor.cuda()
            np_identity_weight_tensor = np_identity_weight_tensor.cuda()
        # wrap the tensors in a TensorDataset; the training data and labels stay aligned, and when loaded via the dataloader, dataset[:-1] is x and dataset[-1] is y
        train_dataset = data.TensorDataset(train_x_tensor, train_y_tensor,
                                           target_weight_tensor,
                                           aux_weight_tensor,
                                           identity_weight_tensor,
                                           np_weight_tensor,
                                           np_identity_weight_tensor)
        valid_dataset = data.TensorDataset(valid_x_tensor, valid_y_tensor)
        # wrap the dataset in a dataloader
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=self.batch_size,
                                                   shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=self.batch_size,
                                                   shuffle=False)
        # return the training data
        return train_loader, valid_loader, tokenizer
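# Hedged usage sketch for the loaders returned above ("loader" is a hypothetical
# instance of the surrounding class; the batch layout follows the TensorDataset
# comment: element 0 is x, element 1 is y, the rest are the per-task weights):
train_loader, valid_loader, tokenizer = loader.create_dataloader()
for batch in train_loader:
    x_batch, y_batch, weights = batch[0], batch[1], batch[2:]
    # ... forward pass and loss computation would go here
    break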
Example #19
def training():
    # One Epoch is when an Entire dataset is passed forward and backward through the neural network only ONCE!
    epochs = 1000
    #Total number of training example present in a sigle batch
    batch_size = 32

    training_percentage = 0.8

    # file path
    file_path = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
    savings = file_path + '\\savings\\'
    dataset = file_path + '\\dataset\\'
    tokenizer_file = os.path.join(savings, 't.pickle')
    encoder_file = os.path.join(savings, 'e.pickle')
    class_file = os.path.join(savings, 'c.pickle')
    model_file = os.path.join(savings, 'm.h5')
    dataset_file = os.path.join(dataset, 'Youtube01-Psy.csv')

    #Create savings folder if not exists
    os.makedirs(os.path.dirname(tokenizer_file), exist_ok=True)

    removeFile(tokenizer_file)
    removeFile(encoder_file)
    removeFile(class_file)
    removeFile(model_file)

    data = pd.read_csv(dataset_file)

    print(data.head())

    training_size = int(len(data) * training_percentage)

    train_content = data['CONTENT'][:training_size]
    train_class = data['CLASS'][:training_size]

    test_content = data['CONTENT'][training_size:]
    test_class = data['CLASS'][training_size:]

    number_words_dataset = countingWords(train_content)

    tokenize = text.Tokenizer(num_words=number_words_dataset, char_level=False)

    tokenize.fit_on_texts(train_content)

    # tf-idf
    x_train = tokenize.texts_to_matrix(train_content, mode='tfidf')
    x_test = tokenize.texts_to_matrix(test_content, mode='tfidf')

    with open(tokenizer_file, 'wb') as handle:
        pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)

    encoder = LabelEncoder()
    encoder.fit(train_class)

    y_train = encoder.transform(train_class)
    y_test = encoder.transform(test_class)

    with open(encoder_file, 'wb') as handle:
        pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

    num_classes = np.max(y_train) + 1

    with open(class_file, 'wb') as handle:
        pickle.dump(num_classes, handle)

    y_train = utils.to_categorical(y_train, num_classes)
    y_test = utils.to_categorical(y_test, num_classes)

    #############################################################################################
    model = Sequential()
    model.add(
        Dense(num_classes * 8,
              input_shape=(number_words_dataset, ),
              activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(num_classes * 4, activation='relu'))
    model.add(Dropout(0.2))

    model.add(Dense(num_classes * 2, activation='relu'))
    model.add(Dropout(0.2))

    #output layer
    model.add(Dense(num_classes, activation='softmax'))
    #############################################################################################

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    # model.compile(loss= 'categorical_crossentropy',
    #                 optimizer= 'adam',
    #                 metrics=['accuracy'])

    stopper = keras.callbacks.EarlyStopping(monitor='val_loss',
                                            min_delta=0,
                                            patience=2,
                                            verbose=1,
                                            mode='auto',
                                            baseline=None)

    model_history = model.fit(x_train,
                              y_train,
                              batch_size=batch_size,
                              epochs=epochs,
                              verbose=1,
                              validation_split=0.1,
                              callbacks=[stopper])

    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    print("\n\n Test score: ", score[0])
    print("\n\n Test accuracy: ", score[1])

    model.save(model_file)

    # plot with losses
    loss = model_history.history['loss']
    val_loss = model_history.history['val_loss']
    plt.plot(loss)
    plt.plot(val_loss)
    plt.legend(['loss', 'val_loss'])
    plt.ylabel('Loss', fontsize=15)
    plt.xlabel('Epochs', fontsize=15)
    plt.show()

    text_labels = encoder.classes_
    y_softmax = model.predict(x_test)
    y_test_1d = []
    y_pred_1d = []

    for i in range(len(y_test)):
        probs = y_test[i]
        index_arr = np.nonzero(probs)
        one_hot_index = index_arr[0].item(0)
        y_test_1d.append(one_hot_index)

    for i in range(0, len(y_softmax)):
        probs = y_softmax[i]
        predicted_index = np.argmax(probs)
        y_pred_1d.append(predicted_index)

    def plot_confusion_matrix(cm,
                              classes,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title, fontsize=20)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45, fontsize=15)
        plt.yticks(tick_marks, classes, fontsize=15)

        fmt = '.2f'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j,
                     i,
                     format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True label', fontsize=20)
        plt.xlabel('Predicted label', fontsize=20)

    cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d)
    plt.figure(figsize=(44, 37))
    plot_confusion_matrix(cnf_matrix,
                          classes=text_labels,
                          title="Confusion matrix")
    plt.show()
Example #20
batch_size = 1000

num_samples = len(Y)

# input_shape = X[0].size
epochs = 100
num_layers = 2


train_sent,test_sent, y_train, y_test = train_test_split(X, Y, shuffle=True, train_size=0.9)
print(len(X), len(train_sent))


max_words = 3000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

tokenize.fit_on_texts(train_sent) # only fit on train
# pickle.dump(tokenize, open('models/CNN/modelTokenizer.h5', 'wb+'))
x_train = tokenize.texts_to_matrix(train_sent)
x_test = tokenize.texts_to_matrix(test_sent)

print(len(x_train),len(x_train[0]))

# encoder = LabelEncoder()
# encoder.fit(train_tags)
# y_train = encoder.transform(train_tags)
# y_test = encoder.transform(test_tags)
#
# num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
embed_size=300

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))
            
tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts(list(X_train))
X_train = tok.texts_to_sequences(X_train)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)

embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
word_index = tok.word_index
#prepare embedding matrix
num_words = min(max_features, len(word_index) + 1)
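# The snippet stops after computing num_words; a hedged sketch of the usual
# continuation that fills an embedding matrix from embeddings_index (not the
# original code):
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec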
def experiment(dev_id, model_dir):
    print 80 * "="
    print "INITIALIZING"
    print 80 * "="

    df_train = []
    df_train_idx = filter(lambda x: x != dev_id, range(1, 11))
    for i in df_train_idx:
        df_train.append(
            pd.read_csv(os.path.join('split', 'train-' + str(i) + '.csv')))
    df_train = pd.concat(df_train)
    # df_train = df_train[:80]
    X_train_raw = [normalize(t) for t in df_train["comment_text"].fillna('').values]
    y_train = df_train[[
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]].values
    print("Finish loading training data")

    df_dev = pd.read_csv(os.path.join('split',
                                      'train-' + str(dev_id) + '.csv'))
    # df_dev = df_dev[:200]
    X_dev_raw = [normalize(t) for t in df_dev["comment_text"].fillna('').values]
    y_dev = df_dev[[
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]].values
    print("Finish loading dev data")

    df_test = pd.read_csv(os.path.join('test.csv'))
    # df_test = df_test[:200]
    X_test_raw = [normalize(t) for t in df_test["comment_text"].fillna('').values]
    print("Finish loading test data")

    tokenizer = text.Tokenizer(num_words=MAX_FEATURES, char_level=True)
    tokenizer.fit_on_texts(
        list(X_train_raw) + list(X_dev_raw) + list(X_test_raw))
    X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train_raw),
                                     maxlen=MAX_SEQUENCE_LENGTH)
    X_dev = sequence.pad_sequences(tokenizer.texts_to_sequences(X_dev_raw),
                                   maxlen=MAX_SEQUENCE_LENGTH)
    X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test_raw),
                                    maxlen=MAX_SEQUENCE_LENGTH)

    word_index = tokenizer.word_index
    print(word_index)
    valid_features = min(MAX_FEATURES, len(word_index)) + 1
    print(valid_features)
    embeddings_matrix = np.r_[np.zeros((1, valid_features - 1)),
                              np.eye(valid_features - 1, dtype=int)]
    print(embeddings_matrix)

    def get_model():
        inp = Input(shape=(MAX_SEQUENCE_LENGTH, ))
        x = Embedding(valid_features,
                      valid_features - 1,
                      weights=[embeddings_matrix],
                      trainable=False)(inp)
        x_in = SpatialDropout1D(0.5)(x)
        x = Bidirectional(
            GRU(int(sys.argv[1].split('_')[2]),
                return_sequences=True,
                recurrent_dropout=float(sys.argv[1].split('_')[0])))(x_in)
        x = Bidirectional(
            GRU(int(sys.argv[1].split('_')[2]),
                return_sequences=True,
                recurrent_dropout=float(sys.argv[1].split('_')[1])))(x)
        x_raw = Dense(int(sys.argv[1].split('_')[2]) * 2,
                      activation='relu',
                      kernel_initializer='glorot_normal')(x_in)
        x = Add()([x, x_raw])
        x = Activation('relu')(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        conc = concatenate([avg_pool, max_pool])
        outp = Dense(6, activation="sigmoid")(conc)
        model = Model(inputs=inp, outputs=outp)
        model.compile(loss='binary_crossentropy',
                      optimizer=Adam(lr=1e-3),
                      metrics=['accuracy'])

        return model

    model = get_model()

    ra_val = RocAucMetricCallback()  # include it before EarlyStopping!
    filepath = os.path.join(model_dir, "weights_base.best.hdf5")
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='roc_auc_val',
                                 verbose=2,
                                 save_best_only=True,
                                 mode='max')
    early = EarlyStopping(monitor="roc_auc_val", mode="max", patience=5)
    callbacks_list = [ra_val, checkpoint, early]

    model.fit(X_train,
              y_train,
              batch_size=128,
              epochs=8,
              validation_data=(X_dev, y_dev),
              callbacks=callbacks_list,
              verbose=2)
    # note: reload the best saved weights
    model.load_weights(filepath)

    y_train_predict = model.predict(X_train,
                                    batch_size=INFERENCE_BATCH_SIZE,
                                    verbose=2)
    submission = pd.DataFrame.from_dict({'id': df_train['id']})
    submission['comment_text'] = X_train_raw
    class_names = {
        0: 'toxic',
        1: 'severe_toxic',
        2: 'obscene',
        3: 'threat',
        4: 'insult',
        5: 'identity_hate'
    }
    for (id, class_name) in class_names.items():
        submission[class_name] = y_train_predict[:, id]
    submission.to_csv(os.path.join(model_dir, 'predict-keras-train.csv'),
                      index=True)
    print "- AUC: ", roc_auc_score(y_train, y_train_predict)
    print "Finish train set prediction"

    y_dev_predict = model.predict(X_dev,
                                  batch_size=INFERENCE_BATCH_SIZE,
                                  verbose=2)
    submission = pd.DataFrame.from_dict({'id': df_dev['id']})
    submission['comment_text'] = X_dev_raw
    class_names = {
        0: 'toxic',
        1: 'severe_toxic',
        2: 'obscene',
        3: 'threat',
        4: 'insult',
        5: 'identity_hate'
    }
    for (id, class_name) in class_names.items():
        submission[class_name] = y_dev_predict[:, id]
    submission.to_csv(os.path.join(model_dir, 'predict-keras-dev.csv'),
                      index=True)
    print "- AUC: ", roc_auc_score(y_dev, y_dev_predict)
    print "Finish dev set prediction"

    y_test_predict = model.predict(X_test,
                                   batch_size=INFERENCE_BATCH_SIZE,
                                   verbose=2)
    submission = pd.DataFrame.from_dict({'id': df_test['id']})
    class_names = {
        0: 'toxic',
        1: 'severe_toxic',
        2: 'obscene',
        3: 'threat',
        4: 'insult',
        5: 'identity_hate'
    }
    for (id, class_name) in class_names.items():
        submission[class_name] = y_test_predict[:, id]
    submission.to_csv(os.path.join(model_dir, 'submit.csv'), index=False)
    print "Finish test set prediction"

    return 0
Example #23
x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values

y_aux_train = train_df[AUX_COLUMNS].values
x_test = test_df[TEXT_COLUMN].astype(str)



# =============================================================================
# Data processing 
# =============================================================================
for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = np.where(train_df[column] >= 0.5, True, False)

tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

print("---- Weights")
sample_weights = np.ones(len(x_train), dtype=np.float32)
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
sample_weights /= sample_weights.mean()
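
# A tiny self-contained check of the weighting scheme above (illustrative only;
# the column names below are hypothetical placeholders, not the competition's):
demo = pd.DataFrame({'target':     [True, False, True, False],
                     'identity_a': [False, True, False, False],
                     'identity_b': [False, True, True,  False]})
w = np.ones(len(demo), dtype=np.float32)
w += demo[['identity_a', 'identity_b']].sum(axis=1)
w += demo['target'] * (~demo[['identity_a', 'identity_b']]).sum(axis=1)
w += (~demo['target']) * demo[['identity_a', 'identity_b']].sum(axis=1) * 5
w /= w.mean()
print(w)  # non-toxic rows that mention identity groups receive the largest weight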
Beispiel #24
0
raw_data = raw_data.values  # .as_matrix() was removed in newer pandas
y_train = raw_data[:, 0].tolist()
num = len(y_train)
x_train = raw_data[:, 1].tolist()

#read test data
x_test = [
    line.rstrip('\n')
    for line in open('testing_data.txt', 'r', encoding='UTF-8')
]
x_test = [line.split(',', 1)[1] for line in x_test]
del x_test[0]
#x_unlabeled = [line.rstrip('\n') for line in open('training_nolabel.txt', 'r', encoding='UTF-8')]

#tokenized
t = text.Tokenizer(num_words=max_word_idx, filters='\t\n')
t.fit_on_texts(x_train + x_test)
np.save('t.npy', t)
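# np.save pickles the Tokenizer inside a 0-d object array; restoring it later
# would look roughly like this (a sketch, not part of the original script):
# t = np.load('t.npy', allow_pickle=True).item()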
x_train = t.texts_to_sequences(x_train)
#x_unlabeled = t.texts_to_sequences(x_unlabeled)
x_test = t.texts_to_sequences(x_test)

#preprocess
x_train = sequence.pad_sequences(x_train, maxlen=max_sequence_len)
#x_unlabeled = sequence.pad_sequences(x_unlabeled, maxlen = max_sequence_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_sequence_len)

x_train = np.asarray(x_train)
y_train = np.asarray(y_train).reshape(-1, 1)
#x_unlabeled = np.asarray(x_unlabeled)
x_test = np.asarray(x_test)
Beispiel #25
0
wandb.init()
config = wandb.config

# set parameters:
config.vocab_size = 1000
config.maxlen = 300
config.batch_size = 32
config.embedding_dims = 50
config.filters = 250
config.kernel_size = 3
config.hidden_dims = 100
config.epochs = 10

(X_train, y_train), (X_test, y_test) = imdb.load_imdb()

tokenizer = text.Tokenizer(num_words=config.vocab_size)
tokenizer.fit_on_texts(X_train)
# Note: texts_to_matrix already yields fixed-width (vocab_size) bag-of-words rows;
# pad_sequences below then truncates each row to its last config.maxlen columns.
X_train = tokenizer.texts_to_matrix(X_train)
X_test = tokenizer.texts_to_matrix(X_test)

X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)

embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
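
# A common next step (a sketch, not part of this snippet): build an embedding
# matrix aligned with the tokenizer's word index so it can initialise a Keras
# Embedding layer. The dimension 100 matches the glove.6B.100d vectors loaded above.
embedding_matrix = np.zeros((config.vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if i >= config.vocab_size:
        continue
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector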
Beispiel #26
0
        lambda x: " ".join(i for i in jieba.cut(x)))
    fw = open(save_feature, 'wb')
    pickle.dump(df, fw)
    fw.close()
else:
    print("特征存在,直接加载...")
    fw = open(save_feature, 'rb')
    df = pickle.load(fw)
    fw.close()

###Take all rows of column index 3
X_train = df.iloc[:train_row, 3]
X_test = df.iloc[train_row:, 3]

###Tokenize the text with the Keras Tokenizer
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

###Convert texts to sequences of word indices
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

###Pad sequences to MAX_SEQUENCE_LENGTH
x_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
test = sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

###Training labels
y = df_train['COMMLEVEL'].values

###Word index used to build the LSTM embedding weights
word_index = tokenizer.word_index
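
### A hedged sketch of the typical follow-up: cap the vocabulary and size the
### embedding weight matrix for the LSTM (EMBEDDING_DIM is a placeholder that
### is not defined in this snippet):
# nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
# embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))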
Beispiel #27
0
        return


data = pd.read_csv("preprocess/train_char.csv")
data["content"] = data.apply(lambda x: eval(x[1]), axis=1)

validation = pd.read_csv("preprocess/validation_char.csv")
validation["content"] = validation.apply(lambda x: eval(x[1]), axis=1)

model_dir = "model_bigru_char/"
maxlen = 1200
max_features = 20000
batch_size = 128
epochs = 15

tokenizer = text.Tokenizer(num_words=None)
tokenizer.fit_on_texts(data["content"].values)

with open('tokenizer_char.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Is word_index around 60k+ entries?
word_index = tokenizer.word_index

# word2vec : 7983 100 word2vec/chars.vector
w2_model = KeyedVectors.load_word2vec_format("word2vec/chars.vector",
                                             binary=True,
                                             encoding='utf8',
                                             unicode_errors='ignore')
embeddings_index = {}
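
# A sketch of how embeddings_index is presumably filled from the loaded
# KeyedVectors (gensim 3.x API assumed; the original loop is not shown here):
for word in w2_model.index2word:
    embeddings_index[word] = w2_model[word]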
Beispiel #28
0
data = pd.read_csv("stack-overflow.csv")

# Split dataset into 80% training data and 20% test data
EIGHTY_PERCENT = 0.80
train_size = int(len(data) * EIGHTY_PERCENT)
train_posts = data['post'][:train_size]
train_tags = data['tags'][:train_size]
# which means 20% will be test data 
test_posts = data['post'][train_size:]
test_tags = data['tags'][train_size:]

# the vocabulary size for our model - the top 1000 most commonly used words
vocabulary_size = 1000

# Use Keras' Tokenizer class
tokenize = text.Tokenizer(num_words=vocabulary_size)
tokenize.fit_on_texts(train_posts)

# Create the training data from the collection of posts to pass to the model
# Creates a vocabulary_size “bag” array, with 1s indicating the indices 
# where words in a question are present in the vocabulary
x_train = tokenize.texts_to_matrix(train_posts)

# training dataset: 1s and 0s representation of the tokens in the StackOverflow posts data
# 
# [[0. 1. 1. ... 0. 0. 0.]
#  [0. 1. 1. ... 0. 0. 0.]
#  [0. 1. 1. ... 0. 0. 0.]
#  ...
#  [0. 1. 1. ... 0. 0. 0.]
#  [0. 1. 1. ... 0. 0. 1.]
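
# The matching steps for the test split and the tag labels are not shown above.
# A hedged sketch of what they would typically look like (LabelBinarizer is an
# assumption about how the string tags are one-hot encoded, not taken from this snippet):
from sklearn.preprocessing import LabelBinarizer

x_test = tokenize.texts_to_matrix(test_posts)
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)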
Beispiel #29
0
def senti_preprocess():
    train_ori, test_ori = _read_data('train.pd', 'test.pd')

    train_ori['text_list'] = train_ori['text_list'].apply(
        lambda list: _remove_pattern_2(list))
    test_ori['text_list'] = test_ori['text_list'].apply(
        lambda list: _remove_pattern_2(list))

    # Save the random shuffle order so the senti data can be shuffled the same way later
    train_random = np.random.permutation(len(train_ori))
    test_random = np.random.permutation(len(test_ori))
    train_ori = train_ori.iloc[train_random]  # manual shuffle
    test_ori = test_ori.iloc[test_random]

    # Join each user's tweets into one string, mainly for the later tokenizer.fit_on_texts
    train_text = train_ori['text_list'].apply(lambda list: " ".join(list))
    test_text = test_ori['text_list'].apply(lambda list: " ".join(list))

    # Transform the gender labels to build the Y label arrays
    Y_train = train_ori['label'].apply(lambda gender: 1
                                       if gender == 'male' else 0)
    Y_test = test_ori['label'].apply(lambda gender: 1
                                     if gender == 'male' else 0)

    # fit tokenizer
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)  # maximum vocabulary size
    tokenizer.fit_on_texts(
        list(train_text) + list(test_text)
    )  #Updates internal vocabulary based on a list of texts.

    # Turn each user's text_list into integer sequences, then pad them to equal length
    train_ori['seq'] = train_ori['text_list'].apply(
        lambda list: tokenizer.texts_to_sequences(list))
    test_ori['seq'] = test_ori['text_list'].apply(
        lambda list: tokenizer.texts_to_sequences(list))
    train_ori['seq'] = train_ori['seq'].apply(
        lambda list: sequence.pad_sequences(list, maxlen=MAXLEN))
    test_ori['seq'] = test_ori['seq'].apply(
        lambda list: sequence.pad_sequences(list, maxlen=MAXLEN))

    # Convert the padded sequences to numpy arrays
    X_train = np.array(list(train_ori['seq']))
    X_test = np.array(list(test_ori['seq']))

    print(X_train.shape)
    print(X_test.shape)

    #################### Prepare the senti data below
    senti = pd.read_pickle(
        os.path.join(os.path.dirname(__file__), '..', 'output',
                     'senti_ori.pd'))
    senti['text'] = _remove_pattern_2(list(senti['text']))

    senti = senti.iloc[np.random.permutation(len(senti))]  # manual shuffle

    senti_text_list = list(senti['text'])
    senti_tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)  # maximum vocabulary size
    senti_tokenizer.fit_on_texts(
        senti_text_list + list(train_text) + list(test_text)
    )  # Updates internal vocabulary based on a list of texts.

    senti_text_seq = senti_tokenizer.texts_to_sequences(
        senti_text_list
    )  # Transforms each text in texts in a sequence of integers
    senti_train_seq = senti_tokenizer.texts_to_sequences(list(train_text))
    senti_test_seq = senti_tokenizer.texts_to_sequences(list(test_text))

    X_senti = sequence.pad_sequences(
        senti_text_seq, maxlen=LONG_MAXLEN
    )  # Pads each sequence to the same length (length of the longest sequence)
    X_senti = np.array(list(X_senti))
    Y_senti = senti['label']
    print(X_senti.shape)
    print(Y_senti.shape)

    senti_train = sequence.pad_sequences(senti_train_seq, maxlen=LONG_MAXLEN)
    senti_test = sequence.pad_sequences(senti_test_seq, maxlen=LONG_MAXLEN)

    senti_train = np.array(list(senti_train))
    senti_test = np.array(list(senti_test))
    print(senti_train.shape)
    print(senti_test.shape)

    return X_train, Y_train, X_test, Y_test, tokenizer, X_senti, Y_senti, senti_train, senti_test
Beispiel #30
0
    stop_words = stopwords.words("english")

    def lemmatize(x):
        lemmatized = []
        for post in x:
            temp = post.lower()
            for type_ in types:
                temp = temp.replace(' ' + type_, '')
            temp = ' '.join([
                lemmatizer.lemmatize(word) for word in temp.split(' ')
                if (word not in stop_words)
            ])
            lemmatized.append(temp)
        return np.array(lemmatized)

    tokenizer = text.Tokenizer(num_words=TOP_WORDS, split=' ')
    tokenizer.fit_on_texts(lemmatize(x_train))

    def preprocess(x):
        lemmatized = lemmatize(x)
        tokenized = tokenizer.texts_to_sequences(lemmatized)
        return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH)

    ### Assign to dataframe and shuffle rows
    df = pd.DataFrame(data={'x': x_train, 'y': y_train})
    df = df.sample(frac=1).reset_index(drop=True)  ### Shuffle rows
    if SAMPLE:
        df = df.head(10000)  ### Small sample for quick runs

    ### Load glove into memory for embedding
    embeddings_index = dict()