def analysis():
    """ Taking the search query into the variable 'key'. """
    key = request.form['InputText']

    """
    Performing authentication to access Twitter's data.
    (Fill in your Twitter developer credentials below and uncomment the block.)
    """
    """
    consumer_key = ''
    consumer_secret = ''
    access_token = ''
    access_token_secret = ''
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    """

    """ Creating an API object using tweepy. """
    api = tweepy.API(auth)

    """
    Fetching tweets and storing them in the 'results' list.
    'num' denotes the number of tweets to fetch.
    """
    results = []
    num = 50
    # api.search is the Tweepy v3.x name (v4 renamed it to search_tweets)
    for tweet in tweepy.Cursor(api.search, q=key, lang="en").items(num):
        results.append(tweet)

    """ Creating a pandas dataframe to capture tweet information. """
    dataset = pd.DataFrame()
    dataset["tweet_id"] = pd.Series([tweet.id for tweet in results])
    dataset["username"] = pd.Series(
        [tweet.author.screen_name for tweet in results])
    dataset["text"] = pd.Series([tweet.text for tweet in results])
    dataset["followers"] = pd.Series(
        [tweet.author.followers_count for tweet in results])
    dataset["hashtags"] = pd.Series(
        [tweet.entities.get('hashtags') for tweet in results])
    # emoji.UNICODE_EMOJI requires an older version of the 'emoji' package
    # (it was removed in emoji 2.0)
    dataset["emojis"] = pd.Series([
        ','.join(c for c in tweet.text if c in emoji.UNICODE_EMOJI)
        for tweet in results
    ])

    """
    The following piece of code generates a wordcloud of the hashtags
    used in the fetched tweets.
    """
    Hashtag_df = pd.DataFrame(columns=["Hashtag"])
    j = 0
    for tweet in range(0, len(results)):
        hashtag = results[tweet].entities.get('hashtags')
        for i in range(0, len(hashtag)):
            Htag = hashtag[i]['text']
            Hashtag_df.at[j, 'Hashtag'] = Htag
            j = j + 1
    Hashtag_Combined = " ".join(Hashtag_df['Hashtag'].values.astype(str))
    text = " ".join(dataset['text'].values.astype(str))
    cleaned_text = " ".join([
        word for word in text.split()
        if word != "https" and word != "RT" and word != "co"
    ])
    wc = WordCloud(width=500,
                   height=500,
                   background_color="white",
                   stopwords=STOPWORDS).generate(Hashtag_Combined)
    plt.imshow(wc)
    plt.axis("off")
    r = random.randint(1, 101)
    # forward slash so the path also works outside Windows
    st = 'static/hashtag' + str(r) + '.png'
    plt.savefig(st, dpi=300)

    """ The following piece of code builds a list of the top 5 hashtags. """
    hashtag = Hashtag_Combined.split(" ")
    df = pd.DataFrame()
    df['hashtags'] = pd.Series([i for i in hashtag])
    data = df['hashtags'].value_counts()
    tag_count_list = data.values[:5]
    tag_list = data.keys()[:5]

    """ The following piece of code generates tokens using the training set. """
    x = np.load('../Classification network/X_train.npy', allow_pickle=True)
    tk = Tokenizer(num_words=80000)
    tk.fit_on_texts(x)

    """
    The following piece of code preprocesses the fetched tweets.
    Preprocessing steps:
    -> Remove links, hashtag symbols and twitter handles.
    -> Convert text to lowercase.
    -> Remove punctuation.
    -> Remove 'rt' from the text.
    """
    sentDataFrame = dataset.copy(deep=True)
    sentDataFrame['text'] = sentDataFrame['text'].apply(lambda x: ' '.join(
        re.sub(
            "(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
            " ", x).split()))
    sentDataFrame["text"] = sentDataFrame["text"].apply(lambda x: x.lower())
    sentDataFrame["text"] = sentDataFrame["text"].apply(
        lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    sentDataFrame["text"] = sentDataFrame["text"].apply(
        lambda x: x.replace('rt', ''))

    """
    The following piece of code generates a vector of tokens for each tweet
    and pads the vectors so that every vector has a length of 35.
    """
    # note: the sequences are built from the raw dataset['text'],
    # not from the preprocessed sentDataFrame
    tweet_tokens = tk.texts_to_sequences(dataset['text'].values)
    tweet_tokens_pad = pad_sequences(tweet_tokens, maxlen=35, padding='post')

    """ The following piece of code loads the model for sentiment classification. """
    json_file = open("../Classification network/model.json", 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    model.load_weights("../Classification network/model.h5")

    """ Performing predictions using the model. """
    senModelList = model.predict(x=tweet_tokens_pad)
    em = dataset["emojis"].values

    """
    The following piece of code stores the sentiment score for each emoji
    in the dictionary 'emoji_scores'.
    """
    df_emojis = pd.read_csv('emoji_sentiment.csv')
    x = df_emojis['emoji'].values
    y = df_emojis['sentiment_score'].values
    emoji_scores = {}
    for i in range(len(x)):
        emoji_scores[x[i]] = y[i]

    """
    The following piece of code averages the sentiment score obtained from the
    model with the sentiment score of the emojis present in the tweet.
    """
    for q in range(len(em)):
        if (em[q] != 'nan'):
            em_sent_score = 0
            sc_list = em[q].split(",")
            emj_count = 0
            for emj in sc_list:
                if emj in emoji_scores.keys():
                    em_sent_score = em_sent_score + emoji_scores[emj]
                    emj_count += 1
            if (emj_count > 0):
                senModelList[q] = ((
                    (em_sent_score / emj_count) + 1) / 2 + senModelList[q]) / 2

    """
    The following piece of code classifies the sentiment of each tweet and
    stores it in the dataframe 'dataset'.
    """
    senList = []
    for i in range(num):
        if (senModelList[i] <= 0.5):
            senList.append('n')
        else:
            senList.append('p')
    dataset['sentiment'] = pd.Series(senList)

    """
    The following piece of code stores the number of positively classified
    tweets in 'posSentPer' and negatively classified tweets in 'negSentPer',
    which are then stored in the list 'opList'.
    """
    posSentPer = len(dataset[dataset['sentiment'] == 'p'].sentiment)
    negSentPer = len(dataset[dataset['sentiment'] == 'n'].sentiment)
    opList = [posSentPer, negSentPer]

    """
    The following piece of code stores the positive visibility score in
    'posVis' and the negative visibility score in 'negVis', which are then
    stored together in the list 'vbList'.
    """
    pos_dataset_for_visibility = dataset[dataset['sentiment'] == 'p']
    posVis = pos_dataset_for_visibility['followers'].sum(axis=0, skipna=True)
    neg_dataset_for_visibility = dataset[dataset['sentiment'] == 'n']
    negVis = neg_dataset_for_visibility['followers'].sum(axis=0, skipna=True)
    vbList = [posVis, negVis]

    """
    The following piece of code stores the tweet texts in 'tw_text', the
    usernames in 'tw_uname' and the follower counts of the tweet authors
    in 'tw_foll'.
    """
    tw_uname = dataset['username'].values.tolist()
    tw_text = dataset['text'].values.tolist()
    tw_foll = dataset['followers'].values.tolist()
    return render_template('analysis.html',
                           title='analysis',
                           vbList=vbList,
                           key=key,
                           r=r,
                           tag_list=tag_list,
                           opList=opList,
                           tag_count_list=tag_count_list,
                           tw_uname=tw_uname,
                           tw_text=tw_text,
                           tw_foll=tw_foll)
for i in train_sentences:
    tokens = word_tokenize(i)  # word tokenization
    words.append(tokens)  # append the resulting words to the empty list created at the beginning

print('Word2Vec...')
from gensim.models import Word2Vec
from keras.layers import Embedding

model = Word2Vec(words, min_count=1)  # word to vector, gensim implementation
vocabulary = model.wv.vocab  # vocabulary
name = 'w2v.txt'
model.wv.save_word2vec_format(name, binary=False)

EMBEDDING_FILE = 'w2v.txt'

# load embeddings
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_sentences))
tokenized_train_sentences = tokenizer.texts_to_sequences(train_sentences)
tokenized_test_sentences = tokenizer.texts_to_sequences(test_sentences)
train_padding = pad_sequences(tokenized_train_sentences, maxlen)
test_padding = pad_sequences(tokenized_test_sentences, maxlen)


def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


embeddings_index = dict(
    get_coefs(*o.rstrip().rsplit(' '))
    for o in open(EMBEDDING_FILE, encoding='utf8'))
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
y_train = df.loc[:24999, 'sentiment'].values X_test = df.loc[25000:, 'review'].values y_test = df.loc[25000:, 'sentiment'].values X = np.concatenate((X_train, X_test), axis=0) y = np.concatenate((y_train, y_test), axis=0) # summarize size print("Training data: ") print(X.shape) print(y.shape) from tensorflow.python.keras.preprocessing.text import Tokenizer from tensorflow.python.keras.preprocessing.sequence import pad_sequences tokenizer_obj = Tokenizer() total_reviews = X_train + X_test tokenizer_obj.fit_on_texts(total_reviews) # pad sequences max_length = max([len(s.split()) for s in total_reviews]) #max_length=500 # define vocabulary size vocab_size = len(tokenizer_obj.word_index) + 1 X_train_tokens = tokenizer_obj.texts_to_sequences(X_train) X_test_tokens = tokenizer_obj.texts_to_sequences(X_test) X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='pre') X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='pre')
text13 = "eeek" text14 = "already got tix to watch it again" text15 = "really awful" text16 = "not bad" x_train_text = [text1, text2, text3, text4, text5, text6, text7, text8] y_train = [1, 1, 0, 0, 0, 0, 0, 0] x_test_text = [ text8, text9, text10, text11, text12, text13, text14, text15, text16 ] y_test = [0, 0, 1, 1, 0, 0, 1, 0, 1] texts = x_train_text + x_test_text num_words = 100 # use only 100 most popular words from the dataset max_tokens = 10 # truncate individual texts to only 10 words # We first convert these texts to arrays of integer-tokens because that is needed by the model tokenizer = Tokenizer(num_words=num_words) tokenizer.fit_on_texts(x_train_text) x_train_tokens = tokenizer.texts_to_sequences(x_train_text) x_test_tokens = tokenizer.texts_to_sequences(x_test_text) tokenizer.word_index # inspect vocabulary np.array(x_train_tokens[1]) # how text 2 has been tokenized # To input texts with different lengths into the model, we also need to pad and truncate them num_tokens = np.array([len(i) for i in x_train_tokens]) np.sum(num_tokens < max_tokens) / len( num_tokens) # check text covered after truncating pad = 'pre' # better to add 0 at beginning of sequence x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens, padding=pad, truncating='pre')
    j += 1
    if max < len(words):
        max = len(words)

reviews = new_review
world_net = reviews
rating = new_rating
print("The total number of reviews after modifying", len(reviews))

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)
print("The total number of different words from dataset's dictionary",
      X.toarray().shape[1])

# using token to modify text into array index
# finding adjectives from tokenizer based on world_net dictionary.
find_sentiment = False
if find_sentiment:
    t = Tokenizer()
    t.fit_on_texts(world_net)
    dic_tokenizer = t.word_counts
    sentiment = dict()
    for word in dic_tokenizer:
        # keep the WordNet output in its own variable instead of shadowing the builtin 'open'
        wn_output = os.popen("C:\\Users\\billpham\\Desktop\\WordNet\\bin\\wn "
                             + word + " -n# -searchtype -over").read()
        if "adj" in wn_output:
            sentiment[word] = 1
    # write the detected adjectives out; the file has to be opened for writing
    new_data = open("new_data.txt", "w")
    for x in sentiment:
        new_data.write(str(x))
    exit()

# create the adjective data file
generate_data_file = False
if generate_data_file:
    new_data = open("new_data.txt", "r").readlines()
num_validation_samples = int(Validation_split * np.asarray(processed_features).shape[0]) x_train = processed_features[:-num_validation_samples] x_test = processed_features[-num_validation_samples:] Y_train = lb_make.fit_transform(labels[:-num_validation_samples]) Y_test = lb_make.fit_transform(labels[-num_validation_samples:]) # In[9]: #Learning Embeddings # Tokenizing sentences using Keras from tensorflow.python.keras.preprocessing.text import Tokenizer from tensorflow.python.keras.preprocessing.sequence import pad_sequences tokenizer_obj = Tokenizer() tokenizer_obj.fit_on_texts(processed_feature) #Determining number of words in largest sentence and vocabulary size of the corpus max_length = max([len(sentence.split()) for sentence in processed_feature]) vocab_size = len(tokenizer_obj.word_index) + 1 # In[10]: #Converting sentences in train and test sets to sequences x_train_tokens = tokenizer_obj.texts_to_sequences(x_train) x_test_tokens = tokenizer_obj.texts_to_sequences(x_test) #Padding to make sure all the vectors are the same length x_train_pad = pad_sequences(x_train_tokens, maxlen=max_length, padding='post')
print('Found %s word vectors.' % len(embeddings_index)) #PARAMETERS MAX_NB_WORDS = 20000 MAX_SEQUENCE_LENGTH = 200 EMBEDDING_DIM = 100 output_size = n_labels batch_size = 64 nb_epochs = 1000 ## Creating the data for a keras neural network and starting the embedding. comments = df_train.text.tolist() + df_test.text.tolist( ) + df_valid.text.tolist() tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False) tokenizer.fit_on_texts(comments) word_index = tokenizer.word_index sequences_train = tokenizer.texts_to_sequences(df_train.text) sequences_test = tokenizer.texts_to_sequences(df_test.text) sequences_valid = tokenizer.texts_to_sequences(df_valid.text) x_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH) x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH) x_valid = pad_sequences(sequences_valid, maxlen=MAX_SEQUENCE_LENGTH) y_train = df_train.drop(df_train.columns[[0]], axis=1).values y_test = df_test.drop(df_test.columns[[0]], axis=1).values y_valid = df_test.drop(df_test.columns[[0]], axis=1).values
def main(): if len(sys.argv) != 4: sys.exit("Usage: python3 train_hw4.py $1 $2 $3") # read data x_train = pd.read_csv(sys.argv[1]).values y_train = pd.read_csv(sys.argv[2]).values x_test = pd.read_csv(sys.argv[3]).values nlp = en_core_web_sm.load() sentences = x_train[:,1] for i in range(len(sentences)): doc = nlp(sentences[i]) sentences[i] = [t.text for t in doc] max_length = max([len(sentence) for sentence in sentences]) vocab_size = len(sentences) EMBEDDING_DIM = 400 # Train & Save Word2Vec model model = Word2Vec(min_count=1, size = EMBEDDING_DIM, workers=4) model.build_vocab(sentences) # prepare the model vocabulary model.train(sentences, total_examples=model.corpus_count, epochs=20) # train word vectors # save model filename = 'word2vec_{}.txt'.format(TIME) model.wv.save_word2vec_format(filename, binary=False) # read the model embeddings_index = {} f = open(os.path.join('', filename), encoding='utf-8') for line in f: values = line.split() word = values[0] coefs = np.asarray(values[1:]) embeddings_index[word] = coefs f.close x_train = list(x_train[:,1]) # Train & Save Tokenizer tokenizer_obj = Tokenizer() tokenizer_obj.fit_on_texts(x_train) # save model pickle_name = 'tokenizer_{}.txt'.format(TIME) with open(pickle_name, 'wb') as handle: pickle.dump(tokenizer_obj, handle, protocol=pickle.HIGHEST_PROTOCOL) # loading with open(pickle_name, 'rb') as handle: tokenizer_obj = pickle.load(handle) sequences = tokenizer_obj.texts_to_sequences(x_train) word_index = tokenizer_obj.word_index review_pad = pad_sequences(sequences, maxlen=max_length) target = y_train[:,1] num_words = len(word_index) + 1 embedding_matrix = np.zeros((num_words, EMBEDDING_DIM)) for word, i in word_index.items(): if i > num_words: continue embedding_vector = embeddings_index.get(word) if embedding_vector is not None: embedding_matrix[i] = embedding_vector # split to training and testing data VALIDATION_SPLIT = 0.01 indices = np.arange(review_pad.shape[0]) np.random.shuffle(indices) review_pad = review_pad[indices] target = target[indices] num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0]) x_train_pad = review_pad[:-num_validation_samples] y_train = target[:-num_validation_samples] x_test_pad = review_pad[-num_validation_samples:] y_test = target[-num_validation_samples:] # model1 - GRU model= Sequential() embedding_layer = Embedding(num_words, EMBEDDING_DIM, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable=False) model.add(embedding_layer) model.add(GRU(units=50, dropout=0.5, recurrent_dropout=0.5,return_sequences=True)) model.add(GRU(units=50, dropout=0.5, recurrent_dropout=0.5)) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) model.fit(x_train_pad, y_train, batch_size=128, epochs=1, validation_data=(x_test_pad, y_test), verbose=2, shuffle=True) model.save('model_{}.h5'.format(TIME)) # model2: Bi-GRU model= Sequential() embedding_layer = Embedding(num_words, EMBEDDING_DIM, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable=False) model.add(embedding_layer) model.add(Bidirectional(GRU(units=40, dropout=0.4, recurrent_dropout=0.4,return_sequences=True))) model.add(Bidirectional(GRU(units=40, dropout=0.4, recurrent_dropout=0.4))) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) model.fit(x_train_pad, y_train, batch_size=128, epochs=1, validation_data=(x_test_pad, y_test), verbose=2, 
shuffle=True) model.save('model_{}.h5'.format(TIME)) # model3: Bi-GRU model= Sequential() embedding_layer = Embedding(num_words, EMBEDDING_DIM, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable=False) model.add(embedding_layer) model.add(Bidirectional(GRU(units=30, dropout=0.5, recurrent_dropout=0.5,return_sequences=True))) model.add(Bidirectional(GRU(units=30, dropout=0.5, recurrent_dropout=0.5))) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) model.fit(x_train_pad, y_train, batch_size=128, epochs=1, validation_data=(x_test_pad, y_test), verbose=2, shuffle=True) model.save('model_{}.h5'.format(TIME)) print("TRAINING COMPLETED")
# In[15]: X,mid=text_clense_frame(X) # Tokenization of words # In[16]: num_words=mid #Tokenize the text tokenize=Tokenizer(num_words=num_words) tokenize.fit_on_texts(X) idx=tokenize.word_index x_train_token=tokenize.texts_to_sequences(X) #x_test_token=tokenize.texts_to_sequences(X_test) # In[18]: num_tokens=[len(token) for token in x_train_token] num_tokens=np.array(num_tokens) max_tokens=np.mean(num_tokens)+2*np.std(num_tokens) max_tokens=int(max_tokens) print("Max Tokens") print(max_tokens)
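# The snippet above stops right after computing max_tokens (mean + 2*std of
# the per-text token counts); a minimal sketch of the padding step that would
# typically follow, assuming pad_sequences is imported as in the other
# snippets in this section and that the padded array is named x_train_pad:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

x_train_pad = pad_sequences(x_train_token, maxlen=max_tokens,
                            padding='post', truncating='post')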
url = "final_english_data_tagged.txt" names = ['data', 'class'] df = pd.read_csv(url, names=names, delimiter='\t') train_text, test_text, train_y, test_y = train_test_split(df['data'], df['class'], test_size=0.2) # The maximum number of words to be used. (most frequent) MAX_NB_WORDS = 50000 # Max number of words in each complaint. MAX_SEQUENCE_LENGTH = 1300 # This is fixed. EMBEDDING_DIM = 300 tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True) tokenizer.fit_on_texts(df['data'].values) word_index = tokenizer.word_index print('Found %s unique tokens.' % len(word_index)) X = tokenizer.texts_to_sequences(df['data'].values) X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH) print('Shape of data tensor:', X.shape) Y = pd.get_dummies(df['class']).values print('Shape of label tensor:', Y.shape) X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
embed_size = 50 # max number of unique words max_features = 2000 # max number of words from review to use maxlen = 50 embedding_file = "Models/glove.6B.50d.txt" def coefs(word, *arr): return word, np.asarray(arr, dtype='float32') embeddings_index = dict( coefs(*f.strip().split()) for f in open(embedding_file, mode="r", encoding="utf-8")) tokenizer = Tokenizer(num_words=max_features) tokenizer.fit_on_texts(list(train['Reviews'].values)) X_train = tokenizer.texts_to_sequences(train['Reviews'].values) X_test = tokenizer.texts_to_sequences(test['Reviews'].values) x_train = pad_sequences(X_train, maxlen=maxlen) x_test = pad_sequences(X_test, maxlen=maxlen) word_index = tokenizer.word_index nb_words = min(max_features, len(word_index)) embedding_matrix = np.zeros((nb_words, embed_size)) for word, i in word_index.items(): if i >= max_features: break
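# The embedding-matrix loop above is cut off in this listing; the usual
# completion (a sketch of the standard GloVe-loading pattern, not necessarily
# the author's exact code) copies each pretrained vector into the matrix:
for word, i in word_index.items():
    if i >= nb_words:  # stay inside the bounds of embedding_matrix
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector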
def main(): MAX_LENGTH = 50 MAX_NB_WORDS = 250000 embed_size = 300 # german Data = read_data('DeTrainingData_3') # english # Data = read_data('EnTrainingData_3') X_train, X_val, y_train, y_val = train_test_split( Data['Title'], Data['Class'], test_size=0.1, random_state=42) all_embs, emb_mean, emb_std, embeddings_index = load_Embeddings( MAX_LENGTH=50, MAX_NB_WORDS=250000) tokenizer = Tokenizer(num_words=MAX_NB_WORDS) tokenizer.fit_on_texts(Data['Title']) # german joblib.dump(tokenizer, './Models_test/tok_DE_3.pickle') # english # joblib.dump(tokenizer, './Models_test/tok_EN_3.pickle') # to numerical vectors padded_train_sequences = Text_to_Sequence(X_train, tokenizer, MAX_LENGTH) padded_val_sequences = Text_to_Sequence(X_val, tokenizer, MAX_LENGTH) # -------------------------------------------------------------------- """ - The data is first fed into the first-level model and predictions are acquired. - Then the same data is concatenated with the acquired predictions (as metadata) and are fed into the second level model. - Finally the original data is again concatenated with both predictions from both first and second models and fed into the network """ # Predictions by first-level model # german model_lev_1 = load_model("./Models_test/DE_1_0.67.hdf5") # english # model_lev_1 = load_model("./Models_test/EN_1_0.77.hdf5") level_1_num_train = np.argmax(model_lev_1.predict( padded_train_sequences, batch_size=256), axis=1) level_1_num_val = np.argmax(model_lev_1.predict( padded_val_sequences, batch_size=256), axis=1) # Predictions by second-level model with first level predictions as metadata # german model_lev_2 = load_model("./Models/DE_2_0.53.hdf5") # english # model_lev_2 = load_model("./Models/EN_2_0.65.hdf5") level_2_num_train = np.argmax(model_lev_2.predict( [padded_train_sequences, np.float32(level_1_num_train)], batch_size=256), axis=1) level_2_num_val = np.argmax(model_lev_2.predict( [padded_val_sequences, np.float32(level_1_num_val)], batch_size=256), axis=1) # ----------------------------------------------------------------------------- # get embedding matrix (is fed into the first layer => embedding layer) word_index = tokenizer.word_index embedding_matrix_2 = Create_EmbeddingMatrix( emb_mean, emb_std, embed_size, MAX_NB_WORDS, word_index, embeddings_index) # ----------------------------------------------------------------------------- # compile the network rnn_model = get_rnn_model(MAX_NB_WORDS, embedding_matrix_2) print(rnn_model.summary()) # ---------------------------------------------------------------------------------- # take care of inbalance class_weights = class_weight.compute_class_weight( 'balanced', np.unique(y_train), y_train) # ------------------------------------------------------------------------------------------- #### Converting Classes into label Encoding ###### # encode class values as integers encoder = LabelEncoder() encoder.fit(y_train) joblib.dump(encoder, './Models/encoder_DE_3.pickle') # convert integers to one-hot encoding dummy_y_train = Encoder(y_train, encoder) dummy_y_val = Encoder(y_val, encoder) # ---------------------------------------------------------------------------------------- # callbacks lr_scheduler = ReduceLROnPlateau( monitor='val_loss', factor=0.1, patience=3, verbose=1, mode='auto', cooldown=0, min_lr=0.0000001) es = EarlyStopping(monitor='val_loss', min_delta=0.00005, patience=5, verbose=0, mode='auto') # save checkpoint filepath = "./Models_test/weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5" checkpoint = ModelCheckpoint( 
filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max') # ----------------------------------------------------------------------------------------------- # run the network """ train and validation sets = [data, predictions from the first model, predictions from the second model] """ history = rnn_model.fit([padded_train_sequences, level_1_num_train, level_2_num_train], dummy_y_train, validation_data=( [padded_val_sequences, level_1_num_val, level_2_num_val], dummy_y_val), batch_size=256, callbacks=[lr_scheduler, es, checkpoint], class_weight=class_weights, epochs=50) # summarize history for accuracy plt.plot(history.history['accuracy']) plt.plot(history.history['val_accuracy']) plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') # Saving Accuracy Figure plt.savefig('DE_3_Accuracy.png') plt.show() # summarize history for loss plt.plot(history.history['loss']) plt.plot(history.history['val_loss']) plt.title('model loss') plt.ylabel('loss') plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') # Saving Loss Figure plt.savefig('DE_3_Loss.png') plt.show()
def load_data(target): all_article = [] train_artic = [] train_bound = [] train_abstr = [] train_extra = [] # Load all data with open('./data/train.jsonl') as j: for each in j: each = json.loads(each) if each['text'] == "\n": # print(each['id']) continue train_artic += [each['text']] train_bound += [each['sent_bounds']] train_abstr += [each['summary']] train_extra += [each['extractive_summary']] j.close() valid_artic = [] valid_bound = [] valid_abstr = [] valid_extra = [] with open('./data/valid.jsonl') as j: for each in j: each = json.loads(each) if each['text'] == "\n": print(each['id']) valid_artic += [each['text']] valid_bound += [each['sent_bounds']] valid_abstr += [each['summary']] valid_extra += [each['extractive_summary']] j.close() test_artic = [] test_bound = [] with open('./data/test.jsonl') as j: for each in j: each = json.loads(each) if each['text'] == "\n": print('null article in test data:', each['id']) test_artic += [each['text']] test_bound += [each['sent_bounds']] j.close() if target == 'extractive': # Parsing article # Split each article into lists X_train = text_parsing(train_artic, train_bound) X_valid = text_parsing(valid_artic, valid_bound) X_test = text_parsing(test_artic, test_bound) # Data cleaning print('Data cleaning.') X_all_text = [] X_train_seq = text_cleaning(X_train, X_all_text) X_valid_seq = text_cleaning(X_valid, []) X_test_seq = text_cleaning(X_test, []) # Load pretrained MAX_LEN = 100 MAX_NUM_WORDS = 20000 EMBEDDING_DIM = 300 # finally, vectorize the text samples into a 2D integer tensor print('Tokenization') tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) # Fit the tokenizer on our text tokenizer.fit_on_texts(X_all_text) # Get all words that the tokenizer knows word_index = tokenizer.word_index num_words = len(word_index) + 1 print('Found %s unique tokens' % len(word_index)) print('text to sequence vector') train_sentence_length = [] valid_sentence_length = [] test_sentence_length = [] X_train_seq = text_to_sequence(X_train, train_sentence_length, tokenizer) X_valid_seq = text_to_sequence(X_valid, valid_sentence_length, tokenizer) X_test_seq = text_to_sequence(X_test, test_sentence_length, tokenizer) print('Making Label...') def make_label(X, extractive): label = [] for i in range(len(X)): temp = [] extrac_index = extractive[i] for j in range(len(X[i])): if j == extrac_index: temp += [1] * len(X[i][j]) else: temp += [0] * len(X[i][j]) label.append(temp) return label Y_train = make_label(X_train, train_extra) Y_valid = make_label(X_valid, valid_extra) X_train = text_to_one_list(X_train) X_valid = text_to_one_list(X_valid) X_test = text_to_one_list(X_test) X_train = pad_sequences( X_train, maxlen=MAX_LEN, truncating='post', padding='post' ) # Each sentence is padding to a size=max_len vector X_valid = pad_sequences(X_valid, maxlen=MAX_LEN, truncating='post', padding='post') X_test = pad_sequences(X_test, maxlen=MAX_LEN, truncating='post', padding='post') Y_train = pad_sequences(Y_train, maxlen=MAX_LEN, truncating='post', padding='post') Y_valid = pad_sequences(Y_valid, maxlen=MAX_LEN, truncating='post', padding='post') # Preparing embedding matrix num_words = min(MAX_NUM_WORDS, len(word_index) + 1) np.save('./data/extractive_label.npy', Y_train) return if target == 'abstractive': # Parsing article into pieces of sentence X_train = text_parsing(train_artic, train_bound) X_valid = text_parsing(valid_artic, valid_bound) X_test = text_parsing(test_artic, test_bound) # Data cleaning # remove punctuation, ... 
print('Data cleaning.') X_all_text = [] X_train = text_cleaning(X_train, X_all_text) X_valid = text_cleaning(X_valid, []) X_test = text_cleaning(X_test, []) # prepare label text print('Preparing label text') for i in tqdm(range(len(train_abstr))): train_abstr[i] = text_cleaner(train_abstr[i]) for j in tqdm(range(len(valid_abstr))): valid_abstr[j] = text_cleaner(valid_abstr[j]) print('Add start and end tagger to the labels') def add_tagger(abstractive, all_text): for i in tqdm(range(len(abstractive))): abstractive[i] = '_BOS_ ' + abstractive[i] + ' _EOS_' all_text.append(abstractive[i]) return abstractive Y_all_text = [] train_abstr = add_tagger(train_abstr, Y_all_text) valid_abstr = add_tagger(valid_abstr, []) MAX_LEN = 100 MAX_SUMMARY_LEN = 30 MAX_NUM_WORDS = 20000 EMBEDDING_DIM = 300 # finally, vectorize the text samples into a 2D integer tensor print('Tokenization') X_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) # Fit the tokenizer on training text X_tokenizer.fit_on_texts(X_all_text) # Get all words that the tokenizer knows X_word_index = X_tokenizer.word_index num_words = min(len(X_word_index) + 1, MAX_NUM_WORDS) print('Found %s unique tokens' % len(X_word_index)) print('text to sequence vector') X_train = text_to_sequence(X_train, [], X_tokenizer) X_valid = text_to_sequence(X_valid, [], X_tokenizer) X_test = text_to_sequence(X_test, [], X_tokenizer) X_train = text_to_one_list(X_train) X_valid = text_to_one_list(X_valid) X_test = text_to_one_list(X_test) Y_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) Y_tokenizer.fit_on_texts(train_abstr) Y_word_index = Y_tokenizer.word_index # Y tokenization only trained on train data label Y_train = Y_tokenizer.texts_to_sequences(train_abstr) Y_valid = Y_tokenizer.texts_to_sequences(valid_abstr) X_train = pad_sequences( X_train, maxlen=MAX_LEN, truncating='post', padding='post' ) # Each sentence is padding to a size=max_len vector X_valid = pad_sequences(X_valid, maxlen=MAX_LEN, truncating='post', padding='post') X_test = pad_sequences(X_test, maxlen=MAX_LEN, truncating='post', padding='post') Y_train = pad_sequences(Y_train, maxlen=MAX_SUMMARY_LEN, truncating='post', padding='post') Y_valid = pad_sequences(Y_valid, maxlen=MAX_SUMMARY_LEN, truncating='post', padding='post') # Load pretrained # Prepare pre-trained embedding matrix np.save('./data/train.npy', X_train) np.save('./data/abstractive_label.npy', Y_train) with open('./data/X_tokenizer.pkl', 'wb') as file: pickle.dump(X_tokenizer, file) file.close() with open('./data/Y_tokenizer.pkl', 'wb') as file: pickle.dump(Y_tokenizer, file) return
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy']) return model def get_layer2(embedding): return 0 if __name__ == '__main__': x_data, y_data = load_data() corpus = [sentence_seg(x) for x in x_data] tokenizer = Tokenizer() tokenizer.fit_on_texts(corpus) sequences = tokenizer.texts_to_sequences(corpus) max_sequence_len = max([len(s) for s in sequences]) print("max sequence lenght:", max_sequence_len) data = pad_sequences(sequences, maxlen=max_sequence_len) labels = to_categorical(np.asarray(y_data)) X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3) ##########################################################
def build_model(use_gpu: bool = False, num_units: int = 64, num_layers: int = 1, dropout_rate: float = 0.0, batch_size: int = 1000, window_size: int = 10, num_params: int = 0): """ Builds the RNN-Model for character prediction. :param window_size: Sequence size :param batch_size: {int} Size of batch :param dropout_rate: {float} Regulating Dropout rate between layers :param num_layers: {int} Number of layers to build :param num_units: {int} Number of LSTM-Units to use in network :param use_gpu: {bool} Uses Tensorflow GPU support if True, otherwise trains on CPU :param num_params: {int} Number of control parameters :return: Keras model """ # Load max 5000 entries from the dataset to build the Tokenizer / vocabulary loader = Loader(min(batch_size, 5000), 0) tokenizer = Tokenizer(filters='', split='°', lower=False) for dataframe in loader: chars = set() for name in dataframe['name']: chars.update(set(str(name))) tokenizer.fit_on_texts(list(chars)) tokenizer.fit_on_texts(['pre', '<end>', 'pad']) # Build Keras Model model = Sequential() for r in range(0, max(num_layers - 1, 0)): model.add(layer=(CuDNNLSTM if use_gpu else LSTM )(num_units, input_shape=(window_size, len(tokenizer.index_word) + 1 + num_params), return_sequences=True)) model.add(Dropout(dropout_rate)) model.add( layer=(CuDNNLSTM if use_gpu else LSTM)(num_units, input_shape=( window_size, len(tokenizer.index_word) + 1 + num_params))) model.add(Dense(len(tokenizer.index_word) + 1, activation='softmax')) model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) # Show summary print(model.summary()) return model, tokenizer
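# Hypothetical call of build_model() above, just to illustrate the returned
# (model, tokenizer) pair; the argument values are arbitrary examples:
model, tokenizer = build_model(use_gpu=False, num_units=64, num_layers=2,
                               dropout_rate=0.2, batch_size=1000,
                               window_size=10, num_params=0)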
class TokenizerWrapper: def __init__(self, dataset_csv_file, class_name, max_caption_length, tokenizer_num_words=None): dataset_df = pd.read_csv(dataset_csv_file) sentences = dataset_df[class_name].tolist() self.max_caption_length = max_caption_length self.tokenizer_num_words = tokenizer_num_words self.init_tokenizer(sentences) def clean_sentence(self, sentence): return text_to_word_sequence( sentence, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ') def init_tokenizer(self, sentences): for i in range(len(sentences)): if pd.isna(sentences[i]): sentences[i] = "" sentences[i] = self.clean_sentence(sentences[i]) # Tokenize the reviews print("Tokenizing dataset..") self.tokenizer = Tokenizer(oov_token='UNK', num_words=self.tokenizer_num_words) self.tokenizer.fit_on_texts(sentences) # give each word a unique id print("number of tokens: {}".format(self.tokenizer.word_index)) print("Tokenizing is complete.") def get_tokenizer_num_words(self): return self.tokenizer_num_words def get_token_of_word(self, word): return self.tokenizer.word_index[word] def get_word_from_token(self, token): try: return self.tokenizer.index_word[token] except: return "" def get_sentence_from_tokens(self, tokens): sentence = [] for token in tokens[0]: word = self.get_word_from_token(token) if word == 'endseq': return sentence if word != 'startseq': sentence.append(word) return sentence def get_string_from_word_list(self, word_list): return " ".join(word_list) def get_word_tokens_list(self): return self.tokenizer.word_index def tokenize_sentences(self, sentences): index = 0 tokenized_sentences = np.zeros( (sentences.shape[0], self.max_caption_length), dtype=int) for caption in sentences: tokenized_caption = self.tokenizer.texts_to_sequences( [self.clean_sentence(caption[0])]) tokenized_sentences[index] = pad_sequences( tokenized_caption, maxlen=self.max_caption_length, padding='post') # padded with max length index = index + 1 return tokenized_sentences
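# Hypothetical instantiation of the TokenizerWrapper defined above; the CSV
# path and column name are placeholders, not taken from the original project:
wrapper = TokenizerWrapper('captions.csv', 'caption', max_caption_length=40,
                           tokenizer_num_words=5000)
word_index = wrapper.get_word_tokens_list()  # word -> id mapping of the fitted tokenizer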
class TokenizerWrapper: def __init__(self, dataset_csv_file, class_name, max_caption_length, tokenizer_num_words=None): dataset_df = pd.read_csv(dataset_csv_file) sentences = dataset_df[class_name].tolist() self.max_caption_length = max_caption_length self.tokenizer_num_words = tokenizer_num_words self.init_tokenizer(sentences) self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_prefix_space=True) self.gpt2_tokenizer.pad_token = "<" def clean_sentence(self, sentence): return text_to_word_sequence(sentence, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ') def GPT2_pad_token_id(self): return self.gpt2_tokenizer.pad_token_id def GPT2_eos_token_id(self): return self.gpt2_tokenizer.eos_token_id def GPT2_encode(self, sentences, pad=True, max_length=None): if max_length is None: max_length = self.max_caption_length if isinstance(sentences, str): return self.gpt2_tokenizer.encode(sentences, add_special_tokens=True, max_length=max_length, pad_to_max_length=pad) tokens = np.zeros((sentences.shape[0], max_length), dtype=int) for i in range(len(sentences)): if pd.isna(sentences[i]): sentences[i][0] = "" sentence = sentences[i][0].lower() sentence = sentence.replace('"', '') sentence = sentence.replace('xxxx', '') sentence = sentence.replace('endseq', '<|endoftext|>') tokens[i] = self.gpt2_tokenizer.encode(sentence, add_special_tokens=True, max_length=max_length, pad_to_max_length=pad) return tokens def GPT2_decode(self, tokens): return self.gpt2_tokenizer.decode(tokens, skip_special_tokens=True) def GPT2_format_output(self, sentence): sentence = self.clean_sentence(sentence) return sentence def filter_special_words(self, sentence): sentence = sentence.replace('startseq', '') sentence = sentence.replace('endseq', '') sentence = sentence.replace('<|endoftext|>', '') sentence = sentence.replace('<', '') sentence = sentence.strip() return sentence def init_tokenizer(self, sentences): for i in range(len(sentences)): if pd.isna(sentences[i]): sentences[i] = "" sentences[i] = self.clean_sentence(sentences[i]) # Tokenize the reviews print("Tokenizing dataset..") self.tokenizer = Tokenizer(oov_token='UNK', num_words=self.tokenizer_num_words) self.tokenizer.fit_on_texts(sentences) # give each word a unique id print("number of tokens: {}".format(self.tokenizer.word_index)) print("Tokenizing is complete.") def get_tokenizer_num_words(self): return self.tokenizer_num_words def get_token_of_word(self, word): return self.tokenizer.word_index[word] def get_word_from_token(self, token): try: return self.tokenizer.index_word[token] except: return "" def get_sentence_from_tokens(self, tokens): sentence = [] for token in tokens[0]: word = self.get_word_from_token(token) if word == 'endseq': return sentence if word != 'startseq': sentence.append(word) return sentence def get_string_from_word_list(self, word_list): return " ".join(word_list) def get_word_tokens_list(self): return self.tokenizer.word_index def tokenize_sentences(self, sentences): index = 0 tokenized_sentences = np.zeros((sentences.shape[0], self.max_caption_length), dtype=int) for caption in sentences: tokenized_caption = self.tokenizer.texts_to_sequences([self.clean_sentence(caption[0])]) tokenized_sentences[index] = pad_sequences(tokenized_caption, maxlen=self.max_caption_length, padding='post') # padded with max length index = index + 1 return tokenized_sentences
# In[ ]:

# In[22]:
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# concatenate the review arrays; '+' on the two numpy object arrays would
# join the strings element-wise instead of appending the test reviews
total_reviews = X_train.tolist() + X_test.tolist()
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(total_reviews)

max_length = max([len(s.split()) for s in df['review'].values.tolist()])
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

# In[49]:

# In[55]:
def make_model(): # load all training reviews positive_docs = process_docs('data/pos_train', vocab, True) negative_docs = process_docs('data/neg_train', vocab, True) train_docs = negative_docs + positive_docs # create the tokenizer tokenizer = Tokenizer() # fit the tokenizer on the documents tokenizer.fit_on_texts(train_docs) with open('tokenizer.pickle', 'wb') as handle: pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) # sequence encode encoded_docs = tokenizer.texts_to_sequences(train_docs) # pad sequences max_length = max([len(s.split()) for s in train_docs]) print("\n\n maxlenght="+str(max_length)) from tensorflow.python.keras.preprocessing.sequence import pad_sequences X = pad_sequences(encoded_docs, maxlen=max_length, padding='post') # define training labels y = np.array([0 for _ in range(270)] + [1 for _ in range(270)]) Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30, random_state=42) ''' # load all test reviews positive_docs = process_docs('data/pos_test', vocab, False) negative_docs = process_docs('data/neg_test', vocab, False) test_docs = negative_docs + positive_docs # sequence encode encoded_docs = tokenizer.texts_to_sequences(test_docs) # pad sequences Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post') # define test labels ytest = np.array([0 for _ in range(len(listdir("data/neg_test")))] + [1 for _ in range(len(listdir("data/pos_test")))]) ''' print("\n pad_sequences : ",Xtest) print("\n ytest : ",ytest) # define vocabulary size (largest integer value) vocab_size = len(tokenizer.word_index) + 1 # define model model = Sequential() model.add(Embedding(vocab_size, 100, input_length=max_length)) model.add(Conv1D(filters=64, kernel_size=8, activation='relu')) model.add(MaxPooling1D(pool_size=2)) model.add(Conv1D(filters=32, kernel_size=8, activation='relu')) model.add(MaxPooling1D(pool_size=2)) model.add(Flatten()) model.add(Dense(10, activation='relu')) model.add(Dense(1, activation='sigmoid')) print(model.summary()) # compile network model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # fit network model.fit(Xtrain, ytrain, epochs=20, verbose=1) # evaluate loss, acc = model.evaluate(Xtest, ytest, verbose=0) print('Test Accuracy: %f' % (acc*100)) model.save("relevancy_model_v2.0.1.h5") print("Done!")
if wc > 2: input_x.append(text) input_y.append( (float(rating) - 1.0)/4.0 ) print("%d texts %d ratings" % ( len(input_x), len(input_y) ) ) max_length = max([len(s.split()) for s in input_x]) print("max_length = %d" % max_length) #print(input_x.shape) #quit() from tensorflow.python.keras.preprocessing.text import Tokenizer from tensorflow.python.keras.preprocessing.sequence import pad_sequences from keras.utils import to_categorical tokenizer = Tokenizer() input_x = np.array(input_x) tokenizer.fit_on_texts(input_x) sequences = tokenizer.texts_to_sequences(input_x) word_index = tokenizer.word_index review_pad = pad_sequences(sequences, maxlen=max_length) #ratings = np.array(input_y)/5.0 ratings = np.array(input_y) inv_index = {} for w, i in word_index.items(): inv_index[i]=w #ratings = to_categorical(ratings) print("%d tokens, reviews shape: %s, ratings shape: %s" % (len(word_index), review_pad.shape, ratings.shape))
training_x = [] for i, question in enumerate(questions): # tokenize tokens = nltk.word_tokenize(question, language='german') # remove punctution table = str.maketrans("", "", string.punctuation) stripped = [w.translate(table) for w in tokens] # remove non-alphabetic/non-numeric real_tokens = [word for word in stripped if word.isalpha() or word.isnumeric()] # stemming sequence = [snowball.stem(token) for token in real_tokens] training_x.append(sequence) tokenizer = Tokenizer() tokenizer.fit_on_texts(training_x) def index(request): msg = "" if request.method == 'POST': user_message = request.POST.get('nachricht', False) print(user_message) with open("chat.txt", "a+") as f: f.write("("+str(timezone.now())+")"+" Sie: ") f.write(user_message + "\n") f.write("(" + str(timezone.now()) + ")" + " Chatbot: ") msg = answer(user_message) f.write(msg+"\n") f = open("chat.txt", "r")
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']

for sentence in train_data['document']:
    temp_x = []
    temp_x = okt.morphs(sentence, stem=True, norm=True)
    # norm normalizes colloquial spellings, e.g. '그래욬ㅋㅋㅋ' -> '그래요'
    # stem reduces words to their base form, e.g. '그래요' -> '그렇다'
    temp_x = [word for word in temp_x if not word in stopwords]
    train_X.append(temp_x)

# train_y=['송금', '잔액']
print(train_X)

train_seq = []
#test_seq=[]

# Remove words with a frequency of 2 or less from the full vocabulary;
# +1 accounts for the padding token at index 0.
tokenizer = Tokenizer(19416)
tokenizer.fit_on_texts(train_X)
train_seq = tokenizer.texts_to_sequences(train_X)
train_lab = tokenizer.texts_to_sequences(train_y)
print(train_seq)
print(train_lab)

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 8
train_inputs = []
test_inputs = []
train_inputs = pad_sequences(train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
lab_inputs = pad_sequences(train_lab, maxlen=1, padding='post')
print(lab_inputs)
print(train_inputs.shape)
max_headline_length = 22  # max headline length is 22; since the data follows an approximately normal distribution, we set that as the maximum
max_text_length = 1500  # a majority of texts are below 1500 words with several outliers, so we set the max text length to 1500

# In[13]:
x_train, x_test, y_train, y_test = train_test_split(df['cleaned text'],
                                                    df['cleaned headline'],
                                                    random_state=0,
                                                    shuffle=True)

# In[14]:
# text tokenization. Credits go to https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/
# word2vec, bag of words, glove or word embeddings?
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_train))
x_train = x_tokenizer.texts_to_sequences(x_train)
x_test = x_tokenizer.texts_to_sequences(x_test)
x_train = pad_sequences(x_train, maxlen=max_text_length, padding='post')
x_test = pad_sequences(x_test, maxlen=max_text_length, padding='post')
x_voc_size = len(x_tokenizer.word_index) + 1

# In[15]:
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_train))
#creating an embedding dictionary print("creating embedding_index\n") embeddings_index = {} ''' f = open("all_in_one_20/cbow_model.txt") for line in f: values = line.split() word = values[0] coefs = np.asarray(values[1:]) embeddings_index[word] = coefs f.close() ''' #vetorize the text samples print("vetorizing the text descriptions\n") tokenizer_obj = Tokenizer() tokenizer_obj.fit_on_texts(X) sequences = tokenizer_obj.texts_to_sequences(X) #padding sequences word_index = tokenizer_obj.word_index padded_pre_text = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post') #######NO NEED TO SHUFFLE AGAIN SINCE ITS ALREADY SHUFFLED###### ''' #shuffling features and labels (ie shuffling both in same order) indices = np.arange(padded_pre_text.shape[0]) np.random.shuffle(indices)
def read_wordembedding(file_name): embedding_index = {} f = open(os.path.join('',file_name), encoding='utf-8') for line in f: values = line.split() word = values[0] coefs = np.asarray(values[1:]) embedding_index[word]=coefs f.close() return embedding_index docs = preprocessing_dataset(X) del X tokenizer_object = Tokenizer() tokenizer_object.fit_on_texts(docs) max_length = max([len(doc) for doc in docs]) max_length_index = np.argmax([len(doc) for doc in docs]) sequences = tokenizer_object.texts_to_sequences(docs) #pad sequence word_index = tokenizer_object.word_index doc_pad = pad_sequences(sequences, maxlen=max_length) word_embedding_file_name = 'word2vector2.txt' embedding_index = read_wordembedding(word_embedding_file_name) num_words = len(word_index)+1 def average_embedding(word_index,embedding_index):
def load_data(): train = pd.read_csv(FLAGS.input_training_data_path + '/train.csv') #.iloc[:200] test = pd.read_csv(FLAGS.input_training_data_path + '/test.csv') #.iloc[:200] # sub1 = pd.read_csv(data_dir + '/submission_ensemble.csv') nrow = train.shape[0] print("Train Size: {0}".format(nrow)) print("Test Size: {0}".format(test.shape[0])) coly = [c for c in train.columns if c not in ['id', 'comment_text']] print("Label columns: {0}".format(coly)) y = train[coly] tid = test['id'].values if FLAGS.load_stacking_data: data_dir = "../../Data/2fold/" svd_features = np.load(data_dir + 'svd.npy') svd_train = svd_features[:nrow] svd_test = svd_features[nrow:] kf = KFold(n_splits=2, shuffle=False) for train_index, test_index in kf.split(svd_train): svd_train_part = svd_train[test_index] break train_data = np.load(data_dir + 'stacking_train_data.npy') print(train_data.shape, svd_train_part.shape) train_data = np.c_[train_data, svd_train_part] train_label = np.load(data_dir + 'stacking_train_label.npy') # train_data = train_data[:100] # train_label = train_label[:100] test_data = np.load(data_dir + 'stacking_test_data.npy') emb_weight = None else: df = pd.concat([train['comment_text'], test['comment_text']], axis=0) df = df.fillna("unknown") data = df.values # Text to sequence @contextmanager def timer(name): """ Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s """ t0 = time.time() yield print('[{0}] done in {1} s'.format(name, time.time() - t0)) with timer("Performing stemming"): if FLAGS.stem: # stem_sentence = lambda s: " ".join(ps.stem(word) for word in s.strip().split()) data = [gensim.parsing.stem_text(comment) for comment in data] print('Tokenizer...') if not FLAGS.char_split: tokenizer = Tokenizer(num_words=FLAGS.vocab_size) tokenizer.fit_on_texts(data) data = tokenizer.texts_to_sequences(data) data = pad_sequences(data, maxlen=FLAGS.max_seq_len) if FLAGS.load_wv_model: emb_weight = get_word2vec_embedding(location = FLAGS.input_training_data_path + FLAGS.wv_model_file, \ tokenizer = tokenizer, nb_words = FLAGS.vocab_size, embed_size = FLAGS.emb_dim, \ model_type = FLAGS.wv_model_type, uniform_init_emb = FLAGS.uniform_init_emb) else: if FLAGS.uniform_init_emb: emb_weight = np.random.uniform( 0, 1, (FLAGS.vocab_size, FLAGS.emb_dim)) else: emb_weight = np.zeros((FLAGS.vocab_size, FLAGS.emb_dim)) else: tokenizer = None data_helper = data_helper(sequence_max_length = FLAGS.max_seq_len, \ wv_model_path = FLAGS.input_training_data_path + FLAGS.wv_model_file, \ letter_num = FLAGS.letter_num, emb_dim = FLAGS.emb_dim, load_wv_model = FLAGS.load_wv_model) data, emb_weight, FLAGS.vocab_size = data_helper.text_to_triletter_sequence( data) train_data, train_label = data[:nrow], y.values[:nrow] test_data = data[nrow:] return train_data, train_label, test_data, coly, tid, emb_weight
#neu 0 #neg 1 #pos 2 text = [] label = [] for i in df.Text: text.append(i) for i in df.Label: label.append(i) ########## text preprocessing ######################################### from tensorflow.python.keras.preprocessing.text import Tokenizer from tensorflow.python.keras.preprocessing.sequence import pad_sequences tokenizer_obj = Tokenizer() tokenizer_obj.fit_on_texts(text) #pad sequences max_length = max([len(s.split()) for s in text]) #define vocablury size vocab_size = len(tokenizer_obj.word_index) + 1 print(max_length) print(vocab_size) ##generating tokens text_token = tokenizer_obj.texts_to_sequences(text) ##adding padding
for line in lines: tokens = word_tokenize(line) tokens = [w.lower() for w in tokens] table = str.maketrans('','',string.punctuation) stripped = [w.translate(table) for w in tokens] words = [word for word in stripped if word.isalpha()] stop_words = set(stopwords.words('english')) words = [w for w in words if not w in stop_words] triple_lines.append(words) print(colored(len(triple_lines),'green')) EMBEDDING_DIM = 200 #Vectorize the text samples into a S2 integer tensor tokenizer_obj = Tokenizer() tokenizer_obj.fit_on_texts(triple_lines) sequences = tokenizer_obj.texts_to_sequences(triple_lines) #pad sequences : add padding to make all the vectors of same length #define vocabulary size vocab_size = len(tokenizer_obj.word_index) + 1 print(colored(sequences,'green')) #pad sequences word_index = tokenizer_obj.word_index max_length = 9 triple_pad = pad_sequences(sequences, maxlen=max_length)
    output = np.floor(output)
    output = output.astype(np.int16)
    return output


def quanti_convert_int16_to_float(data, fix_pos):
    amp = 2**fix_pos
    output = data.astype(np.float32)
    output = output / amp
    return output


# note: the row returned by iterrows() is a copy, so the cleaned text has to
# be written back to the dataframe explicitly
for idx, row in complain_data.iterrows():
    complain_data.at[idx, complain_data.columns[0]] = row[0].replace('rt', ' ')

tokenizer = Tokenizer(num_words=vocab_size, split=' ')
tokenizer.fit_on_texts(complain_data['Customer_Service'].values)
X = tokenizer.texts_to_sequences(complain_data['Customer_Service'].values)
max_sentence_len = 50
X = pad_sequences(X, maxlen=max_sentence_len)
Y = complain_data['Satisfaction'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

embedding_vector_length = 32
model = Sequential()
NUMBER_OF_UNIQUE_WORDS_CONSIDERED = 25000 LENGTH_OF_SENTENCES = 250 DROPOUT_FACTOR = 0.1 BATCH_SIZE = 32 EPOCHS = 2 ############################################################ ########## Creating and training the Neural Network ############################################################ train_set = pd.read_csv('train.csv') sentences_train_set = train_set["comment_text"] tokenizer = Tokenizer(num_words=NUMBER_OF_UNIQUE_WORDS_CONSIDERED) tokenizer.fit_on_texts(list(sentences_train_set)) tokenized_train_set = tokenizer.texts_to_sequences(sentences_train_set) columns = [ "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate" ] y = train_set[columns].values X_t = pad_sequences(tokenized_train_set, maxlen=LENGTH_OF_SENTENCES) inputs = Input(shape=(LENGTH_OF_SENTENCES, )) x = Embedding(NUMBER_OF_UNIQUE_WORDS_CONSIDERED, 128)(inputs) x = LSTM(80, return_sequences=True, name='lstm')(x) x = GlobalMaxPool1D()(x) x = Dropout(DROPOUT_FACTOR)(x) x = Dense(50, activation="relu")(x)