def main():
    start = time.time()

    # Get pickle paths.
    pickle_training = config.general["pickle_data_path"] / "training_data.pickle"
    pickle_private_test = config.general["pickle_data_path"] / "private_test_data.pickle"
    pickle_public_test = config.general["pickle_data_path"] / "public_test_data.pickle"

    # Try to load the pickles.
    training_data = jn.pickle_load(pickle_training)
    private_test_data = jn.pickle_load(pickle_private_test)
    public_test_data = jn.pickle_load(pickle_public_test)

    # If any pickle is missing, extract the datasets from CSV, transform them and save them as pickles.
    if training_data is None or private_test_data is None or public_test_data is None:
        jn.create_dir(config.general["pickle_path"])
        jn.create_dir(config.general["pickle_data_path"])
        training_data, private_test_data, public_test_data = extract_data()
        jn.pickle_save(training_data, pickle_training)
        jn.pickle_save(private_test_data, pickle_private_test)
        jn.pickle_save(public_test_data, pickle_public_test)

    print("Extraction and preprocessing time: ", str(time.time() - start))

    # Launch the list of CNNs.
    cnn.cnn_A(training_data, private_test_data, public_test_data)
    cnn.cnn_B(training_data, private_test_data, public_test_data)
    cnn.cnn_C(training_data, private_test_data, public_test_data)
    cnn.cnn_D(training_data, private_test_data, public_test_data)
    cnn.cnn_E(training_data, private_test_data, public_test_data)
    cnn.cnn_F(training_data, private_test_data, public_test_data)
    cnn.cnn_G(training_data, private_test_data, public_test_data)
    cnn.cnn_H(training_data, private_test_data, public_test_data)
    cnn.cnn_I(training_data, private_test_data, public_test_data)
    cnn.cnn_J(training_data, private_test_data, public_test_data)
    cnn.cnn_K(training_data, private_test_data, public_test_data)
    cnn.cnn_L(training_data, private_test_data, public_test_data)
    cnn.cnn_M(training_data, private_test_data, public_test_data)
    cnn.cnn_N(training_data, private_test_data, public_test_data)
    cnn.cnn_O(training_data, private_test_data, public_test_data)
    cnn.cnn_P(training_data, private_test_data, public_test_data)
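# NOTE: the `jn` helpers used above (pickle_load, pickle_save, create_dir) are project utilities
# whose implementation is not shown in this file. The following is a minimal sketch of the
# assumed behavior -- in particular, pickle_load returning None for a missing file so callers
# can fall back to re-extraction. Names and details here are assumptions, not the project's
# actual module.
import pickle
from pathlib import Path


def pickle_load(path):
    """Return the unpickled object stored at `path`, or None if the file does not exist."""
    path = Path(path)
    if not path.is_file():
        return None
    with path.open("rb") as f:
        return pickle.load(f)


def pickle_save(obj, path):
    """Serialize `obj` to `path` with pickle."""
    with Path(path).open("wb") as f:
        pickle.dump(obj, f)


def create_dir(path):
    """Create `path` (and any missing parents); return True on success, False otherwise."""
    try:
        Path(path).mkdir(parents=True, exist_ok=True)
        return True
    except OSError:
        return False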
def cnn_M(training_data, private_test_data, public_test_data):
    start = time.time()
    cnn = {
        "id": "cnn_M",
        "epochs": EPOCHS,
        "optimizer": "Adam",
        "loss": "categorical_crossentropy",
        "metrics": ["accuracy"]
    }
    training, training_labels, training_pixels = training_data
    private_test, private_test_labels, private_test_pixels = private_test_data
    public_test, public_test_labels, public_test_pixels = public_test_data

    # Init CNN: convolutional base over 48x48 grayscale inputs.
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), padding="same", activation='relu', input_shape=(48, 48, 1)))
    model.add(layers.Conv2D(64, (3, 3), padding="same", activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
    model.add(layers.Conv2D(128, (3, 3), padding="same", activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
    model.add(layers.Conv2D(256, (1, 1), padding="same", activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
    model.add(layers.Flatten())

    # Add dense layers: three fully connected blocks with dropout, then a 7-class softmax.
    model.add(layers.Dense(2304, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(2304, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(2304, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(7, activation='softmax'))
    model.summary()

    # Launch CNN: compile, train on the training set, validate on the private test set,
    # and evaluate on the public test set.
    model.compile(optimizer=cnn["optimizer"], loss=cnn["loss"], metrics=cnn["metrics"])
    hist = model.fit(training_pixels, training_labels,
                     batch_size=256,
                     epochs=cnn["epochs"],
                     validation_data=(private_test_pixels, private_test_labels))
    test_loss, test_accuracy = model.evaluate(public_test_pixels, public_test_labels)

    print("Trained cnn -> ", cnn["id"])
    print("CNN training time: ", str(time.time() - start))
    print("public test acc -> ", test_accuracy)
    print("public test loss -> ", test_loss)

    # Save the training history and the public test results.
    jn.create_dir(config.general["pickle_history_path"])
    saving_history_path = config.general["pickle_history_path"] / str("history_" + cnn["id"] + ".pickle")
    history = {
        "id": cnn["id"],
        "epochs": cnn["epochs"],
        "history": hist.history,
        "test": [test_accuracy, test_loss]
    }
    jn.pickle_save(history, saving_history_path)
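# Example (sketch): reading back the history pickle that cnn_M saves above and plotting the
# accuracy curves. Only the pickle layout ({"id", "epochs", "history", "test"}) comes from the
# function above; the plotting code and the matplotlib dependency are assumptions for
# illustration. Depending on the Keras version, the history keys may be "acc"/"val_acc"
# instead of "accuracy"/"val_accuracy".
def plot_cnn_history_example():
    import pickle
    import matplotlib.pyplot as plt

    path = config.general["pickle_history_path"] / "history_cnn_M.pickle"
    with open(path, "rb") as f:
        history = pickle.load(f)

    acc = history["history"]["accuracy"]          # per-epoch training accuracy
    val_acc = history["history"]["val_accuracy"]  # per-epoch validation (private test) accuracy
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, acc, label="training accuracy")
    plt.plot(epochs, val_acc, label="validation accuracy")
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.title("{} (public test acc = {:.3f})".format(history["id"], history["test"][0]))
    plt.legend()
    plt.show()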
def main():
    start_time = time.time()

    #stopwords_txt = set(open(config.general['stopwords']).read().split())
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(set(STOPWORDS))

    pickle_book_file = config.general["pickle_path"] / "book_list.pickle"

    # Try to load the pickle.
    book_list = jn.pickle_load(pickle_book_file)

    # If there is no pickle, extract and preprocess the corpus.
    if book_list is None:
        if not jn.create_dir(config.general["pickle_path"]):
            return 1
        book_list = extract_books(config.general["corpus_path"])

        # Cut title, add paragraphs and sentences.
        for book in book_list:
            book["original"]["tokens"] = word_tokenize(book["original"]["text"])
            book["original"]["paragraphs"] = blankline_tokenize(book["original"]["text"])
            book["original"]["sentences"] = sent_tokenize(book["original"]["text"])

        # Add per-sentence tokens, bigrams and trigrams.
        for book in book_list:
            token_list, bigram_list, trigram_list = [], [], []
            for sntc in book["original"]["sentences"]:
                tokens = word_tokenize(sntc)
                bigrams = list(nltk.bigrams(tokens))
                trigrams = list(nltk.trigrams(tokens))
                token_list.append(tokens)
                bigram_list.append(bigrams)
                trigram_list.append(trigrams)
            book["original"]["token_list"] = token_list
            book["original"]["bigram_list"] = bigram_list
            book["original"]["trigram_list"] = trigram_list

        # Preprocessed text: strip punctuation, lowercase and remove stopwords.
        #for word in book.lower().split():
        punctuation = re.compile(r'[.,?!:;()|0-9]')
        for book in book_list:
            preprocessed_sentences = []
            preprocessed_token_list, preprocessed_bigram_list, preprocessed_trigram_list = [], [], []
            cleaned_token_list = []
            new_text = re.sub(r'[^\w\s]', '', book["original"]["text"])
            preprocessed_text = new_text.lower()
            for sntc_tokenized in book["original"]["sentences"]:
                new_sentence = re.sub(r'[^\w\s]', '', sntc_tokenized)
                new_sentence = new_sentence.lower()
                new_tokens = word_tokenize(new_sentence)
                new_bigrams = list(nltk.bigrams(new_tokens))
                new_trigrams = list(nltk.trigrams(new_tokens))
                cleaned_tokens = [word for word in new_tokens if word not in stopwords]
                preprocessed_sentences.append(new_sentence)
                preprocessed_token_list.append(new_tokens)
                preprocessed_bigram_list.append(new_bigrams)
                preprocessed_trigram_list.append(new_trigrams)
                cleaned_token_list.append(cleaned_tokens)
            book["preprocess"] = {
                "text": preprocessed_text,
                "tokens": word_tokenize(preprocessed_text),
                "sentences": preprocessed_sentences,
                "token_list": preprocessed_token_list,
                "bigram_list": preprocessed_bigram_list,
                "trigram_list": preprocessed_trigram_list,
                "cleaned_tokens": cleaned_token_list
            }

        # Add word frequencies.
        for book in book_list:
            fdist_original = FreqDist(word for word in word_tokenize(book["original"]["text"]))
            book["original"]["token_frequency"] = dict(fdist_original.items())
            fdist_preprocess = FreqDist(word.lower() for word in word_tokenize(book["preprocess"]["text"]))
            book["preprocess"]["token_frequency"] = dict(fdist_preprocess.items())

        jn.pickle_save(book_list, pickle_book_file)

    # TF.IDF
    pickle_tfidf_file = config.general["pickle_path"] / "tf_idf_dictionary.pickle"

    # Try to load the pickle; if there is none, compute it from the corpus.
    tf_idf_dictionary = jn.pickle_load(pickle_tfidf_file)
    if tf_idf_dictionary is None:
        token_set = [book["preprocess"]["tokens"] for book in book_list]
        tf_idf_dictionary = tf_idf.get_tf_idf(token_set)
        jn.pickle_save(tf_idf_dictionary, pickle_tfidf_file)

    # TF.IDF without stopwords
    pickle_tfidf_nsw_file = config.general["pickle_path"] / "tf_idf_dictionary_nsw.pickle"

    # Try to load the pickle.
    tf_idf_dictionary_nsw = jn.pickle_load(pickle_tfidf_nsw_file)
    # If there is no pickle, compute it from the corpus with stopwords removed.
    if tf_idf_dictionary_nsw is None:
        token_set = [book["preprocess"]["tokens"] for book in book_list]
        token_set_nsw = []
        for tokens in token_set:
            tokens_nsw = [word for word in tokens if word not in stopwords]
            token_set_nsw.append(tokens_nsw)
        tf_idf_dictionary_nsw = tf_idf.get_tf_idf(token_set_nsw)
        jn.pickle_save(tf_idf_dictionary_nsw, pickle_tfidf_nsw_file)

    print("--- Preprocessing lasts %s seconds ---" % (time.time() - start_time))

    atmom = book_list[56]
    dagon = book_list[1]

    # Keep only the tokens of book 1 with a TF.IDF score above 0.002.
    asd = tf_idf_dictionary_nsw[1]
    asd = {k: v for k, v in asd.items() if v > 0.002}

    # Named-entity visualization with spaCy.
    text = dagon["original"]["text"]
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    displacy.serve(doc, style="ent")
    # return 1

    # POS tagging and NER on the first sentence, preprocessed and original.
    tkn = atmom["preprocess"]["token_list"][0]
    hey = nltk.pos_tag(tkn)  # POS
    hoy = ne_chunk(hey)      # NER

    tkn2 = atmom["original"]["token_list"][0]
    hey2 = nltk.pos_tag(tkn2)  # POS
    hoy2 = ne_chunk(hey2)      # NER

    print(hey)
    print(hoy)
    print(hey2)
    print(hoy2)
    asd = 1

    #pst = LancasterStemmer()
    #print(atmom["sentences"][0])
    #print(pst.stem(atmom["sentences"][0]))

    # Noun-phrase chunking on a sample sentence.
    q1 = "The big cat ate the little mouse who was after fresh cheese"
    nw_tk = nltk.pos_tag(word_tokenize(q1))
    print(nw_tk)

    grammar_np = r"NP: {<DT>?<JJ>*<NN>}"
    chunk_parser = nltk.RegexpParser(grammar_np)
    chunk_result = chunk_parser.parse(nw_tk)
    print(chunk_result)

    return 1

    # Note: the code below is unreachable while the early `return 1` above is in place.
    #data = [
    #    [(word.replace(",", "")
    #      .replace(".", "")
    #      .replace("(", "")
    #      .replace(")", ""))
    #     for word in row[2].lower().split()]
    #    for row in reader]
    ## Removes header
    #data = data[1:]

    # Word clouds over all sentences, raw and preprocessed.
    all_sentences = ""
    all_preprocessed_sentences = ""
    for book in book_list:
        for sntc in book["original"]["sentences"]:
            all_sentences = all_sentences + "\n" + sntc
        for sntc in book["preprocess"]["sentences"]:
            all_preprocessed_sentences = all_preprocessed_sentences + "\n" + sntc

    print("There are {} characters in the combination of all sentences.".format(len(all_sentences)))

    # Create and generate a word cloud image:
    #wordcloud = WordCloud().generate(text)
    #wordcloud = WordCloud(max_words=30, background_color="white", collocations=False).generate(text)
    #wordcloud.to_file("img/first_review.png")
    #plt.imshow(wordcloud, interpolation='bilinear')
    #plt.axis("off")
    #plt.show()

    wordcloud = WordCloud(stopwords=stopwords, max_words=50, background_color="white",
                          collocations=False).generate(all_sentences)
    wordcloud.to_file("img/review.png")

    # Display the generated image:
    #plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    wordcloud = WordCloud(stopwords=stopwords, max_words=50, background_color="white",
                          collocations=False).generate(all_preprocessed_sentences)
    wordcloud.to_file("img/refined_review.png")

    # Display the generated image:
    #plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
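# The tf_idf.get_tf_idf helper used above is defined elsewhere in the project. Based on how its
# result is consumed (tf_idf_dictionary_nsw[1] is a per-book {token: score} mapping that can be
# filtered with a threshold), a minimal sketch of such a function, assuming a standard
# tf * log(N / df) weighting, could look like the following. This is an illustrative assumption,
# not the project's actual implementation.
import math
from collections import Counter


def get_tf_idf_sketch(token_set):
    """Return one {token: tf-idf score} dict per document in `token_set` (a list of token lists)."""
    n_docs = len(token_set)

    # Document frequency: in how many documents each token appears.
    df = Counter()
    for tokens in token_set:
        df.update(set(tokens))

    tf_idf_per_doc = []
    for tokens in token_set:
        counts = Counter(tokens)
        total = len(tokens) or 1
        tf_idf_per_doc.append({
            token: (count / total) * math.log(n_docs / df[token])
            for token, count in counts.items()
        })
    return tf_idf_per_doc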