def main(argv):
    if len(argv) not in range(2, 6):
        programme_name = "lingofunk_classify_sentiment.model.hnatt.run"
        print(
            f"usage: PYTHONPATH=. python -m {programme_name} "
            "<category> <quantity> <embedding_name> <input_size> "
            "<learning_rate>"
        )
        sys.exit(2)

    category = argv[0]
    quantity = int(argv[1])

    # Fall back to the configured defaults when the optional arguments
    # are not supplied on the command line.
    embeddings_path = None
    input_size = config["constants"]["input_size"]
    learning_rate = config["constants"]["learning_rate"]

    if len(argv) >= 3:
        embeddings_name = argv[2]
        embeddings_path = fetch(
            f'{config["embeddings"][embeddings_name]["basepath"]}.txt'
        )
        if not os.path.isfile(embeddings_path):
            download_embedding(embeddings_name)
    if len(argv) >= 4:
        input_size = int(argv[3])
    if len(argv) == 5:
        learning_rate = float(argv[4])

    preprocessor_path = fetch(config["models"]["hnatt"]["preprocessor"])
    preprocessor_dir = os.path.dirname(preprocessor_path)
    if not os.path.exists(preprocessor_dir):
        os.makedirs(preprocessor_dir)
    joblib.dump(normalize, preprocessor_path, compress=0)

    (train_X, train_y), (test_X, test_y) = load_balanced_train_and_test_dataframes(
        category, quantity, normalize, save_reviews
    )

    # initialize HNATT
    h = HNATT()
    h.train(
        train_X,
        train_y,
        batch_size=64,
        epochs=10,
        embeddings_path=embeddings_path,
        input_size=input_size,
        learning_rate=learning_rate,
    )

    quantity = len(train_y)
    tag = str(date.today())
    h.load_weights(
        weights_path=WEIGHTS_PATH_TEMPLATE.substitute(quantity=quantity, tag=tag),
        tokenizer_path=TOKENIZER_PATH_TEMPLATE.substitute(quantity=quantity, tag=tag),
    )

    activation_maps = h.activation_maps(
        "they have some pretty interesting things here. i will definitely go back again."
    )
    print(activation_maps)
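# A minimal entry point consistent with the usage string above (argv[0] is the
# category, so only sys.argv[1:] is forwarded). The concrete argument values in
# the example invocation are illustrative, not taken from the project docs.
if __name__ == "__main__":
    main(sys.argv[1:])

# Example:
#   PYTHONPATH=. python -m lingofunk_classify_sentiment.model.hnatt.run \
#       Restaurants 1000 glove 200 0.001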
def main(argv):
    if len(argv) != 2:
        programme_name = "lingofunk_classify_sentiment.model.naive_bayes.run"
        print(
            f"usage: PYTHONPATH=. python -m {programme_name} <category> <quantity>"
        )
        sys.exit(2)

    category = argv[0]
    quantity = int(argv[1])

    try:
        (pos_words, neg_words) = load_samples(
            category, quantity, remove_stopwords_and_include_bigrams, save_reviews
        )
    except Exception:
        print("The data for this category and quantity have not been found.")
        sys.exit(2)

    preprocessor_path = fetch(config["models"]["naive_bayes"]["preprocessor"])
    joblib.dump(remove_stopwords_and_include_bigrams, preprocessor_path, compress=0)

    print(f"Category: {category}")
    (accuracy, classifier, train_set, test_set) = train(pos_words, neg_words)
    classifier.show_most_informative_features()
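# A hedged sketch (the helper name is hypothetical, not part of this module) of
# reusing the artefacts persisted above: both the pickled preprocessor callable
# and the classifier saved by train() are plain joblib dumps, so they round-trip
# with joblib.load for later inspection or inference.
def inspect_saved_model(top_n=10):
    saved_preprocessor = joblib.load(fetch(config["models"]["naive_bayes"]["preprocessor"]))
    saved_classifier = joblib.load(fetch(config["models"]["naive_bayes"]["weights"]))
    saved_classifier.show_most_informative_features(top_n)
    return saved_preprocessor, saved_classifier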
def train(pos_samples, neg_samples):
    model_path = fetch(config["models"]["naive_bayes"]["weights"])
    samples = np.array(pos_samples + neg_samples)
    train_samples, test_samples = train_test_split(
        samples, test_size=0.2, random_state=42
    )

    if os.path.isfile(model_path):
        # NaiveBayesClassifier.train is a classmethod, so calling it on the
        # loaded classifier re-fits it on the current training split.
        classifier = joblib.load(model_path).train(train_samples)
    else:
        classifier = nltk.NaiveBayesClassifier.train(train_samples)
        joblib.dump(classifier, model_path, compress=0)

    accuracy = nltk.classify.util.accuracy(classifier, test_samples)
    print(f"Finished training. The accuracy is {accuracy}.")
    test_trained_classifier(classifier, test_samples)
    return (accuracy, classifier, train_samples, test_samples)
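# A self-contained sketch of the sample format train() expects: each element is
# an (NLTK featureset dict, label) pair, which is what
# nltk.NaiveBayesClassifier.train consumes. The helper and toy data below are
# illustrative only and bypass train(), so they do not touch the configured
# model path.
def _toy_naive_bayes_example():
    import nltk

    toy_pos = [({"great": True, "food": True}, "pos"), ({"friendly": True}, "pos")]
    toy_neg = [({"awful": True, "wait": True}, "neg"), ({"cold": True}, "neg")]
    toy_classifier = nltk.NaiveBayesClassifier.train(toy_pos + toy_neg)
    return toy_classifier.classify({"great": True})  # expected: "pos"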
def download_embedding(embedding):
    settings = config["embeddings"][embedding]
    glove_basepath = fetch(settings["basepath"])
    glove_zip_path = f"{glove_basepath}.zip"
    glove_unzip_path = f"{glove_basepath}.txt"

    embedding_dirs = map(os.path.dirname, [glove_zip_path, glove_unzip_path])
    for embedding_dir in embedding_dirs:
        if not os.path.exists(embedding_dir):
            os.makedirs(embedding_dir)

    glove_url = settings["url"]

    # Download the GloVe data if applicable.
    if os.path.isfile(glove_zip_path):
        logger.info("GloVe data already exists, skipping download.")
    else:
        logger.info("Downloading GloVe data to {}".format(glove_zip_path))
        try:
            args = ["wget", "-O", glove_zip_path, glove_url]
            output = subprocess.Popen(args, stdout=subprocess.PIPE)
            out, err = output.communicate()
        except OSError:
            # wget is not available; stream the archive with requests instead.
            logger.info(
                "Couldn't download GloVe data with wget, "
                "falling back to (slower) Python downloading."
            )
            glove_response = requests.get(glove_url, stream=True)
            with open(glove_zip_path, "wb") as glove_file:
                for chunk in glove_response.iter_content(chunk_size=1024 * 1024):
                    # Filter out keep-alive new chunks.
                    if chunk:
                        glove_file.write(chunk)

    # Extract the GloVe data if it does not already exist.
    if os.path.exists(glove_unzip_path):
        logger.info("Unzipped GloVe data already exists, skipping unzip.")
    else:
        logger.info("Unzipping GloVe archive to {}".format(glove_unzip_path))
        with zipfile.ZipFile(glove_zip_path, "r") as zip_ref:
            zip_ref.extractall(os.path.dirname(glove_unzip_path))
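# For reference, download_embedding only relies on the "basepath" and "url"
# keys of an embeddings entry. A hypothetical entry with that shape (the
# concrete values below are illustrative, not taken from the project config):
EXAMPLE_EMBEDDING_SETTINGS = {
    "basepath": "data/embeddings/glove.6B.100d",  # passed through fetch(); ".zip"/".txt" are appended
    "url": "http://nlp.stanford.edu/data/glove.6B.zip",
}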
from string import Template

from keras import backend as K
from keras import initializers, regularizers
from keras.callbacks import *
from keras.engine.topology import Layer
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import CustomObjectScope

from lingofunk_classify_sentiment.config import config, fetch
from lingofunk_classify_sentiment.data.load import load_glove_embedding
from lingofunk_classify_sentiment.model.hnatt.preprocess import normalize

WEIGHTS_PATH_TEMPLATE = Template(fetch(config["models"]["hnatt"]["weights"]))
TOKENIZER_PATH_TEMPLATE = Template(fetch(config["models"]["hnatt"]["tokenizer"]))
MAX_VOCABULARY_SIZE = config["constants"]["max_vocabulary_size"]
INPUT_SIZE = config["constants"]["input_size"]
LEARNING_RATE = config["constants"]["learning_rate"]

# Uncomment below for debugging
# from tensorflow.python import debug as tf_debug
# sess = K.get_session()
# sess = tf_debug.LocalCLIDebugWrapperSession(sess)
# K.set_session(sess)


def dot_with_kernel(x, kernel):
    """
# based on https://github.com/sfotiadis/yenlp/blob/master/extract_reviews.py
import json
import os
import sys
from string import Template

from lingofunk_classify_sentiment.config import config, fetch

business_data_filename = fetch(config["datasets"]["yelp"]["ids"])
reviews_data_filename = fetch(config["datasets"]["yelp"]["reviews"])
sample_template_filename = Template(
    fetch(config["datasets"]["yelp"]["sample_format"])
)


def get_business_ids(category):
    """Gets the business ids for the given category."""
    with open(business_data_filename) as businesses:
        business_ids = []
        for business in businesses:
            business = json.loads(business)
            if business["categories"] and category in business["categories"].split():
                business_ids.append(business["business_id"])
        return business_ids


def save_reviews(category, quantity):
    """Saves the given number of reviews of a specific category to two files,
    one for each class (pos/neg)."""
    pos_reviews_filename = sample_template_filename.substitute(
import json
import os
from string import Template

import numpy as np
import pandas as pd
from tqdm import tqdm

from lingofunk_classify_sentiment.config import config, fetch

tqdm.pandas()

sample_template_filename = Template(fetch(config["datasets"]["yelp"]["sample_format"]))


def load_samples(category, quantity, preprocess, save=False):
    pos_reviews_fn = sample_template_filename.substitute(
        category=category.lower(), quantity=quantity, label="pos"
    )
    neg_reviews_fn = sample_template_filename.substitute(
        category=category.lower(), quantity=quantity, label="neg"
    )

    both_exist = os.path.isfile(pos_reviews_fn) and os.path.isfile(neg_reviews_fn)
    if not both_exist and save:
        save(category, quantity)

    pos_reviews = open(pos_reviews_fn, "r")
    neg_reviews = open(neg_reviews_fn, "r")