def logistic_regression_grid(x_train, y_train, x_test, y_test, class_ratio,
                             make_feature_analysis=False, feature_names=None,
                             top_features=0, plot_name="coeff"):
    """Hand an L2 logistic regression and a grid of C values to grid_classifier.

    Prints the model title, builds a ``LogisticRegression`` whose
    ``class_weight`` is ``class_ratio``, and passes it together with the
    candidate ``C`` values and the feature-analysis options on to
    ``grid_classifier``. Nothing is returned.
    """
    utils.print_model_title("Logistic Regression")
    estimator = LogisticRegression(C=1.0, class_weight=class_ratio, penalty='l2')
    # Candidate regularisation strengths, keyed by the estimator argument name.
    search_space = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_classifier(x_train, y_train, x_test, y_test, estimator, search_space,
                    make_feature_analysis, feature_names, top_features, plot_name)
def linear_svm_grid(x_train, y_train, x_test, y_test, class_ratio,
                    make_feature_analysis=False, feature_names=None,
                    top_features=0, plot_name="coeff"):
    """Hand an L2 linear SVM and a grid of C values to grid_classifier.

    The candidate ``C`` values come from ``get_regularization_params()``
    called with its defaults. The ``LinearSVC`` uses ``class_ratio`` as its
    ``class_weight``. The feature-analysis options are forwarded unchanged.
    Nothing is returned.
    """
    utils.print_model_title("Linear SVM")
    search_space = {'C': get_regularization_params()}
    estimator = LinearSVC(C=1.0, class_weight=class_ratio, penalty='l2')
    grid_classifier(x_train, y_train, x_test, y_test, estimator, search_space,
                    make_feature_analysis, feature_names, top_features, plot_name)
def nonlinear_svm_grid(x_train, y_train, x_test, y_test, class_ratio,
                       make_feature_analysis=False, feature_names=None,
                       top_features=0, plot_name="coeff"):
    """Hand an RBF-kernel SVC and a grid of C/gamma values to grid_classifier.

    Both value ranges are produced by ``get_regularization_params`` with
    explicit arguments. The ``SVC`` uses ``class_ratio`` as its
    ``class_weight``. The feature-analysis options are forwarded unchanged.
    Nothing is returned.
    """
    utils.print_model_title("Nonlinear SVM")
    search_space = {
        'kernel': ['rbf'],
        'C': get_regularization_params(a=-1, b=0, c=2, d=1, e=5),
        'gamma': get_regularization_params(a=-2, b=-1, c=2, d=1, e=5),
    }
    estimator = SVC(class_weight=class_ratio)
    grid_classifier(x_train, y_train, x_test, y_test, estimator, search_space,
                    make_feature_analysis, feature_names, top_features, plot_name)
def logistic_regression(x_train, y_train, x_test, y_test, class_ratio='balanced'):
    """Fit an L2 logistic regression (C=0.01) and print test-set statistics.

    The model is trained on ``(x_train, y_train)`` with ``class_ratio`` as
    its ``class_weight``. Its predictions for ``x_test`` are handed to
    ``utils.print_statistics`` along with ``y_test``. Nothing is returned.
    """
    utils.print_model_title("Logistic Regression")
    model = LogisticRegression(C=0.01, class_weight=class_ratio, penalty='l2')
    model.fit(x_train, y_train)
    utils.print_statistics(y_test, model.predict(x_test))
def baseline(tweets_train, train_labels, tweets_test, test_labels):
    """Evaluate the baseline classifiers on each baseline feature type.

    For every feature type ('1', '2', '3' and 'ngrams') the train and test
    tweets are turned into features by ``extract_baseline_features``, after
    which a linear SVM and a logistic regression from ``classifiers`` are
    run on them. The elapsed time per feature type is printed.

    Args:
        tweets_train: training tweets, passed to the feature extractors.
        train_labels: labels of the training tweets.
        tweets_test: test tweets, passed to the feature extractors.
        test_labels: labels of the test tweets.
    """
    # Import the subjectivity lexicon
    subj_dict = data_proc.get_subj_lexicon()

    types_of_features = ['1', '2', '3', 'ngrams']
    for t in types_of_features:
        start = time.time()
        utils.print_model_title("Classification using feature type " + t)

        # Compare by value: ``is`` on string literals only works by accident
        # of interning and raises a SyntaxWarning on Python 3.8+.
        if t == '1':
            x_train_features = extract_baseline_features.get_features1(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features1(
                tweets_test, subj_dict)
        elif t == '2':
            x_train_features = extract_baseline_features.get_features2(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features2(
                tweets_test, subj_dict)
        elif t == '3':
            x_train_features = extract_baseline_features.get_features3(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features3(
                tweets_test, subj_dict)
        elif t == 'ngrams':
            # The n-gram map built on the training set is reused for the test set.
            ngram_map, x_train_features = extract_baseline_features.get_ngram_features(
                tweets_train, n=1)
            x_test_features = extract_baseline_features.get_ngram_features_from_map(
                tweets_test, ngram_map, n=1)

        # Get the class ratio
        class_ratio = utils.get_classes_ratio_as_dict(train_labels)

        # Train on a Linear Support Vector Classifier
        print("\nEvaluating a linear SVM model...")
        classifiers.linear_svm(x_train_features, train_labels,
                               x_test_features, test_labels, class_ratio)

        # Train on a Logistic Regression Classifier
        print("\nEvaluating a logistic regression model...")
        classifiers.logistic_regression(x_train_features, train_labels,
                                        x_test_features, test_labels, class_ratio)

        end = time.time()
        print(
            "Completion time of the baseline model with features type %s: %.3f s = %.3f min"
            % (t, (end - start), (end - start) / 60.0))
def baseline(tweets_train, train_labels, tweets_test, test_labels):
    """Evaluate the baseline classifiers using the 'hindi_lexicon.tff' lexicon.

    For every enabled feature type ('1', '2' and 'ngrams') the train and
    test tweets are turned into features by ``extract_baseline_features``,
    after which a linear SVM and a logistic regression from ``classifiers``
    are run on them. The elapsed time per feature type is printed.

    Args:
        tweets_train: training tweets, passed to the feature extractors.
        train_labels: labels of the training tweets.
        tweets_test: test tweets, passed to the feature extractors.
        test_labels: labels of the test tweets.
    """
    subj_dict = dproc.get_subj_lexicon('hindi_lexicon.tff')

    # Feature type '3' is deliberately left out of this variant.
    types_of_features = ['1', '2', 'ngrams']
    for t in types_of_features:
        start = time.time()
        utils.print_model_title("Classification using features type " + t)

        # Compare by value: ``is`` on string literals only works by accident
        # of interning and raises a SyntaxWarning on Python 3.8+.
        if t == '1':
            x_train_features = extract_baseline_features.get_features1(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features1(
                tweets_test, subj_dict)
        elif t == '2':
            x_train_features = extract_baseline_features.get_features2(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features2(
                tweets_test, subj_dict)
        elif t == 'ngrams':
            # The n-gram map built on the training set is reused for the test set.
            ngram_map, x_train_features = extract_baseline_features.get_ngram_features(
                tweets_train, n=1)
            x_test_features = extract_baseline_features.get_ngram_features_from_map(
                tweets_test, ngram_map, n=1)

        # Get the class ratio
        class_ratio = utils.get_classes_ratio_as_dict(train_labels)

        # Train on a linear Support Vector Classifier
        print('\n Evaluating a linear SVM model...')
        classifiers.linear_svm(x_train_features, train_labels,
                               x_test_features, test_labels, class_ratio)

        # Train on logistic regression
        classifiers.logistic_regression(x_train_features, train_labels,
                                        x_test_features, test_labels, class_ratio)

        end = time.time()
        print(
            "Completion time of the baseline model with features type %s: %.3f s = %.3f min"
            % (t, (end - start), (end - start) / 60.0))
def logistic_regression_grid(x_train, y_train, x_test, y_test, class_ratio,
                             make_feature_analysis=False, feature_names=None,
                             top_features=0, plot_name='coeff'):
    """Pass an L2 logistic regression plus candidate C values to grid_classifier.

    The ``LogisticRegression`` is created with ``class_ratio`` as its
    ``class_weight``. Only ``C`` is included in the parameter grid. The
    feature-analysis options are forwarded unchanged. Nothing is returned.
    """
    utils.print_model_title("Logistic Regression")
    candidate_cs = [0.001, 0.01, 0.1, 1, 10, 100]
    model = LogisticRegression(C=1.0, class_weight=class_ratio, penalty='l2')
    grid_classifier(x_train, y_train, x_test, y_test, model, {'C': candidate_cs},
                    make_feature_analysis, feature_names, top_features, plot_name)
def run_dl_analysis(train_tweets, test_tweets, y_train, y_test, path, shuffle=True,
                    max_tweet_length=40, emb_type='glove', trainable=True, plot=True,
                    dnn_models=None, epochs=50, batch_size=32, embedding_dim=300,
                    hidden_units=256, dropout=0.5):
    """Train and evaluate each requested deep model on the given tweets.

    The tweets are encoded as padded sequences of word indices, one embedding
    layer is built and shared by all models, and for every name in
    ``dnn_models`` a model is built, compiled, fitted, reloaded from its best
    checkpoint and evaluated with ``predict``. Model artefacts are written
    below ``path + '/models/dnn_models/'``.

    Args:
        train_tweets, test_tweets: tweets to encode; their words are shuffled
            first when ``shuffle`` is true.
        y_train, y_test: targets passed to ``model.fit`` and ``predict``.
        path: base directory under which model files are stored.
        shuffle: whether to run ``utils.shuffle_words`` on both tweet sets.
        max_tweet_length: length every sequence is padded/truncated to.
        emb_type, embedding_dim, trainable: forwarded to ``build_embedding_layer``.
        plot: whether to save the architecture image and training plots.
        dnn_models: iterable of model names; ``None`` is treated as empty.
        epochs, batch_size: forwarded to ``model.fit``.
        hidden_units, dropout: forwarded to ``build_model``.
    """
    if shuffle:
        train_tweets = utils.shuffle_words(train_tweets)
        test_tweets = utils.shuffle_words(test_tweets)

    # Convert all tweets into sequences of word indices
    tokenizer, train_indices, test_indices = utils.encode_text_as_word_indexes(
        train_tweets, test_tweets, lower=True)
    word_to_index = tokenizer.word_index
    print('There are %s unique tokens.' % len(word_to_index))

    # Pad sequences with 0s
    x_train = pad_sequences(train_indices, maxlen=max_tweet_length,
                            padding='post', truncating='post', value=0.)
    x_test = pad_sequences(test_indices, maxlen=max_tweet_length,
                           padding='post', truncating='post', value=0.)
    print("Shape of the x train set ", x_train.shape)
    print("Shape of the x test set ", x_test.shape)

    # NOTE(review): ``train_labels`` is not a parameter of this function; it is
    # resolved from the enclosing module at call time. ``y_train`` was not
    # substituted because it is fed to a categorical_crossentropy loss and is
    # presumably one-hot encoded - confirm and pass the raw labels explicitly.
    ratio = utils.get_classes_ratio(train_labels)

    # Define the embedding layer (which will be the same for all the models)
    embedding_layer = build_embedding_layer(word_to_index, emb_type, embedding_dim,
                                            max_tweet_length, trainable)

    # The default of None previously failed with a TypeError in the loop below.
    if dnn_models is None:
        dnn_models = []

    # Build the model
    for dnn_model in dnn_models:
        start = time.time()
        model_dir = path + '/models/dnn_models/'
        json_name = model_dir + 'model_json/' + dnn_model.lower() + '_model.json'
        best_weights_name = model_dir + 'best/' + dnn_model.lower() + '_model.json.hdf5'

        # Build the deep neural network architecture
        utils.print_model_title(dnn_model)
        model = build_model(max_tweet_length, embedding_layer, hidden_units, dropout,
                            dnn_architecture=dnn_options(dnn_model))

        # Compile the model
        my_optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.99, decay=0.01)
        model.compile(loss='categorical_crossentropy', optimizer=my_optimizer,
                      metrics=['categorical_accuracy', utils.f1_score])

        # Print the model summary
        print(model.summary())

        if plot:
            # Save an image of the current architecture
            plot_model(model,
                       to_file=model_dir + dnn_model.lower() + '_model_summary.png',
                       show_shapes=True, show_layer_names=True)

        # Save the json representation of the model. This happens regardless
        # of ``plot`` because the file is read back after training; the
        # context manager makes sure the handle is flushed and closed first.
        with open(json_name, 'w') as json_file:
            json_file.write(model.to_json())

        # Prepare the callbacks
        save_best = ModelCheckpoint(monitor='val_categorical_accuracy',
                                    save_best_only=True, mode='auto',
                                    filepath=best_weights_name)
        reduceLR = ReduceLROnPlateau(monitor='val_categorical_accuracy',
                                     factor=0.1, patience=3, verbose=1)
        early_stopping = EarlyStopping(monitor='val_categorical_accuracy',
                                       patience=20, verbose=1)

        # Fit the model on the training data
        history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                            shuffle=True, class_weight=ratio,
                            callbacks=[save_best, reduceLR, early_stopping],
                            validation_split=0.1, verbose=1)

        if plot:
            utils.plot_training_statistics(history, "/plots/dnn_models/" + dnn_model,
                                           also_plot_validation=False,
                                           acc_mode='categorical_accuracy',
                                           loss_mode='loss')

        # Load the best model
        model = utils.load_model(json_name=json_name,
                                 h5_weights_name=best_weights_name)

        # Make prediction and evaluation
        predict(model, x_test, y_test)
        end = time.time()
        print(
            "==================================================================\n"
        )
        print("%s model analysis completion time: %.3f s = %.3f min" %
              (dnn_model, (end - start), (end - start) / 60.0))
        print(
            "==================================================================\n"
        )
# Load the labels (one integer per line of the label files)
labels_prefix = path + "/res/datasets/ghosh/labels_"
y_train = list(map(int, utils.load_file(labels_prefix + train_filename)))
y_test = list(map(int, utils.load_file(labels_prefix + test_filename)))

modes = ['binary', 'count', 'tfidf', 'freq']
results = DataFrame()

# For each selection-mode, make a BoW analysis using both SVMs and a simple feed-forward NN
for mode in modes:
    utils.print_model_title("BoW Analysis for Mode %s" % mode)
    tokenizer, x_train, x_test = utils.encode_text_as_matrix(
        train_tweets, test_tweets, mode, lower=True)

    # Invert the tokenizer vocabulary so feature indices map back to words.
    word_to_indices = tokenizer.word_index
    index_to_word = dict((i, w) for w, i in word_to_indices.items())

    start = time.time()
    run_supervised_learning_models(
        x_train, y_train, x_test, y_test,
        make_feature_analysis=True,
        feature_names=index_to_word,
        top_features=20,
        plot_name="/bow_models/bow_%s_" % mode)
utils.initialize_writer(to_write_filename)

# File-name pieces used to locate the sample data below ``path``.
train_filename = "train_sample.txt"
test_filename = "test_sample.txt"
tokens_filename = "clean_original_"
data_path = path + "/res/tokens/tokens_"
pos_path = path + "/res/pos/pos_"

# Load data tokens and pos tags (train file first, then test file)
train_tokens, test_tokens = (
    utils.load_file(data_path + tokens_filename + name)
    for name in (train_filename, test_filename))
train_pos, test_pos = (
    utils.load_file(pos_path + tokens_filename + name)
    for name in (train_filename, test_filename))

# Load the labels
train_labels = list(map(int, utils.load_file(
    path + "/res/datasets/ghosh/labels_" + train_filename)))
test_labels = list(map(int, utils.load_file(
    path + "/res/datasets/ghosh/labels_" + test_filename)))

# Build one model per feature set, each on the same data.
feature_sets = ['pragmatic', 'sentiment', 'syntactic', 'topic']
for feature_set in feature_sets:
    utils.print_model_title("Current feature: %s" % feature_set)
    build_model(train_tokens, train_pos, train_labels,
                test_tokens, test_pos, test_labels, feature_set)
def linear_svm(x_train, y_train, x_test, y_test, class_ratio='balanced'):
    """Fit an L2 linear SVM (C=0.01) and print test-set statistics.

    The classifier is trained on ``(x_train, y_train)`` with ``class_ratio``
    as its ``class_weight``. Its predictions for ``x_test`` are handed to
    ``utils.print_statistics`` along with ``y_test``. Nothing is returned.
    """
    utils.print_model_title("Linear SVM")
    classifier = LinearSVC(C=0.01, class_weight=class_ratio, penalty='l2')
    classifier.fit(x_train, y_train)
    utils.print_statistics(y_test, classifier.predict(x_test))
# Project root = parent of the current working directory. os.path.dirname is
# used instead of slicing at the last '/', which silently chopped one
# character off the path whenever the cwd contained no forward slash
# (e.g. on Windows, where rfind returns -1).
path = os.path.dirname(os.getcwd())
to_write_filename = path + '/stats/key_features_analysis_rule_based.txt'
utils.initialize_writer(to_write_filename)

train_filename = "train.txt"
test_filename = "test.txt"
tokens_filename = "clean_original_"
data_path = path + "/res/tokens/tokens_"
vocab_filename = path + "/res/vocabulary/vocabulary.txt"

# Load the data
train_tweets = utils.load_file(data_path + tokens_filename + train_filename)
test_tweets = utils.load_file(data_path + tokens_filename + test_filename)

# Load the labels
train_labels = [
    int(l)
    for l in utils.load_file(path + "/res/datasets/ghosh/labels_" + train_filename)
]
test_labels = [
    int(l)
    for l in utils.load_file(path + "/res/datasets/ghosh/labels_" + test_filename)
]

# A rule-based approach used here to analyse the key-features that are
# actually learnt in a (non-)sarcastic context
utils.print_model_title("Rule-based analysis")
rule_based_comparison(train_tweets, train_labels, test_tweets, test_labels,
                      vocab_filename)