import json


def compute_features_vector(file='FEATURES_test.json', answers=question.get_question(0)):
    """Compute the similarity feature vector for each answer and dump all vectors to a JSON file."""
    # NOTE: `data` is expected to be a module-level dict of the form {'data': []}.
    for index, answer in enumerate(answers):
        score = answer['score_1']
        text = answer['text']
        text = remove_stops(tokenizer(text))
        len1 = len(text)
        len2 = len(tokenizer(text))

        feature = feature_extraction(text)
        cosin = feature.cosine_similarity()
        keywords = feature.keywords_similarity()
        _, _, _, LSA = feature.lsa_similarity()
        part_words, _ = feature.partial_words_similarity()
        language = feature.language_similirity()
        lda = feature.lda_similarity()
        al1, al2 = feature.align_similarity()
        c1, c2 = feature.corpus_similarity()
        lda_ext = feature.lda_extract()
        cos_sc = feature.cos_score()
        bingo_sc = feature.bingo_score()
        jacc = feature.jaccard()
        dic_sim = feature.dice()
        keys_norm = feature.keywords_norm()
        lsi_query = feature.LSI_query()
        bleu_score = feature.bleuscore()
        keyf = feature.keyf_score()
        fgram = feature.fgram_score()
        holo = feature.holo_score()

        info = [cosin, keywords, LSA, part_words, language, lda, al1, c1,
                lda_ext, cos_sc, bingo_sc, jacc, dic_sim, keys_norm, lsi_query,
                bleu_score, keyf, fgram, holo, score]
        print(info)
        info = [float(x) for x in info]
        data['data'].append(info)

        # checkpoint: rewrite the file after each answer so progress is not lost
        with open(file, 'w') as f:
            json.dump(data, f, indent=2)
        print("Index = ", index)

    # write the final feature vectors to file
    with open(file, 'w') as f:
        json.dump(data, f, indent=2)
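# For reference, a minimal sketch (not part of the project code) of how the dumped JSON
# could be loaded back for model training. It assumes `data` was the module-level dict
# {'data': []} used above, so each row is a list of floats whose last element is the
# human score; load_feature_vectors is a hypothetical helper name.
import json

import numpy as np


def load_feature_vectors(path='FEATURES_test.json'):
    """Load the dumped feature vectors and split them into features X and scores y."""
    with open(path) as f:
        dumped = json.load(f)
    rows = np.array(dumped['data'], dtype=float)
    X = rows[:, :-1]   # the similarity features
    y = rows[:, -1]    # the score stored as the last element of each row
    return X, y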
def main():
    """
    The main function.

    Arguments:
        1. Takes no arguments.
    """
    train_data = data.load_training_data()    # load the training data
    test_data = data.load_test_data()         # load the test data

    count = CountVectorizer()                 # initialize the count vectorizer
    tfidf_transformer = TfidfTransformer()    # initialize a TF-IDF transformer
    models_dict = {}                          # dictionary of fitted models, keyed by name

    # extract TF-IDF features for the training data
    train_tfidf = features.feature_extraction(train_data, count, tfidf_transformer)

    bayes = naive_bayes(train_data, train_tfidf)                        # fit the Naive Bayes classifier
    models_dict['Naive Bayes'] = bayes

    svm = svm_classifier(train_data, train_tfidf)                       # fit the SVM classifier
    models_dict['SVM'] = svm

    rand_forest = random_forest_classifier(train_data, train_tfidf)     # fit the random forest classifier
    models_dict['Random Forest'] = rand_forest

    logistic = logistic_regression_classifier(train_data, train_tfidf)  # fit the logistic regression classifier
    models_dict['Logistic Regression'] = logistic

    decision_tree = decision_tree_classifier(train_data, train_tfidf)   # fit the decision tree classifier
    models_dict['Decision Tree'] = decision_tree

    predict_test_data(train_data, test_data, models_dict, count, tfidf_transformer, train_tfidf)
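# The classifier builders above (naive_bayes, svm_classifier, random_forest_classifier,
# logistic_regression_classifier, decision_tree_classifier) are defined elsewhere in the
# project. As a minimal sketch of the pattern, assuming each wrapper simply fits an
# sklearn estimator on the TF-IDF matrix and the target labels (an illustration only,
# not necessarily the project's exact implementation):
from sklearn.naive_bayes import MultinomialNB


def naive_bayes(train_data, train_tfidf):
    """Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix."""
    clf = MultinomialNB()
    clf.fit(train_tfidf, train_data.target)  # train_data.target holds the class labels
    return clf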
def predict_test_data(train_data, test_data, models_dict, count, tfidf_transformer, train_tfidf):
    """
    Predict the class labels for the test data with each of the fitted models.

    Arguments:
        1. train_data: train data with features and class labels (exposes a .target attribute).
        2. test_data: test data with features and class labels (exposes a .target attribute).
        3. models_dict: dictionary mapping each model name to its fitted classifier.
        4. count: the fitted CountVectorizer.
        5. tfidf_transformer: the fitted TfidfTransformer.
        6. train_tfidf: TF-IDF matrix of the training data.
    """
    for model in models_dict:  # iterate through each model
        print(model)
        # TF-IDF features for the test data
        test_tfidf = features.feature_extraction(test_data, count, tfidf_transformer, data='test')
        # predicted labels for the test data
        predicted = models_dict[model].predict(test_tfidf)
        # 10-fold cross-validated predictions on the training data
        predicted_cv = cross_validation.cross_val_predict(models_dict[model], train_tfidf,
                                                          train_data.target, cv=10)
        accuracy_cv = np.mean(predicted_cv == train_data.target) * 100  # cross-validation accuracy
        accuracy = np.mean(predicted == test_data.target) * 100         # test-set accuracy
        print_results(test_data, predicted, accuracy, model)            # report test-set results
        print_results(train_data, predicted_cv, accuracy_cv, model + ' with cv')  # report CV results
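# print_results is defined elsewhere in the project. A minimal sketch of what such a
# reporting helper could look like, assuming it prints the accuracy together with a
# per-class breakdown from sklearn.metrics (an illustration only, not the project's
# actual implementation):
from sklearn import metrics


def print_results(data, predicted, accuracy, model_name):
    """Print the accuracy and a per-class report for one model's predictions."""
    print("%s accuracy: %.2f%%" % (model_name, accuracy))
    print(metrics.classification_report(data.target, predicted))
    print(metrics.confusion_matrix(data.target, predicted))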
# Signal conditioning filter design
#########################################
if silence == 0:
    print("Designing filter with %d taps" % numtaps)

# Set filter specs
lpf = remez(numtaps=numtaps, bands=cutoff, desired=[1.0, 0.0], Hz=fs)

#########################################
# Data filtering and feature extraction for the training data
#########################################
if silence == 0:
    print("Passing the dataset through the filter")
filtered_train, valid_labels_train = utils.data_filtering(X_train_raw, y_train_raw, lpf)

if silence == 0:
    print("Calculating features...")
if silence == 0:
    print(features)
X_train, y_train = fe.feature_extraction(features, filtered_train, valid_labels_train,
                                          fs, window)

#########################################
# Overrides for handcrafted test data, for demo
#########################################
if cheat_test == 1:
    print("Handcrafting test set, for demonstration. You're cheating")
    np.random.seed(1)  # for reproducibility of results
    X_test_raw = np.concatenate((np.random.random(n_handcraft) * 3 - 3,
                                 np.random.random(n_handcraft) * 500 - 500))
    X_test_raw = X_test_raw.reshape((len(X_test_raw), 1))
    y_test_raw = np.concatenate((np.zeros((n_handcraft, 1)),
                                 np.ones((n_handcraft, 1))))
    print("Overriding window size and sampling frequency")
    window = 32
    fs = 32
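# utils.data_filtering is part of the project. A minimal sketch of the filtering step it
# presumably performs, assuming the raw samples are simply passed through the FIR
# low-pass filter designed above (the label handling in the real helper may differ):
from scipy.signal import lfilter


def data_filtering(X_raw, y_raw, lpf):
    """Apply the FIR low-pass filter to the raw samples, column by column."""
    # lfilter(b, a, x): the filter is FIR, so the denominator polynomial is just 1.0
    X_filtered = lfilter(lpf, 1.0, X_raw, axis=0)
    return X_filtered, y_raw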
import numpy as np

from features import feature_extraction

dataObj = feature_extraction()
train, validation, test = dataObj.process_text(dataObj.data)

# calculate the most common words in the data set;
# this will output a text file (common_words.txt)
dataObj.calc_common_words(dataObj.data)

# calculate the frequency of common-word occurrences per comment:
# each comment now carries the frequency of all the most common words in the corpus
common_words = dataObj.calc_freq_words(dataObj.data)  # common_words is an array

print("done")
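# calc_common_words is a method of the project's feature_extraction class. A minimal
# standalone sketch of the same idea, assuming the data is an iterable of comment
# strings; the helper name, the top_n value and the whitespace tokenization are
# illustrative assumptions, not the project's actual implementation:
from collections import Counter


def calc_common_words(comments, top_n=100, out_file='common_words.txt'):
    """Count word frequencies over all comments and write the most common ones to a file."""
    counts = Counter(word for comment in comments for word in comment.lower().split())
    with open(out_file, 'w') as f:
        for word, freq in counts.most_common(top_n):
            f.write("%s %d\n" % (word, freq))
    return counts.most_common(top_n)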