def fill_perplexity_columns(self, train_df, valid_df):
    """Add a ``perplexity`` column to both frames and standardize it.

    N-gram counts are built from ``util.perplexity_clean(train_df)``. Each
    essay in the ``util.vectorizer_clean`` copies of the training and
    validation frames is then scored with ``self.perplexity`` and the
    resulting column is copied back onto the frames that were passed in.

    Args:
        train_df: Training frame; must provide an ``essay`` column.
        valid_df: Validation frame; must provide an ``essay`` column.

    Returns:
        Whatever ``util.append_standardized_column`` returns for the
        ``perplexity`` column (callers unpack it as ``train_df, valid_df``).
    """
    print("Creating ngram counts...")
    self.create_counts(util.perplexity_clean(train_df))

    train_clean = util.vectorizer_clean(train_df)
    valid_clean = util.vectorizer_clean(valid_df)

    for label, df in (("Train", train_clean), ("Validation", valid_clean)):
        total = df.shape[0]
        # Rows are addressed by the labels 0..total-1, exactly as before.
        for i in range(total):
            if i % 100 == 0:
                print(label + " essay " + str(i) + " of " + str(total))
            # ``.at`` is the supported scalar accessor; ``get_value`` and
            # ``set_value`` were deprecated and later removed from pandas.
            df.at[i, 'perplexity'] = self.perplexity(df.at[i, 'essay'])

    # Column assignment aligns on the index of the cleaned frames.
    train_df['perplexity'] = train_clean['perplexity']
    valid_df['perplexity'] = valid_clean['perplexity']
    return util.append_standardized_column(train_df, valid_df, 'perplexity')
def fill_sentence_column(train_df, valid_df):
    """Attach a ``sentence_count`` column to both frames and standardize it.

    Every entry of each frame's ``essay`` column is passed to ``sentences``
    and the results are stored, in row order, under ``sentence_count``.

    Args:
        train_df: Training frame with an ``essay`` column.
        valid_df: Validation frame with an ``essay`` column.

    Returns:
        The ``(train_df, valid_df)`` pair produced by
        ``util.append_standardized_column`` for ``sentence_count``.
    """
    # Both lists are computed before either frame is modified.
    train_counts = [sentences(text) for text in train_df['essay']]
    valid_counts = [sentences(text) for text in valid_df['essay']]

    train_df['sentence_count'] = train_counts
    valid_df['sentence_count'] = valid_counts

    std_train, std_valid = util.append_standardized_column(
        train_df, valid_df, 'sentence_count')
    return std_train, std_valid
def fill_total_words_column(train_df, valid_df, train_essays, valid_essays):
    """Add a ``total_words`` column to both frames and standardize it.

    The word count of an essay is the number of whitespace-separated tokens
    returned by ``str.split()``.

    Args:
        train_df: Training frame that receives the new column.
        valid_df: Validation frame that receives the new column.
        train_essays: Iterable of essay strings, one per row of ``train_df``
            and in the same order.
        valid_essays: Iterable of essay strings, one per row of ``valid_df``
            and in the same order.

    Returns:
        The ``(train_df, valid_df)`` pair produced by
        ``util.append_standardized_column`` for ``total_words``.
    """
    # Iterate the essays themselves rather than indexing with range(len()):
    # ``essays[i]`` is a label lookup on a pandas Series and breaks when the
    # index is not exactly 0..n-1, whereas iteration is always positional.
    train_counts = [len(essay.split()) for essay in train_essays]
    valid_counts = [len(essay.split()) for essay in valid_essays]

    train_df["total_words"] = train_counts
    valid_df["total_words"] = valid_counts

    train_df, valid_df = util.append_standardized_column(
        train_df, valid_df, 'total_words')
    return train_df, valid_df
def fill_unique_words_column(train_df, valid_df, train_essays, valid_essays):
    """Add a ``unique_words`` column to both frames and standardize it.

    The stored value is the *number* of distinct whitespace-separated tokens
    in each essay. It is a raw count, not a percentage of the total words.

    Args:
        train_df: Training frame that receives the new column.
        valid_df: Validation frame that receives the new column.
        train_essays: Iterable of essay strings, one per row of ``train_df``
            and in the same order.
        valid_essays: Iterable of essay strings, one per row of ``valid_df``
            and in the same order.

    Returns:
        The ``(train_df, valid_df)`` pair produced by
        ``util.append_standardized_column`` for ``unique_words``.
    """
    # A set yields the same distinct-token count as len(Counter(tokens)).
    # Iterating the essays directly (instead of essays[i]) keeps the walk
    # positional even when a pandas Series with a non-default index is given.
    train_unique = [len(set(essay.split())) for essay in train_essays]
    valid_unique = [len(set(essay.split())) for essay in valid_essays]

    # Add the features to the dataset.
    train_df["unique_words"] = train_unique
    valid_df["unique_words"] = valid_unique

    train_df, valid_df = util.append_standardized_column(
        train_df, valid_df, 'unique_words')
    return train_df, valid_df
def main():
    """Build all essay features, then score linear models per essay set.

    Steps:
      1. Load the training and validation TSV files through ``util``.
      2. Append the score, perplexity, sentence-count, word-count,
         unique-word, spelling, part-of-speech and unigram TFIDF features.
      3. For every essay set from 1 to the largest ``essay_set`` value in the
         training data, fit a linear regression plus a ridge and a lasso
         model for each alpha in 1.0, 0.95, ..., 0.05, and record the
         Spearman correlation between predicted and actual standardized
         validation scores.
      4. Print the three result tables and pickle them to
         ``linreg_scores-01.pickle``, ``ridge_scores-01.pickle`` and
         ``lasso_scores-01.pickle`` in the working directory.
    """
    print("Fetching data...")
    train_df = util.get_training_data('../data/training_set_rel3.tsv')
    valid_df = util.get_validation_data('../data/valid_set.tsv')

    print("Standardizing scores...")
    train_df, valid_df = util.append_standardized_column(
        train_df, valid_df, 'score')

    print("Calculating perplexity feature...")
    train_df, valid_df = Perplexity().fill_perplexity_columns(
        train_df, valid_df)

    print("Calculating number of sentences feature...")
    train_df, valid_df = fill_sentence_column(train_df, valid_df)

    print("Cleaning for spelling and word count...")
    # Cleaned-up essays shared by the word-count and spelling features.
    vectorizer_train_spelling = util.vectorizer_clean_spelling(train_df)
    train_essays_spelling = vectorizer_train_spelling['essay'].values
    vectorizer_valid_spelling = util.vectorizer_clean_spelling(valid_df)
    valid_essays_spelling = vectorizer_valid_spelling['essay'].values

    print("Calculating total words feature...")
    train_df, valid_df = fill_total_words_column(
        train_df, valid_df, train_essays_spelling, valid_essays_spelling)

    print("Calculating unique words feature...")
    train_df, valid_df = fill_unique_words_column(
        train_df, valid_df, train_essays_spelling, valid_essays_spelling)

    print("Calculating spelling feature...")
    train_df, valid_df = fill_spelling_column(
        train_df, valid_df, train_essays_spelling, valid_essays_spelling)

    print("Calculating pos tags features...")
    train_df, valid_df = fill_pos_columns(train_df, valid_df)

    print("Cleaning for TFIDF...")
    # Cleaned-up essays for the TFIDF vector feature.
    vectorizer_train = util.vectorizer_clean(train_df)
    train_essays = vectorizer_train['essay'].values
    vectorizer_valid = util.vectorizer_clean(valid_df)
    valid_essays = vectorizer_valid['essay'].values

    print("Calculating TFIDF features with unigram...")
    train_df, valid_df = fill_tfidf_column(train_df, valid_df, train_essays,
                                           valid_essays, 1)
    # NOTE: trigram TFIDF features are disabled; to enable them, call
    # fill_tfidf_column(train_df, valid_df, train_essays, valid_essays, 3).

    print(train_df.head())
    print(valid_df.head())

    # Each feature is listed exactly once: repeating a name here would put
    # the same column into the design matrix twice.
    COLS = [
        'essay_set', 'spelling_correct', 'std_sentence_count',
        'std_unique_words', 'std_total_words', 'ADJ', 'ADP', 'ADV', 'CONJ',
        'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB', '.', 'X',
        'std_perplexity', 'std_score'
    ]

    train_df = train_df[COLS].join(train_df.filter(regex=("tfidf_*")))
    valid_df = valid_df[COLS].join(valid_df.filter(regex=("tfidf_*")))

    print(train_df.shape)
    print(valid_df.shape)

    max_essay_set = max(train_df['essay_set'])

    # Result rows are collected in plain lists and turned into frames once at
    # the end; growing a DataFrame row by row copies it on every append.
    linreg_rows = []
    ridge_rows = []
    lasso_rows = []

    alphas = [x * 1.0 / 20 for x in range(20, 0, -1)]

    for i in range(1, max_essay_set + 1):
        print("")

        # Rows of the current essay set, filtered once and reused below.
        train_subset = train_df[train_df['essay_set'] == i]
        valid_subset = valid_df[valid_df['essay_set'] == i]

        train_x = np.asarray(
            train_subset.drop(['essay_set', 'std_score'], axis=1))
        # NOTE(review): the "|S6" round trip appears to keep only the first
        # six characters of each score's text form before converting back to
        # float; confirm that this truncation is intended.
        train_std_scores = np.asarray(
            train_subset['std_score'], dtype="|S6").astype(float)

        valid_x = np.asarray(
            valid_subset.drop(['essay_set', 'std_score'], axis=1))
        valid_std_scores = valid_subset["std_score"]

        regr = LinReg(fit_intercept=False, copy_X=False)
        regr.fit(train_x, train_std_scores)
        valid_pred_std_scores = regr.predict(valid_x)

        linreg_spear, p = Spearman(a=valid_std_scores,
                                   b=valid_pred_std_scores)
        linreg_rows.append({
            'essay_set': i,
            'p': p,
            'spearman': linreg_spear
        })
        print("Linear for Essay Set " + str(i) + ": " + str(linreg_spear))

        for a in alphas:
            ridge = linear_model.Ridge(alpha=a)
            ridge.fit(train_x, train_std_scores)
            valid_pred_std_scores_ridge = ridge.predict(valid_x)

            ridge_spear, p = Spearman(a=valid_std_scores,
                                      b=valid_pred_std_scores_ridge)
            ridge_rows.append({
                'essay_set': i,
                'alpha': a,
                'p': p,
                'spearman': ridge_spear
            })
            print("Alpha = " + str(a) + " Ridge for Essay Set " + str(i) +
                  ": " + str(ridge_spear))

            lasso = linear_model.Lasso(alpha=a)
            lasso.fit(train_x, train_std_scores)
            valid_pred_std_scores_lasso = lasso.predict(valid_x)

            lasso_spear, p = Spearman(a=valid_std_scores,
                                      b=valid_pred_std_scores_lasso)
            lasso_rows.append({
                'essay_set': i,
                'alpha': a,
                'p': p,
                'spearman': lasso_spear
            })
            print("Alpha = " + str(a) + " Lasso for Essay Set " + str(i) +
                  ": " + str(lasso_spear))

    linreg_scores_df = pd.DataFrame(
        linreg_rows, columns=['essay_set', 'p', 'spearman'])
    ridge_scores_df = pd.DataFrame(
        ridge_rows, columns=['essay_set', 'alpha', 'p', 'spearman'])
    lasso_scores_df = pd.DataFrame(
        lasso_rows, columns=['essay_set', 'alpha', 'p', 'spearman'])

    print(linreg_scores_df)
    print(ridge_scores_df)
    print(lasso_scores_df)

    linreg_scores_df.to_pickle('linreg_scores-01.pickle')
    ridge_scores_df.to_pickle('ridge_scores-01.pickle')
    lasso_scores_df.to_pickle('lasso_scores-01.pickle')
from sklearn.linear_model import LogisticRegression as LogReg from sklearn.linear_model import LogisticRegressionCV as LogRegCV from sklearn.linear_model import LinearRegression as LinReg from scipy.stats import spearmanr as Spearman from sklearn import linear_model import pickle fig, ((ax1, ax2, ax5), (ax3, ax4, ax6)) = plt.subplots(2, 3) axes = [ax1, ax2, ax3, ax4, ax5, ax6] print("Fetching data...") train_df = util.get_training_data('../data/training_set_rel3.tsv') valid_df = util.get_validation_data('../data/valid_set.tsv') print("Standardizing scores...") train_df, valid_df = util.append_standardized_column(train_df, valid_df, 'score') print("Calculating perplexity feature...") train_df, valid_df = Perplexity().fill_perplexity_columns(train_df, valid_df) print("Calculating number of sentences feature...") train_df, valid_df = fill_sentence_column(train_df, valid_df) print("Cleaning for spelling and word count...") # cleaned up data for spelling feature vectorizer_train_spelling = util.vectorizer_clean_spelling(train_df) train_essays_spelling = vectorizer_train_spelling['essay'].values vectorizer_valid_spelling = util.vectorizer_clean_spelling(valid_df) valid_essays_spelling = vectorizer_valid_spelling['essay'].values print("Calculating total words feature...")