def regressTest(original, target, test, numTargetCols):
    """Fit a linear model on (original, target), then predict over `test`.

    Returns the predictions reshaped to (len(test), numTargetCols).
    """
    frame_count = len(test)
    model = LinReg(fit_intercept=True)
    model.fit(original, target)
    predictions = model.predict(test)
    return predictions.reshape(frame_count, numTargetCols)
def linear_fit_and_score(essay_set, vectorizer, name):
    """Fit a linear model on one essay set and print its Spearman score.

    Relies on module-level globals: `vectorizer_train`, `vectorizer_valid`,
    `valid_df`, `np`, `LinReg`, `Spearman`.
    Python-2 code (`print` statements).
    """
    # Get all the text from data for this essay set
    train_essays = (vectorizer_train[vectorizer_train['essay_set'] ==
                                     essay_set])['essay'].values
    # Turn each text into an array of word counts
    train_vectors = vectorizer.fit_transform(train_essays).toarray()
    # Standardizing for y: scores round-trip through a byte-string dtype
    # before becoming floats.
    # NOTE(review): np.float is removed in NumPy >= 1.24; builtin float is
    # the equivalent — confirm the NumPy version pinned for this project.
    train_std_scores = np.asarray(
        (vectorizer_train[vectorizer_train['essay_set'] == essay_set]
         )['std_score'],
        dtype="|S6").astype(np.float)
    # print "\nStandardized Train Scores", train_std_scores[:5]

    ##############
    #   Linear   #
    ##############

    # Linear Model (no intercept; features already standardized upstream —
    # presumably, TODO confirm)
    regr = LinReg(fit_intercept=False, copy_X=False)
    regr.fit(train_vectors, train_std_scores)

    # Vectorize the validation essays with the vocabulary fit on train.
    valid_vectors = vectorizer.transform(
        (vectorizer_valid[vectorizer_valid['essay_set'] == essay_set]
         )['essay'].values).toarray()

    # My guess is we will want to denormalize these scores for quadratic weighted k
    valid_pred_std_scores = regr.predict(valid_vectors)

    # NOTE(review): compares predictions against raw "score", not "std_score" —
    # rank correlation is unaffected by the standardization, so this is OK.
    print "Linear for Essay Set " + str(
        essay_set) + " with " + name + ":", Spearman(
            a=(valid_df[valid_df['essay_set'] == essay_set])["score"],
            b=valid_pred_std_scores)
    print "\n"
def main():
    """Predict per-month temperatures for 2018 at every (Long, Lat) site.

    Reads the compiled Mexico temperature CSV, fits one linear regression
    per month (year -> monthly mean temperature) per site, and writes the
    2018 predictions to resultados/predicciones_temp.csv.

    Fixes vs. the original:
      * "Dic" was never predicted (reg_12 was fitted but its prediction was
        never appended), leaving the December column empty — the month loop
        below treats all 12 months uniformly.
      * predict() now receives a 2-D (1, 1) sample instead of a bare int,
        which modern scikit-learn rejects; the scalar prediction is stored
        so the CSV holds numbers, not 1-element arrays.
    """
    month_cols = ["Ene", "Feb", "Mar", "Abr", "May", "Jun",
                  "Jul", "Ago", "Sept", "Oct", "Nov", "Dic"]
    pred_year = 2018

    # Read the compiled CSV and sort chronologically.
    df = pd.read_csv("resultados/compilado_mexico_temp_R.csv")
    df.sort_values(by="Año", inplace=True)

    lons = np.array(df["Long"])
    lats = np.array(df["Lat"])

    # Output frame: one row per processed site.
    new_df = pd.DataFrame(columns=["Long", "Lat", "Año"] + month_cols)

    contador = 0
    # NOTE(review): zip(lons, lats) visits every source ROW, so duplicated
    # coordinates are re-fitted and re-appended (same as the original code);
    # drop duplicates upstream if one row per site is wanted.
    for lon, lat in zip(lons, lats):
        # All observations for this site, averaged per year.
        site = df.loc[(df["Long"] == lon) & (df["Lat"] == lat)]
        grouped = site.groupby("Año").mean()
        X = grouped.index.values.reshape(-1, 1)  # years as (n_samples, 1)

        print("Long: {} Lat: {} Pred: {}".format(lon, lat, pred_year))

        row = {"Long": lon, "Lat": lat, "Año": pred_year}
        for month in month_cols:
            reg = LinReg()
            reg.fit(X, grouped[month].values)
            # 2-D sample matrix required by predict(); keep the scalar.
            row[month] = float(reg.predict([[pred_year]])[0])

        contador += 1
        print(contador)
        # NOTE: DataFrame.append is deprecated in pandas >= 1.4 (removed in
        # 2.0); kept for compatibility with the pandas version this file uses.
        new_df = new_df.append(row, ignore_index=True)

    new_df.to_csv("resultados/predicciones_temp.csv")
def main():
    """End-to-end essay-scoring pipeline (Python 2).

    Loads train/validation essays, builds standardized score plus a battery
    of features (perplexity, sentence count, word counts, spelling, POS
    tags, TF-IDF), then fits plain linear regression plus Ridge/Lasso over
    a grid of alphas per essay set, printing and pickling Spearman scores.

    Relies on module-level helpers/imports: `util`, `Perplexity`,
    `fill_*_column` functions, `LinReg`, `linear_model`, `Spearman`,
    `np`, `pd`.
    """
    print "Fetching data..."
    train_df = util.get_training_data('../data/training_set_rel3.tsv')
    valid_df = util.get_validation_data('../data/valid_set.tsv')

    print "Standardizing scores..."
    train_df, valid_df = util.append_standardized_column(
        train_df, valid_df, 'score')

    print "Calculating perplexity feature..."
    train_df, valid_df = Perplexity().fill_perplexity_columns(
        train_df, valid_df)

    print "Calculating number of sentences feature..."
    train_df, valid_df = fill_sentence_column(train_df, valid_df)

    print "Cleaning for spelling and word count..."
    # cleaned up data for spelling feature
    vectorizer_train_spelling = util.vectorizer_clean_spelling(train_df)
    train_essays_spelling = vectorizer_train_spelling['essay'].values
    vectorizer_valid_spelling = util.vectorizer_clean_spelling(valid_df)
    valid_essays_spelling = vectorizer_valid_spelling['essay'].values

    print "Calculating total words feature..."
    train_df, valid_df = fill_total_words_column(train_df, valid_df,
                                                 train_essays_spelling,
                                                 valid_essays_spelling)

    print "Calculating unique words feature..."
    train_df, valid_df = fill_unique_words_column(train_df, valid_df,
                                                  train_essays_spelling,
                                                  valid_essays_spelling)

    print "Calculating spelling feature..."
    # spelling feature
    train_df, valid_df = fill_spelling_column(train_df, valid_df,
                                              train_essays_spelling,
                                              valid_essays_spelling)

    print "Calculating pos tags features..."
    train_df, valid_df = fill_pos_columns(train_df, valid_df)

    print "Cleaning for TFIDF..."
    # cleaned up data for tfidf vector feature
    vectorizer_train = util.vectorizer_clean(train_df)
    train_essays = vectorizer_train['essay'].values
    vectorizer_valid = util.vectorizer_clean(valid_df)
    valid_essays = vectorizer_valid['essay'].values

    print "Calculating TFIDF features with unigram..."
    train_df, valid_df = fill_tfidf_column(train_df, valid_df, train_essays,
                                           valid_essays, 1)

    #print "Calculating TFIDF features with trigram..."
    #train_df, valid_df = fill_tfidf_column(train_df, valid_df, train_essays, valid_essays, 3)

    print train_df.head()
    print valid_df.head()

    # Feature columns kept for modeling.
    # NOTE(review): 'std_unique_words' appears TWICE in this list, so the
    # feature is duplicated in the design matrix — likely unintended.
    COLS = [
        'essay_set', 'spelling_correct', 'std_sentence_count',
        'std_unique_words', 'std_total_words', 'std_unique_words', 'ADJ',
        'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB',
        '.', 'X', 'std_perplexity', 'std_score'
    ]
    train_df = train_df[COLS].join(train_df.filter(regex=("tfidf_*")))
    valid_df = valid_df[COLS].join(valid_df.filter(regex=("tfidf_*")))

    print train_df.shape
    print valid_df.shape

    max_essay_set = max(train_df['essay_set'])
    linreg_scores_df = pd.DataFrame(columns=['essay_set', 'p', 'spearman'])
    lasso_scores_df = pd.DataFrame(
        columns=['essay_set', 'alpha', 'p', 'spearman'])
    ridge_scores_df = pd.DataFrame(
        columns=['essay_set', 'alpha', 'p', 'spearman'])
    # Alpha grid 1.0 down to 0.05 in steps of 0.05.
    alphas = [x * 1.0 / 20 for x in range(20, 0, -1)]

    for i in range(1, max_essay_set + 1):
        print ""
        # Design matrix / targets for this essay set.
        train_x = np.asarray((train_df[train_df['essay_set'] == i]).drop(
            ['essay_set', 'std_score'], axis=1))
        # NOTE(review): np.float is removed in NumPy >= 1.24; builtin float
        # is equivalent — confirm the pinned NumPy version.
        train_std_scores = np.asarray(
            (train_df[train_df['essay_set'] == i])['std_score'],
            dtype="|S6").astype(np.float)

        # Plain linear regression baseline.
        regr = LinReg(fit_intercept=False, copy_X=False)
        regr.fit(train_x, train_std_scores)

        valid_x = np.asarray((valid_df[valid_df['essay_set'] == i]).drop(
            ['essay_set', 'std_score'], axis=1))
        valid_pred_std_scores = regr.predict(valid_x)

        linreg_spear, p = Spearman(
            a=(valid_df[valid_df['essay_set'] == i])["std_score"],
            b=valid_pred_std_scores)
        linreg_scores_df = linreg_scores_df.append(
            {
                'essay_set': i,
                'p': p,
                'spearman': linreg_spear
            }, ignore_index=True)
        print "Linear for Essay Set " + str(i) + ":", linreg_spear

        # Regularized variants over the alpha grid.
        for a in alphas:
            ridge = linear_model.Ridge(alpha=a)
            ridge.fit(train_x, train_std_scores)
            valid_pred_std_scores_ridge = ridge.predict(valid_x)
            ridge_spear, p = Spearman(
                a=(valid_df[valid_df['essay_set'] == i])["std_score"],
                b=valid_pred_std_scores_ridge)
            ridge_scores_df = ridge_scores_df.append(
                {
                    'essay_set': i,
                    'alpha': a,
                    'p': p,
                    'spearman': ridge_spear
                }, ignore_index=True)
            print "Alpha = " + str(a) + " Ridge for Essay Set " + str(
                i) + ":", ridge_spear

            lasso = linear_model.Lasso(alpha=a)
            lasso.fit(train_x, train_std_scores)
            valid_pred_std_scores_lasso = lasso.predict(valid_x)
            lasso_spear, p = Spearman(
                a=(valid_df[valid_df['essay_set'] == i])["std_score"],
                b=valid_pred_std_scores_lasso)
            lasso_scores_df = lasso_scores_df.append(
                {
                    'essay_set': i,
                    'alpha': a,
                    'p': p,
                    'spearman': lasso_spear
                }, ignore_index=True)
            # NOTE(review): missing space before "Lasso" in this message.
            print "Alpha = " + str(a) + "Lasso for Essay Set " + str(
                i) + ":", lasso_spear

    print linreg_scores_df
    print ridge_scores_df
    print lasso_scores_df

    # Persist the score tables for later analysis.
    linreg_scores_df.to_pickle('linreg_scores-01.pickle')
    ridge_scores_df.to_pickle('ridge_scores-01.pickle')
    lasso_scores_df.to_pickle('lasso_scores-01.pickle')
# NOTE(review): exploratory plotting / NaN-ffill experiments were commented
# out here; LandAverageTemperature NaNs (e.g. year 1752) are currently left
# untouched before the yearly groupby-mean below — confirm that is intended.

# Model: regress yearly average land temperature on the year.
df = pd.read_csv("E:/OneDrive/Documents/python/test/GlobalTemperatures.csv")
# First two columns positionally ('dt' and the temperature column, per the
# usage below). `.ix` was removed from pandas; `.iloc` is the positional
# equivalent.
df = df.iloc[:, :2]
times = pd.DatetimeIndex(df["dt"])
grouped = df.groupby([times.year]).mean()

x = grouped.index.values.reshape(-1, 1)  # years as (n_samples, 1)
y = grouped["LandAverageTemperature"].values

reg = LinReg()
reg.fit(x, y)
y_preds = reg.predict(x)

# score() returns R^2 (coefficient of determination), not classification
# accuracy — the label below is kept for output compatibility.
print('Accuracy: {}'.format(reg.score(x, y)))

plt.figure(figsize=(15, 5))
plt.title('Linear Regression')
plt.scatter(x=x, y=y_preds)
plt.scatter(x=x, y=y, c='r')
plt.show()

# predict() requires a 2-D (n_samples, n_features) array; the original
# passed the bare scalar 2050, which modern scikit-learn rejects.
print('Predicted temperature in 2050 will be {}'.format(reg.predict([[2050]])[0]))
#COLS = ['std_sentence_count', 'essay_set', 'std_score'] #train_df = train_df[COLS].join(train_df.filter(regex=("tfidf_*"))) #valid_df = valid_df[COLS].join(valid_df.filter(regex=("tfidf_*"))) for i in range(1, max_essay_set + 1): #vectorizer_train = util.vectorizer_clean(train_df) train_x = np.asarray( (train_df[train_df['essay_set'] == i]).drop('std_score', axis=1)) #train_x = np.asarray((train_df[train_df['essay_set'] == i])[['std_sentence_count']]) train_std_scores = np.asarray( (train_df[train_df['essay_set'] == i])['std_score'], dtype="|S6").astype(np.float) regr = LinReg(fit_intercept=False, copy_X=False) regr.fit(train_x, train_std_scores) valid_x = np.asarray( (valid_df[valid_df['essay_set'] == i]).drop('std_score', axis=1)) #valid_x = np.asarray((valid_df[valid_df['essay_set'] == i])[['std_sentence_count']]) valid_pred_std_scores = regr.predict(valid_x) #print "Linear for Essay Set "+str(i)+":", Spearman(a = (valid_df[valid_df['essay_set'] == i])["std_score"], b = valid_pred_std_scores) #print "\n" alpha = [x * 1.0 / 20 for x in range(21)] ridge_scores = [] lasso_scores = [] for a in alpha: ridge = linear_model.Ridge(alpha=a)
def main():
    """Read and display CSV data from two existing files: population and temperature.

    Prompts for a country, plots its population and yearly average
    temperature over time, runs a logistic regression of temperature
    increase on population change, and fits linear regressions on both
    series. Relies on module-level `displayheader`, `pd`, `plt`, `sm`,
    `LinReg`.
    """
    """https://www.kaggle.com/brajput24/world-population"""
    dfp = pd.read_csv('population.csv')
    displayheader('population.csv')
    dfp.index = dfp['Year']
    dft = pd.read_csv('GlobalLandTemperaturesByCountry.csv')
    displayheader('GlobalLandTemperaturesByCountry.csv')
    """ Request user input a country name to investigate"""
    cn = input("Enter country: ")
    # NOTE(review): on empty input this only prints a message and falls
    # through — cn.title() on "" then yields empty filters below.
    if not cn:
        print("Must provide a country name.")
    cn = cn.title()
    """ Use the country user inputed as a filter for both population and temperature dataset."""
    maskp = dfp['Country'].isin([cn])
    """ Only show Value which is population qty."""
    dfp = dfp[maskp].filter(items=['Value'])
    print(dfp)
    print('\n')
    """Plot the population vs time gragh."""
    plt.figure()
    plt.plot(dfp)
    plt.title('Population vs Time')
    plt.xlabel('Year')
    plt.ylabel('Value')
    """ Display population data info."""
    showp = dfp.describe()
    print(showp)
    print('\n')
    """To examine two factors:p1-quantity increment of population; p2-ratio
    of population to last year's impacts on temperature change."""
    pl = []
    indarray = []
    datap1 = []
    datap2 = []
    """Store all the population data of each year in list pl."""
    for index, row in dfp.iterrows():
        pl.append(row['Value'])
    # NOTE(review): range(51) hardcodes 52 yearly samples (1960-2012);
    # breaks if the dataset's year span changes.
    for i in range(51):
        p1 = pl[i + 1] - pl[i]   # absolute yearly increment
        p2 = pl[i + 1] / pl[i]   # yearly growth ratio
        datap1.append(p1)
        datap2.append(p2)
        indarray.append(i + 1960)
    """create two series based on two factors p1,p2."""
    s1_p = pd.Series(data=datap1, index=indarray)
    s2_p = pd.Series(data=datap2, index=indarray)
    """ Use the country user inputed as a filter for both population and temperature dataset."""
    maskt = dft['Country'].isin([cn])
    """Only show Average Temp."""
    dft = dft[maskt].filter(items=['AverageTemperature', 'dt'])
    """group months data into year and take mean of it as inspired by wk11
    seminar Data Science with Python."""
    times = pd.DatetimeIndex(dft['dt'])
    dft = dft.fillna(method='ffill')  # forward-fill missing temperatures
    group = dft.groupby([times.year]).mean()
    dft = group['AverageTemperature']
    # Restrict to 1960-2012 to match the population window.
    dft = dft[dft.index > 1959]
    dft = dft[dft.index < 2013]
    print(dft)
    print('\n')
    """Plot the temperature vs time gragh."""
    plt.figure()
    plt.plot(dft)
    plt.title('AvgTemp vs Time')
    plt.xlabel('Time')
    plt.ylabel('Temp')
    showt = dft.describe()
    print(showt)
    tl = []
    indarray = []
    datat = []
    logistic_judge = []
    """Store all the temperature data of each year in list pl."""
    # dft is a Series here, so iteritems() yields (year, temperature).
    for index, row in dft.iteritems():
        tl.append(row)
    for i in range(51):
        ti = tl[i + 1] - tl[i]  # year-over-year temperature change
        datat.append(ti)
        # Binary label: 1 if temperature rose that year, else 0.
        if ti > 0:
            logistic_judge.append(1)
        else:
            logistic_judge.append(0)
    """create index for judge series to use."""
    array = []
    for i in range(51):
        array.append(i + 1960)
    s_lrj = pd.Series(data=logistic_judge, index=array)
    """Combine two factors p1 and p2 and judge series into a dataframe lrdf."""
    lrdf = {
        'Population inc': s1_p,
        'Population ratio': s2_p,
        'Temp increased': s_lrj
    }
    lrdf = pd.DataFrame(lrdf)
    print('\n')
    '''Logistic regression analysis of dataframe lrdf-
    http://www.powerxing.com/logistic-regression-in-python/'''
    print(lrdf.head(8))
    print(lrdf.describe())
    print(
        pd.crosstab(lrdf['Temp increased'],
                    lrdf['Population inc'],
                    rownames=['Temp increased']))
    lrdf.hist()
    plt.show()
    lrdf['intercept'] = 1.0
    # NOTE(review): columns[:-2] drops BOTH 'Temp increased' and the just-
    # added 'intercept', so the Logit fit below has no intercept despite the
    # line above — confirm whether that is intended.
    train_c = lrdf.columns[:-2]
    logit = sm.Logit(lrdf['Temp increased'], lrdf[train_c])
    result = logit.fit()
    print(result.summary())
    print('\n')
    """ LinearRegression analysis of population and temperature
    http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html"""
    x = dfp.index.values.reshape(-1, 1)
    y = dfp.values
    lreg = LinReg()
    lreg.fit(x, y)
    y_pred = lreg.predict(x)
    plt.figure()
    plt.title("Population Linear Regression")
    plt.scatter(x=x, y=y_pred)
    plt.scatter(x=x, y=y, c='r')
    # NOTE: score() is R^2, not classification accuracy.
    print("Accuracy_Population: " + str(lreg.score(x, y)))
    print('\n')
    xt = dft.index.values.reshape(-1, 1)
    yt = dft.values
    lregt = LinReg()
    lregt.fit(xt, yt)
    yt_pred = lregt.predict(xt)
    plt.figure()
    plt.title("Temperature Linear Regression")
    plt.scatter(x=xt, y=yt_pred)
    plt.scatter(x=xt, y=yt, c='r')
    print("Accuracy_Temperature: " + str(lregt.score(xt, yt)))
    # NOTE(review): this `return` closes a function whose `def` is outside
    # this chunk (presumably compute_flex_index — TODO confirm).
    return flex_index


#---------------------GATHERING RESULTS---------------------#
# Small dataset: per-protein flexibility-index regression against the
# experimental values in column 3 of the C-alpha data.
protein_listdir = 'C:/Users/Voltron Mini/Desktop/MLSummerStuff/Datasets/park_small'
protein_file_list = os.listdir(protein_listdir)
atom_data = compile_data(protein_file_list, protein_listdir)

small_pCC_list = []
for index in range(len(atom_data)):
    C_data, N_data, O_data, S_data, CA_data, CNA_data = split_atom_data(
        atom_data[index])
    mode_choice = 'CA'  # Choose mode here
    X = compute_flex_index(atom_data[index], mode=mode_choice,
                           kernel=['lorentz_ker', 'lorentz_ker', 'gaussian_ker'],
                           kappa=[1, 3, 1], eta=[16, 2, 31])
    # presumably column 3 holds the experimental B-factor/flexibility target
    # — TODO confirm against split_atom_data.
    y = CA_data[:, 3]
    reg = LinReg().fit(X, y)
    ypred = reg.predict(X)
    # Pearson correlation between fitted and experimental values.
    pCC = stats.pearsonr(ypred, y)[0]
    small_pCC_list.append(pCC)

small_pCC_array = np.array(small_pCC_list)
small_avg_pCC = np.average(small_pCC_array)
print('Small C alpha carbon average Pearson correlation coefficient: ',
      small_avg_pCC)

# Medium dataset: same procedure.
# NOTE(review): this chunk is truncated — the loop body continues beyond
# what is shown here.
protein_listdir = 'C:/Users/Voltron Mini/Desktop/MLSummerStuff/Datasets/park_medium'
protein_file_list = os.listdir(protein_listdir)
atom_data = compile_data(protein_file_list, protein_listdir)

medium_pCC_list = []
for index in range(len(atom_data)):
    C_data, N_data, O_data, S_data, CA_data, CNA_data = split_atom_data(
        atom_data[index])
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LinReg

# Height (cm) -> weight (kg) samples, shaped as column vectors.
X = np.array([147, 150, 153, 158, 160, 163, 168, 170,
              173, 175, 178, 180, 183]).reshape(-1, 1)
y = np.array([49, 50, 51, 54, 56, 58, 60, 62,
              63, 64, 66, 67, 68]).reshape(-1, 1)

# Closed-form least squares: prepend a bias column, then solve the normal
# equations w = pinv(X1' X1) (X1' y).
X1 = np.hstack([np.ones((X.shape[0], 1)), X])
gram = X1.T.dot(X1)
moment = X1.T.dot(y)
w_fml = np.linalg.pinv(gram).dot(moment)
print(w_fml[:, 0].tolist())

# Same fit via scikit-learn, for comparison with the closed-form weights.
model = LinReg()
model.fit(X, y)
w_lib = [model.intercept_[0], model.coef_[0, 0]]
print(w_lib)

# Plot the samples and the fitted line over [145, 185].
line_x = np.array([145, 185])
line_y = w_lib[0] + w_lib[1] * line_x
plt.plot(X[:, 0], y[:, 0], 'co')
plt.plot(line_x, line_y, 'g:')
plt.grid(alpha=0.5)
plt.show()