Example #1
def regressTest(original, target, test, numTargetCols):

    numFrames = len(test)

    linReg = LinReg(fit_intercept=True)

    predictedThings = linReg.fit(original, target).predict(test)
    predictedThings = predictedThings.reshape(numFrames, numTargetCols)

    return predictedThings
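
A minimal usage sketch (not from the original source) showing the shapes regressTest assumes: original and test are 2-D feature arrays with the same column count, and the flattened predictions are reshaped back to (frames, target columns). The toy data below is hypothetical.

import numpy as np
from sklearn.linear_model import LinearRegression as LinReg

original = np.random.rand(10, 3)  # 10 training frames, 3 features
target = np.random.rand(10, 2)    # 2 target columns per frame
test = np.random.rand(4, 3)       # 4 test frames

preds = regressTest(original, target, test, numTargetCols=2)
print(preds.shape)  # (4, 2)
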
def linear_fit_and_score(essay_set, vectorizer, name):

    #Get all the text from data
    train_essays = (vectorizer_train[vectorizer_train['essay_set'] ==
                                     essay_set])['essay'].values

    #Turn each text into an array of word counts
    train_vectors = vectorizer.fit_transform(train_essays).toarray()

    #Standardizing for y
    train_std_scores = np.asarray(
        (vectorizer_train[vectorizer_train['essay_set'] == essay_set]
         )['std_score'],
        dtype="|S6").astype(np.float)
    # print "\nStandardized Train Scores", train_std_scores[:5]

    ##############
    #   Linear   #
    ##############

    # Linear Model
    regr = LinReg(fit_intercept=False, copy_X=False)
    regr.fit(train_vectors, train_std_scores)

    valid_vectors = vectorizer.transform(
        (vectorizer_valid[vectorizer_valid['essay_set'] == essay_set]
         )['essay'].values).toarray()

    # My guess is we will want to denormalize these scores for quadratic weighted kappa
    valid_pred_std_scores = regr.predict(valid_vectors)

    print "Linear for Essay Set " + str(
        essay_set) + " with " + name + ":", Spearman(
            a=(valid_df[valid_df['essay_set'] == essay_set])["score"],
            b=valid_pred_std_scores)
    print "\n"
Example #3
def main():

    # read the CSV
    df = pd.read_csv("resultados/compilado_mexico_temp_R.csv")

    df.sort_values(by="Año", inplace=True)

    # lons and lats
    lons = np.array(df["Long"])
    lats = np.array(df["Lat"])

    # counter for the prediction loop
    contador = 0

    # output dataframe
    new_df = pd.DataFrame(
        columns="Long Lat Año Ene Feb Mar Abr May Jun Jul Ago Sept Oct Nov Dic"
        .split())
    for lon, lat in zip(lons, lats):
        # temporary df for this coordinate
        df_temporal = df.loc[(df["Long"] == lon) & (df["Lat"] == lat)]

        # group values by year and take the mean
        grouped = df_temporal.groupby("Año").mean()

        # x and y values
        X = grouped.index.values.reshape(-1, 1)
        y_Ene = grouped["Ene"].values
        y_Feb = grouped["Feb"].values
        y_Mar = grouped["Mar"].values
        y_Abr = grouped["Abr"].values
        y_May = grouped["May"].values
        y_Jun = grouped["Jun"].values
        y_Jul = grouped["Jul"].values
        y_Ago = grouped["Ago"].values
        y_Sept = grouped["Sept"].values
        y_Oct = grouped["Oct"].values
        y_Nov = grouped["Nov"].values
        y_Dic = grouped["Dic"].values

        # instantiate one regression per month
        reg_1 = LinReg()
        reg_2 = LinReg()
        reg_3 = LinReg()
        reg_4 = LinReg()
        reg_5 = LinReg()
        reg_6 = LinReg()
        reg_7 = LinReg()
        reg_8 = LinReg()
        reg_9 = LinReg()
        reg_10 = LinReg()
        reg_11 = LinReg()
        reg_12 = LinReg()

        # fit each monthly model
        reg_1.fit(X, y_Ene)
        reg_2.fit(X, y_Feb)
        reg_3.fit(X, y_Mar)
        reg_4.fit(X, y_Abr)
        reg_5.fit(X, y_May)
        reg_6.fit(X, y_Jun)
        reg_7.fit(X, y_Jul)
        reg_8.fit(X, y_Ago)
        reg_9.fit(X, y_Sept)
        reg_10.fit(X, y_Oct)
        reg_11.fit(X, y_Nov)
        reg_12.fit(X, y_Dic)

        ene = []
        feb = []
        mar = []
        abr = []
        may = []
        jun = []
        jul = []
        ago = []
        sept = []
        octu = []
        nov = []
        dic = []
        new_longs = []
        new_lats = []
        anios = []

        # prediction
        i = 2018
        print("Long: {} Lat: {} Pred: {}".format(lon, lat, i))
        # sklearn's predict expects a 2-D array, hence [[i]]
        ene.append(reg_1.predict([[i]])[0])
        feb.append(reg_2.predict([[i]])[0])
        mar.append(reg_3.predict([[i]])[0])
        abr.append(reg_4.predict([[i]])[0])
        may.append(reg_5.predict([[i]])[0])
        jun.append(reg_6.predict([[i]])[0])
        jul.append(reg_7.predict([[i]])[0])
        ago.append(reg_8.predict([[i]])[0])
        sept.append(reg_9.predict([[i]])[0])
        octu.append(reg_10.predict([[i]])[0])
        nov.append(reg_11.predict([[i]])[0])
        dic.append(reg_12.predict([[i]])[0])  # was missing, leaving the "Dic" column empty
        new_lats.append(lat)
        new_longs.append(lon)
        anios.append(i)

        contador += 1
        print(contador)

        d = {
            "Long": np.array(new_longs),
            "Lat": np.array(new_lats),
            "Año": np.array(anios),
            "Ene": np.array(ene),
            "Feb": np.array(feb),
            "Mar": np.array(mar),
            "Abr": np.array(abr),
            "May": np.array(may),
            "Jun": np.array(jun),
            "Jul": np.array(jul),
            "Ago": np.array(ago),
            "Sept": np.array(sept),
            "Oct": np.array(octu),
            "Nov": np.array(nov),
            "Dic": np.array(dic)
        }

        new_df = pd.concat([new_df, pd.DataFrame(d)], ignore_index=True)
    new_df.to_csv("resultados/predicciones_temp.csv")
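
The twelve parallel regressors above could be collapsed into a loop over the month columns. A sketch of the same fit-and-predict logic, assuming the same grouped frame and X as in the function body:

months = "Ene Feb Mar Abr May Jun Jul Ago Sept Oct Nov Dic".split()
preds = {}
for m in months:
    reg = LinReg().fit(X, grouped[m].values)
    preds[m] = reg.predict([[2018]])[0]  # one predicted mean temperature per month
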
Example #4
def main():

    print "Fetching data..."
    train_df = util.get_training_data('../data/training_set_rel3.tsv')
    valid_df = util.get_validation_data('../data/valid_set.tsv')

    print "Standardizing scores..."
    train_df, valid_df = util.append_standardized_column(
        train_df, valid_df, 'score')

    print "Calculating perplexity feature..."

    train_df, valid_df = Perplexity().fill_perplexity_columns(
        train_df, valid_df)

    print "Calculating number of sentences feature..."

    train_df, valid_df = fill_sentence_column(train_df, valid_df)

    print "Cleaning for spelling and word count..."
    # cleaned up data for spelling feature
    vectorizer_train_spelling = util.vectorizer_clean_spelling(train_df)
    train_essays_spelling = vectorizer_train_spelling['essay'].values
    vectorizer_valid_spelling = util.vectorizer_clean_spelling(valid_df)
    valid_essays_spelling = vectorizer_valid_spelling['essay'].values

    print "Calculating total words feature..."

    train_df, valid_df = fill_total_words_column(train_df, valid_df,
                                                 train_essays_spelling,
                                                 valid_essays_spelling)

    print "Calculating unique words feature..."

    train_df, valid_df = fill_unique_words_column(train_df, valid_df,
                                                  train_essays_spelling,
                                                  valid_essays_spelling)

    print "Calculating spelling feature..."
    # spelling feature
    train_df, valid_df = fill_spelling_column(train_df, valid_df,
                                              train_essays_spelling,
                                              valid_essays_spelling)

    print "Calculating pos tags features..."

    train_df, valid_df = fill_pos_columns(train_df, valid_df)

    print "Cleaning for TFIDF..."
    # cleaned up data for tfidf vector feature
    vectorizer_train = util.vectorizer_clean(train_df)
    train_essays = vectorizer_train['essay'].values
    vectorizer_valid = util.vectorizer_clean(valid_df)
    valid_essays = vectorizer_valid['essay'].values

    print "Calculating TFIDF features with unigram..."
    train_df, valid_df = fill_tfidf_column(train_df, valid_df, train_essays,
                                           valid_essays, 1)

    #print "Calculating TFIDF features with trigram..."
    #train_df, valid_df = fill_tfidf_column(train_df, valid_df, train_essays, valid_essays, 3)

    print(train_df.head())

    print(valid_df.head())

    COLS = [
        'essay_set', 'spelling_correct', 'std_sentence_count',
        'std_unique_words', 'std_total_words', 'ADJ',
        'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB', '.',
        'X', 'std_perplexity', 'std_score'
    ]

    train_df = train_df[COLS].join(train_df.filter(regex=("tfidf_*")))
    valid_df = valid_df[COLS].join(valid_df.filter(regex=("tfidf_*")))

    print(train_df.shape)
    print(valid_df.shape)

    max_essay_set = max(train_df['essay_set'])

    linreg_scores_df = pd.DataFrame(columns=['essay_set', 'p', 'spearman'])

    lasso_scores_df = pd.DataFrame(
        columns=['essay_set', 'alpha', 'p', 'spearman'])
    ridge_scores_df = pd.DataFrame(
        columns=['essay_set', 'alpha', 'p', 'spearman'])

    alphas = [x * 1.0 / 20 for x in range(20, 0, -1)]

    for i in range(1, max_essay_set + 1):

        print ""

        train_x = np.asarray((train_df[train_df['essay_set'] == i]).drop(
            ['essay_set', 'std_score'], axis=1))
        train_std_scores = np.asarray(
            (train_df[train_df['essay_set'] == i])['std_score'],
            dtype="|S6").astype(np.float)

        regr = LinReg(fit_intercept=False, copy_X=False)
        regr.fit(train_x, train_std_scores)

        valid_x = np.asarray((valid_df[valid_df['essay_set'] == i]).drop(
            ['essay_set', 'std_score'], axis=1))
        valid_pred_std_scores = regr.predict(valid_x)

        linreg_spear, p = Spearman(
            a=(valid_df[valid_df['essay_set'] == i])["std_score"],
            b=valid_pred_std_scores)
        linreg_scores_df = pd.concat([
            linreg_scores_df,
            pd.DataFrame([{
                'essay_set': i,
                'p': p,
                'spearman': linreg_spear
            }])
        ], ignore_index=True)

        print "Linear for Essay Set " + str(i) + ":", linreg_spear

        for a in alphas:
            ridge = linear_model.Ridge(alpha=a)
            ridge.fit(train_x, train_std_scores)
            valid_pred_std_scores_ridge = ridge.predict(valid_x)

            ridge_spear, p = Spearman(
                a=(valid_df[valid_df['essay_set'] == i])["std_score"],
                b=valid_pred_std_scores_ridge)
            ridge_scores_df = pd.concat([
                ridge_scores_df,
                pd.DataFrame([{
                    'essay_set': i,
                    'alpha': a,
                    'p': p,
                    'spearman': ridge_spear
                }])
            ], ignore_index=True)

            print "Alpha = " + str(a) + " Ridge for Essay Set " + str(
                i) + ":", ridge_spear

            lasso = linear_model.Lasso(alpha=a)
            lasso.fit(train_x, train_std_scores)
            valid_pred_std_scores_lasso = lasso.predict(valid_x)

            lasso_spear, p = Spearman(
                a=(valid_df[valid_df['essay_set'] == i])["std_score"],
                b=valid_pred_std_scores_lasso)
            lasso_scores_df = pd.concat([
                lasso_scores_df,
                pd.DataFrame([{
                    'essay_set': i,
                    'alpha': a,
                    'p': p,
                    'spearman': lasso_spear
                }])
            ], ignore_index=True)

            print "Alpha = " + str(a) + "Lasso for Essay Set " + str(
                i) + ":", lasso_spear

    print(linreg_scores_df)
    print(ridge_scores_df)
    print(lasso_scores_df)

    linreg_scores_df.to_pickle('linreg_scores-01.pickle')
    ridge_scores_df.to_pickle('ridge_scores-01.pickle')
    lasso_scores_df.to_pickle('lasso_scores-01.pickle')
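
With the three score frames pickled, a natural follow-up is selecting the best alpha per essay set. A sketch against the frames built above; taking the row with the highest Spearman correlation per set is an assumption about how "best" is defined here:

best_ridge = ridge_scores_df.loc[
    ridge_scores_df.groupby('essay_set')['spearman'].idxmax(),
    ['essay_set', 'alpha', 'spearman']]
print(best_ridge)
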
Example #5
# plt.title("Yearly Average Land Temperature 1750-2015")
# plt.xlabel("Year")
# plt.ylabel("Yearly Average Land Temperature")
# plt.show()

# #print(df[times.year == 1752])
# print(df[np.isnan(df["LandAverageTemperature"])])
# df["LandAverageTemperature"] = df["LandAverageTemperature"].fillna(method="ffill")
# print(df[times.year == 1752])

# Model.
df = pd.read_csv("E:/OneDrive/Documents/python/test/GlobalTemperatures.csv")
df = df.iloc[:, :2]  # keep the first two columns; .ix was removed from pandas
times = pd.DatetimeIndex(df["dt"])
grouped = df.groupby([times.year]).mean()
x = grouped.index.values.reshape(-1, 1)
y = grouped["LandAverageTemperature"].values

reg = LinReg()
reg.fit(x, y)
y_preds = reg.predict(x)
print('R^2: {}'.format(reg.score(x, y)))  # score() returns R^2, not accuracy

plt.figure(figsize=(15, 5))
plt.title('Linear Regression')
plt.scatter(x=x, y=y_preds)
plt.scatter(x=x, y=y, c='r')
plt.show()

print('Predicted temperature in 2050 will be {}'.format(reg.predict([[2050]])[0]))
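
Since predict takes a 2-D array, a whole span of future years can be scored in one call. A small sketch, reusing the fitted reg above:

future_years = np.arange(2016, 2051).reshape(-1, 1)
future_preds = reg.predict(future_years)
for year, temp in zip(future_years.ravel(), future_preds):
    print(year, temp)
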
#COLS = ['std_sentence_count', 'essay_set', 'std_score']
#train_df = train_df[COLS].join(train_df.filter(regex=("tfidf_*")))
#valid_df = valid_df[COLS].join(valid_df.filter(regex=("tfidf_*")))

for i in range(1, max_essay_set + 1):

    #vectorizer_train = util.vectorizer_clean(train_df)
    train_x = np.asarray(
        (train_df[train_df['essay_set'] == i]).drop('std_score', axis=1))
    #train_x = np.asarray((train_df[train_df['essay_set'] == i])[['std_sentence_count']])
    train_std_scores = np.asarray(
        (train_df[train_df['essay_set'] == i])['std_score'],
        dtype="|S6").astype(np.float)

    regr = LinReg(fit_intercept=False, copy_X=False)
    regr.fit(train_x, train_std_scores)

    valid_x = np.asarray(
        (valid_df[valid_df['essay_set'] == i]).drop('std_score', axis=1))
    #valid_x = np.asarray((valid_df[valid_df['essay_set'] == i])[['std_sentence_count']])
    valid_pred_std_scores = regr.predict(valid_x)

    #print "Linear for Essay Set "+str(i)+":", Spearman(a = (valid_df[valid_df['essay_set'] == i])["std_score"], b = valid_pred_std_scores)
    #print "\n"

    alpha = [x * 1.0 / 20 for x in range(21)]
    ridge_scores = []
    lasso_scores = []
    for a in alpha:
        ridge = linear_model.Ridge(alpha=a)
        ridge.fit(train_x, train_std_scores)
        ridge_scores.append(Spearman(
            a=(valid_df[valid_df['essay_set'] == i])["std_score"],
            b=ridge.predict(valid_x))[0])

        lasso = linear_model.Lasso(alpha=a)
        lasso.fit(train_x, train_std_scores)
        lasso_scores.append(Spearman(
            a=(valid_df[valid_df['essay_set'] == i])["std_score"],
            b=lasso.predict(valid_x))[0])
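
A quick way to compare the two regularizers per essay set, assuming ridge_scores and lasso_scores each hold one Spearman correlation per alpha:

import matplotlib.pyplot as plt

plt.plot(alpha, ridge_scores, label='Ridge')
plt.plot(alpha, lasso_scores, label='Lasso')
plt.xlabel('alpha')
plt.ylabel('Spearman correlation')
plt.legend()
plt.show()
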
Example #7
def main():
    """Read and display csv data from two existing files-population,temprature."""
    """https://www.kaggle.com/brajput24/world-population"""
    dfp = pd.read_csv('population.csv')
    displayheader('population.csv')
    dfp.index = dfp['Year']
    dft = pd.read_csv('GlobalLandTemperaturesByCountry.csv')
    displayheader('GlobalLandTemperaturesByCountry.csv')
    """ Request user input a country name to investigate"""
    cn = input("Enter country: ")
    if not cn:
        print("Must provide a country name.")
    cn = cn.title()
    """ Use the country user inputed as a filter for both population
        and temperature dataset."""

    maskp = dfp['Country'].isin([cn])
    """ Only show Value which is population qty."""
    dfp = dfp[maskp].filter(items=['Value'])
    print(dfp)
    print('\n')
    """Plot the population vs time gragh."""
    plt.figure()
    plt.plot(dfp)
    plt.title('Population vs Time')
    plt.xlabel('Year')
    plt.ylabel('Value')
    """ Display population data info."""
    showp = dfp.describe()
    print(showp)
    print('\n')
    """To examine two factors:p1-quantity increment of population;
        p2-ratio of population to last year's impacts on temperature change."""
    pl = []
    indarray = []
    datap1 = []
    datap2 = []
    """Store all the population data of each year in list pl."""
    for index, row in dfp.iterrows():
        pl.append(row['Value'])
    for i in range(51):

        p1 = pl[i + 1] - pl[i]
        p2 = pl[i + 1] / pl[i]
        datap1.append(p1)
        datap2.append(p2)
        indarray.append(i + 1960)
    """create two series based on two factors p1,p2."""
    s1_p = pd.Series(data=datap1, index=indarray)
    s2_p = pd.Series(data=datap2, index=indarray)
    """ Use the country user inputed as a filter for both population
        and temperature dataset."""
    maskt = dft['Country'].isin([cn])
    """Only show Average Temp."""
    dft = dft[maskt].filter(items=['AverageTemperature', 'dt'])
    """group months data into year and take mean of it as inspired by
       wk11 seminar Data Science with Python."""
    times = pd.DatetimeIndex(dft['dt'])
    dft = dft.fillna(method='ffill')
    group = dft.groupby([times.year]).mean()
    dft = group['AverageTemperature']
    dft = dft[dft.index > 1959]
    dft = dft[dft.index < 2013]
    print(dft)
    print('\n')
    """Plot the temperature vs time gragh."""
    plt.figure()
    plt.plot(dft)
    plt.title('AvgTemp vs Time')
    plt.xlabel('Time')
    plt.ylabel('Temp')
    showt = dft.describe()
    print(showt)

    tl = []
    indarray = []
    datat = []
    logistic_judge = []
    """Store all the temperature data of each year in list pl."""
    for index, row in dft.iteritems():
        tl.append(row)
    for i in range(51):

        ti = tl[i + 1] - tl[i]
        datat.append(ti)
        if ti > 0:
            logistic_judge.append(1)
        else:
            logistic_judge.append(0)
    """create index for judge series to use."""
    array = []
    for i in range(51):
        array.append(i + 1960)
    s_lrj = pd.Series(data=logistic_judge, index=array)
    """Combine two factors p1 and p2 and judge series into a dataframe lrdf."""
    lrdf = {
        'Population inc': s1_p,
        'Population ratio': s2_p,
        'Temp increased': s_lrj
    }
    lrdf = pd.DataFrame(lrdf)
    print('\n')
    # Logistic regression analysis of dataframe lrdf:
    # http://www.powerxing.com/logistic-regression-in-python/
    print(lrdf.head(8))
    print(lrdf.describe())
    print(
        pd.crosstab(lrdf['Temp increased'],
                    lrdf['Population inc'],
                    rownames=['Temp increased']))

    lrdf.hist()
    plt.show()

    lrdf['intercept'] = 1.0
    train_c = lrdf.columns[:-2]
    logit = sm.Logit(lrdf['Temp increased'], lrdf[train_c])
    result = logit.fit()
    print(result.summary())
    print('\n')
    """ LinearRegression analysis of population and temperature
    http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html"""
    x = dfp.index.values.reshape(-1, 1)
    y = dfp.values
    lreg = LinReg()
    lreg.fit(x, y)
    y_pred = lreg.predict(x)

    plt.figure()
    plt.title("Population Linear Regression")
    plt.scatter(x=x, y=y_pred)
    plt.scatter(x=x, y=y, c='r')
    print("Accuracy_Population: " + str(lreg.score(x, y)))
    print('\n')

    xt = dft.index.values.reshape(-1, 1)
    yt = dft.values
    lregt = LinReg()
    lregt.fit(xt, yt)
    yt_pred = lregt.predict(xt)

    plt.figure()
    plt.title("Temperature Linear Regression")
    plt.scatter(x=xt, y=yt_pred)
    plt.scatter(x=xt, y=yt, c='r')
    print("Accuracy_Temperature: " + str(lregt.score(xt, yt)))
Example #8
    return flex_index

#---------------------GATHERING RESULTS---------------------#
protein_listdir = 'C:/Users/Voltron Mini/Desktop/MLSummerStuff/Datasets/park_small'
protein_file_list = os.listdir(protein_listdir)
atom_data = compile_data(protein_file_list, protein_listdir)

small_pCC_list = []
for index in range(len(atom_data)):
    C_data, N_data, O_data, S_data, CA_data, CNA_data = split_atom_data(atom_data[index])
    
    mode_choice = 'CA' #Choose mode here
    X = compute_flex_index(atom_data[index], mode=mode_choice, kernel=['lorentz_ker', 'lorentz_ker', 'gaussian_ker'], \
                           kappa=[1,3,1], eta=[16,2,31])
    y = CA_data[:,3]
    reg = LinReg().fit(X,y)
    ypred = reg.predict(X)
    pCC = stats.pearsonr(ypred, y)[0]
    small_pCC_list.append(pCC)

small_pCC_array = np.array(small_pCC_list)
small_avg_pCC = np.average(small_pCC_array)
print('Small C alpha carbon average Pearson correlation coefficient: ', small_avg_pCC)
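
pearsonr returns NaN when either vector is constant, so a guarded average may be safer here; a small sketch against the array built above:

valid = small_pCC_array[~np.isnan(small_pCC_array)]
print('Average over {} valid proteins: {}'.format(len(valid), valid.mean()))
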

protein_listdir = 'C:/Users/Voltron Mini/Desktop/MLSummerStuff/Datasets/park_medium'
protein_file_list = os.listdir(protein_listdir)
atom_data = compile_data(protein_file_list, protein_listdir)

medium_pCC_list = []
for index in range(len(atom_data)):
    C_data, N_data, O_data, S_data, CA_data, CNA_data = split_atom_data(atom_data[index])

    X = compute_flex_index(atom_data[index], mode=mode_choice, kernel=['lorentz_ker', 'lorentz_ker', 'gaussian_ker'], \
                           kappa=[1,3,1], eta=[16,2,31])
    y = CA_data[:,3]
    reg = LinReg().fit(X,y)
    ypred = reg.predict(X)
    pCC = stats.pearsonr(ypred, y)[0]
    medium_pCC_list.append(pCC)

medium_pCC_array = np.array(medium_pCC_list)
medium_avg_pCC = np.average(medium_pCC_array)
print('Medium C alpha carbon average Pearson correlation coefficient: ', medium_avg_pCC)
Example #9
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LinReg

X = np.array([147, 150, 153, 158, 160, 163, 168, 170, 173, 175, 178, 180,
              183]).reshape(-1, 1)
y = np.array([49, 50, 51, 54, 56, 58, 60, 62, 63, 64, 66, 67,
              68]).reshape(-1, 1)
X1 = np.hstack([np.ones((X.shape[0], 1)), X])  # prepend a bias column

# closed-form solution via the normal equation: w = (X^T X)^+ X^T y
A = X1.T.dot(X1)
b = X1.T.dot(y)
w_fml = np.linalg.pinv(A).dot(b)
print(w_fml[:, 0].tolist())

# the same fit via scikit-learn
model = LinReg()
model.fit(X, y)
w_lib = [model.intercept_[0], model.coef_[0, 0]]
print(w_lib)

xv = np.array([145, 185])
yv = w_lib[0] + w_lib[1] * xv
plt.plot(X[:, 0], y[:, 0], 'co')
plt.plot(xv, yv, 'g:')
plt.grid(alpha=0.5)
plt.show()
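
As a quick sanity check, the closed-form weights and scikit-learn's should agree up to numerical precision:

print(np.allclose(w_fml[:, 0], w_lib))  # expect True
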