import datetime

import dml
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score


def execute(trial=False):
    startTime = datetime.datetime.now()
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('angelay_maulikjs', 'angelay_maulikjs')
    repo.dropPermanent('angelay_maulikjs')
    repo.createPermanent('angelay_maulikjs')

    cols = ['CarbonIntensity', 'CO2Emissions', 'EnergyIntensity', 'EnergyUse',
            'GDPperCapita', 'HDI', 'Population']

    # Load the cleaned 2012 data into a DataFrame.
    data = repo.angelay_maulikjs.clean2012.find()
    D = [[dict(document)[c] for c in cols] for document in data]
    df = pd.DataFrame(D, columns=cols)

    axes = pd.plotting.scatter_matrix(df, alpha=1, figsize=(10, 10))
    plt.tight_layout()
    plt.savefig('angelay_maulikjs/statsmodel_correlations')

    print('\nCorrelation coefficients:\n')
    print(df.corr())
    print()
    print(df.describe())
    print()

    # Fit all six candidate predictors against CO2 emissions at once.
    Independents = df[['CarbonIntensity', 'EnergyIntensity', 'EnergyUse',
                       'GDPperCapita', 'HDI', 'Population']]
    Dependent = df.CO2Emissions
    model = sm.OLS(Dependent, Independents)
    results = model.fit()
    print(results.summary())
    print()
    print('\nThis is our linear least-squares model. It does not yield a good '
          'R-squared value. We are going to build our model by adding one variable '
          'at a time, starting with the variable that yields the highest R-squared '
          'value when fitted to a linear model against CO2 emissions. After doing '
          'some research, we found a theory called the Kaya Identity, which states '
          'that CO2 emissions are roughly equal to population * GDP per capita * '
          'energy intensity * carbon intensity. We will incorporate that into our '
          'model and see if adding energy use and HDI makes it better.\n')

    # Fit each predictor on its own against CO2 emissions.
    ind = ['CarbonIntensity', 'EnergyIntensity', 'EnergyUse', 'GDPperCapita',
           'HDI', 'Population']
    for name in ind:
        model = sm.OLS(Dependent, df[name])
        results = model.fit()
        print('\nCO2Emissions vs ' + name + '\n')
        print(results.summary())

    print('\nLooks like population yields the highest R-squared value, and energy '
          'use comes next. Build a model with population and energy use and see if '
          'the R-squared value goes up.\n')

    model = smf.ols(formula='CO2Emissions ~ Population * EnergyUse', data=df)
    results = model.fit()
    print('\nCO2Emissions vs Population * EnergyUse\n')
    print(results.summary())
    print('\nR-squared value went up to 0.954. Now adding Carbon Intensity.\n')

    model = smf.ols(formula='CO2Emissions ~ Population * EnergyUse * CarbonIntensity',
                    data=df)
    results = model.fit()
    print('\nCO2Emissions vs Population * EnergyUse * CarbonIntensity\n')
    print(results.summary())
    print('\nR-squared value went up to 0.982. Now adding GDP per capita.\n')

    model = smf.ols(
        formula='CO2Emissions ~ Population * EnergyUse * CarbonIntensity * GDPperCapita',
        data=df)
    results = model.fit()
    print('\nCO2Emissions vs Population * EnergyUse * CarbonIntensity * GDPperCapita\n')
    print(results.summary())
    print('\nR-squared value went up to 0.992. Now adding HDI.\n')

    model = smf.ols(
        formula='CO2Emissions ~ Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI',
        data=df)
    results = model.fit()
    print('\nCO2Emissions vs Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI\n')
    print(results.summary())
    print('\nR-squared value went up to 0.995. Now adding Energy Intensity.\n')

    full_formula = ('CO2Emissions ~ Population * EnergyUse * CarbonIntensity '
                    '* GDPperCapita * HDI * EnergyIntensity')
    model = smf.ols(formula=full_formula, data=df)
    results = model.fit()
    print('\nCO2Emissions vs Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI * EnergyIntensity\n')
    print(results.summary())
    # print(results.params)
    print("\nR-squared value went up to 0.998. This is a really high R-squared "
          "value. We might be at risk of overfitting the data. Let's test our "
          "model on the 2013 data and see how we do.\n")

    # Getting the 2013 data.
    data2 = repo.angelay_maulikjs.clean2013.find()
    D2 = [[dict(document)[c] for c in cols] for document in data2]
    df2 = pd.DataFrame(D2, columns=cols)

    # Testing the model on 2013 data.
    model = smf.ols(formula=full_formula, data=df)
    pred = model.fit().predict(df2)
    R2 = r2_score(df2.CO2Emissions, pred)
    print('\nResults on 2013 Data\n')
    fig, ax = plt.subplots()
    x2 = range(len(df2.index))
    ax.plot(x2, df2.CO2Emissions, 'o', label="Data")
    ax.plot(x2, pred, 'r', label="OLS prediction")
    ax.legend(loc="best")
    plt.savefig('angelay_maulikjs/statsmodel_results_without_outliers')
    print("\nThe R-squared value is %f. This is a pretty good R-squared value, "
          "and it means that our model does pretty well at predicting future "
          "values. Now let's train the model on all 2012 data with outliers and "
          "test it on all 2013 data with outliers.\n" % R2)

    # Getting all 2012 data, including outliers.
    data3 = repo.angelay_maulikjs.all2012.find()
    D3 = [[dict(document)[c] for c in cols] for document in data3]
    df3 = pd.DataFrame(D3, columns=cols)

    # Training the model on all 2012 data.
    model = smf.ols(formula=full_formula, data=df3)
    results = model.fit()
    print('\nCO2Emissions vs Population * EnergyUse * CarbonIntensity * GDPperCapita * HDI * EnergyIntensity\n')
    print(results.summary())
    # print(results.params)
    print("\nWe got an R-squared value of 1, but we might be overfitting the "
          "data. Let's test the model on all 2013 data to see how we do.\n")

    # Getting all 2013 data, including outliers.
    data4 = repo.angelay_maulikjs.all2013.find()
    D4 = [[dict(document)[c] for c in cols] for document in data4]
    df4 = pd.DataFrame(D4, columns=cols)

    # Testing the model on all 2013 data.
    pred = model.fit().predict(df4)
    R2 = r2_score(df4.CO2Emissions, pred)
    print('\nResults on All 2013 Data\n')
    fig, ax = plt.subplots()
    x1 = range(len(df4.index))
    ax.plot(x1, df4.CO2Emissions, 'o', label="Data")
    ax.plot(x1, pred, 'r', label="OLS prediction")
    ax.legend(loc="best")
    plt.savefig('angelay_maulikjs/statsmodel_results_with_outliers')
    print("\nThe R-squared value is %f, even higher than the one we got from the "
          "clean data without outliers. We can be pretty confident about our "
          "model being able to predict future values now.\n" % R2)

    endTime = datetime.datetime.now()
    return {"Start ": startTime, "End ": endTime}
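# Illustrative only: a minimal sketch of the Kaya Identity mentioned above,
# CO2 ~= population * GDP per capita * energy intensity * carbon intensity.
# The column names mirror the DataFrame built in execute(); the numbers and any
# unit assumptions are invented, so treat this as a sanity-check sketch rather
# than part of the original pipeline.
def _kaya_identity_sketch():
    import pandas as pd

    toy = pd.DataFrame({
        'Population':      [5000000],  # people (hypothetical)
        'GDPperCapita':    [20000],    # GDP per person (hypothetical units)
        'EnergyIntensity': [0.0001],   # energy per unit of GDP (hypothetical)
        'CarbonIntensity': [2.5],      # CO2 per unit of energy (hypothetical)
    })
    # The identity is multiplicative, which is why the formulas above combine
    # terms with '*' rather than fitting a purely additive model.
    toy['KayaCO2'] = (toy['Population'] * toy['GDPperCapita']
                      * toy['EnergyIntensity'] * toy['CarbonIntensity'])
    print(toy['KayaCO2'])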
import datetime
import pprint

import dml
import numpy as np
import pandas as pd
import statsmodels.api as sm


def execute(trial=False):
    startTime = datetime.datetime.now()
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('nathansw_rooday_sbajwa_shreyap', 'nathansw_rooday_sbajwa_shreyap')

    # Read data from mongo.
    mbta_db = repo['nathansw_rooday_sbajwa_shreyap.OTP_by_line']
    stops_db = repo['nathansw_rooday_sbajwa_shreyap.stops']
    stopsVsLines_db = repo['nathansw_rooday_sbajwa_shreyap.stopsVsLines']

    # Read data into pandas.
    print("Loading OTP Data")
    otpData = mbta_db.find_one()
    del otpData['_id']
    otp_by_line = pd.DataFrame.from_dict(otpData)
    otp_by_line = otp_by_line.transpose()
    otp_by_line[otp_by_line['Peak Service'] == ''] = np.nan
    otp_by_line[otp_by_line['Off-Peak Service'] == ''] = np.nan

    print("Loading Stops Data")
    stopsData = stops_db.find_one()
    del stopsData['_id']
    stops = pd.DataFrame.from_dict(stopsData)

    print("Loading Stops By Line Data")
    stop_by_line_data = stopsVsLines_db.find_one()
    del stop_by_line_data['_id']
    stop_by_line = pd.DataFrame(
        [(key, x) for key, val in stop_by_line_data.items() for x in val],
        columns=['Route', 'Stop'])
    stop_by_line = stop_by_line.set_index('Stop')

    print("Joining Stops By Line with Stops")
    stop_route_neighborhood = stop_by_line.join(stops.set_index('stop_id'), how='left')
    stop_route_neighborhood = stop_route_neighborhood[
        stop_route_neighborhood['neighborhood'].notnull()]
    stop_route_neighborhood['stop_id'] = stop_route_neighborhood.index
    merged = pd.merge(otp_by_line, stop_route_neighborhood,
                      left_index=True, right_on='Route', how='right')
    merged_stop = merged[merged['Peak Service'].notnull()]

    print("Creating dummy data")
    stop_dummy_city = pd.get_dummies(merged_stop['city'])
    stop_dummy_neighborhood = pd.get_dummies(merged_stop['neighborhood'])
    merged_dummy_city = merged_stop.join(stop_dummy_city)
    merged_dummy_city_final = merged_dummy_city.groupby('Route').max()
    x_cols = merged_dummy_city_final.columns[8:]
    y_cols = 'Off-Peak Service'

    print("Creating regression model")
    model = sm.GLM(merged_dummy_city_final[y_cols],
                   merged_dummy_city_final[x_cols],
                   family=sm.families.Gaussian())
    results = model.fit()
    coefficients = {key: results.params[key] for key in results.params.keys()}
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(coefficients)

    print("Saving coefficients")
    repo.dropCollection('regressionAnalysis')
    repo.createCollection('regressionAnalysis')
    repo['nathansw_rooday_sbajwa_shreyap.regressionAnalysis'].insert_one(coefficients)
    print("Done!")

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
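# A minimal sketch, with invented route/city data, of the dummy-encode-then-GLM
# pattern used in execute() above: one-hot encode a categorical column, collapse
# to one row per route, then fit a Gaussian GLM of Off-Peak Service on the
# city dummies. None of the values below come from the real MBTA data.
def _route_city_glm_sketch():
    import pandas as pd
    import statsmodels.api as sm

    toy = pd.DataFrame({
        'Route': ['1', '39', '57', '66'],
        'city': ['Boston', 'Boston', 'Brookline', 'Cambridge'],
        'Off-Peak Service': [0.82, 0.78, 0.74, 0.69],
    })
    dummies = toy.join(pd.get_dummies(toy['city'])).groupby('Route').max()
    X = dummies[['Boston', 'Brookline', 'Cambridge']].astype(float)
    y = dummies['Off-Peak Service']
    results = sm.GLM(y, X, family=sm.families.Gaussian()).fit()
    print(results.params)  # one coefficient per city dummy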
from sklearn.cluster import KMeans


def doKmeans(X, nclust):
    # Fit a k-means model with nclust clusters and return each point's cluster
    # label along with the cluster centers.
    model = KMeans(nclust)
    model.fit(X)
    clust_labels = model.predict(X)
    cent = model.cluster_centers_
    return (clust_labels, cent)
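# Hypothetical usage of doKmeans on a tiny synthetic feature matrix; the points
# and the choice of two clusters are illustrative only.
def _doKmeans_example():
    import numpy as np

    X = np.array([[1.0, 2.0], [1.5, 1.8], [8.0, 8.0], [8.5, 7.5]])
    clust_labels, centers = doKmeans(X, nclust=2)
    print(clust_labels)  # cluster assignment for each row of X
    print(centers)       # coordinates of the two cluster centroids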
import datetime

import dml
import statsmodels.api as sm
from matplotlib import pyplot


def execute(trial=False):
    startTime = datetime.datetime.now()
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('aditid_benli95_teayoon_tyao', 'aditid_benli95_teayoon_tyao')

    # Crimes of all types, keyed by number of establishments.
    all_crime = repo.aditid_benli95_teayoon_tyao.crimesPerNumberOfEstablishment.find()
    x = []
    y = []
    print(all_crime)
    for crime in all_crime:
        crimeDict = dict(crime)
        x.append(crimeDict["_id"])
        y.append(crimeDict["value"]["crimes"])
    sumy = sum(y)

    # Expand the scaled counts into one entry per crime for the histogram.
    all = []
    for i in range(0, len(x)):
        for j in range(0, int(y[i] / 21.5)):  # 253075 / 11800 = 21.5, for normalization
            all.append(x[i])

    # Drug crimes, keyed by number of establishments.
    drug_crime = repo.aditid_benli95_teayoon_tyao.drugCrimesPerNumberOfEstablishment.find()
    a = []
    b = []
    for crime in drug_crime:
        crimeDict = dict(crime)
        a.append(crimeDict["_id"])
        b.append(crimeDict["value"]["crimes"])

    drug = []
    for i in range(0, len(a)):
        for j in range(0, int(b[i])):
            drug.append(a[i])

    bins = list(range(0, 250, 5))

    pyplot.hist(all, bins, alpha=.5, label='All Crimes')
    pyplot.hist(drug, bins, alpha=.5, label='Drug Crimes')
    pyplot.xlabel("Establishments")
    pyplot.ylabel("Crimes")
    pyplot.legend(loc='upper left')
    pyplot.show()

    # Regress all crimes on number of establishments.
    model = sm.OLS(y, x)
    results2 = model.fit()
    print(results2.summary())
    print("Confidence Intervals:", results2.conf_int())
    print("Parameters:", results2.params)

    # Regress drug crimes on number of establishments.
    model = sm.OLS(b, a)
    results2 = model.fit()
    print(results2.summary())
    print("Confidence Intervals:", results2.conf_int())
    print("Parameters:", results2.params)

    endTime = datetime.datetime.now()
    return {"Start ": startTime, "End ": endTime}
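# A hedged side note on the OLS calls above: sm.OLS(y, x) with a bare regressor
# fits a line through the origin. If an intercept is wanted, statsmodels expects
# an explicit constant column, e.g. via sm.add_constant. The numbers below are
# invented purely to illustrate the call pattern.
def _ols_with_intercept_sketch():
    import numpy as np
    import statsmodels.api as sm

    x = np.array([10, 25, 40, 60, 80], dtype=float)        # e.g. establishments
    y = np.array([120, 210, 330, 480, 640], dtype=float)   # e.g. crime counts
    results = sm.OLS(y, sm.add_constant(x)).fit()
    print(results.params)      # [intercept, slope]
    print(results.conf_int())  # confidence intervals for both coefficients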
import datetime

import dml
import numpy
import pandas as pd
from sklearn import ensemble, linear_model, svm
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.utils import shuffle


def execute(trial=False):
    '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('liweixi_mogujzhu', 'liweixi_mogujzhu')
    repo.dropCollection("prediction_weather_incident")
    repo.createCollection("prediction_weather_incident")

    # Create the training data and target.
    data_name = 'liweixi_mogujzhu.weather_fire_incident_transformation'
    data = pd.DataFrame(list(repo[data_name].find()))
    print(data.shape)
    # If trial mode, use half of the data for training.
    if trial:
        data = data[:data.shape[0] // 2]
    data['LSCORE'] = data['NINCIDENT']
    data['TDIFF'] = data["TMAX"] - data["TMIN"]
    X = data[["TAVG", "TDIFF", "PRCP", "SNOW", "AWND"]]
    y = data["LSCORE"].astype(float)

    # Scale the data to the range [0, 1].
    min_max_scaler = MinMaxScaler()
    x_scaled = numpy.array(min_max_scaler.fit_transform(X.values))
    y_scaled_value = numpy.array(min_max_scaler.fit_transform(y.values.reshape(-1, 1)))
    kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
    y_scaled = kbd.fit_transform(y_scaled_value)

    # Shuffle the data and create the training set and testing set.
    X_shuffled, y_shuffled, y_shuffled_scaled_value = shuffle(
        x_scaled, y_scaled, y_scaled_value)
    X_train = X_shuffled[:int(X.shape[0] * 0.8)]
    y_train = y_shuffled[:int(X.shape[0] * 0.8)].ravel()
    y_train_value = y_shuffled_scaled_value[:int(X.shape[0] * 0.8)].ravel()
    X_test = X_shuffled[int(X.shape[0] * 0.8):]
    y_test = y_shuffled[int(X.shape[0] * 0.8):].ravel()
    y_test_value = y_shuffled_scaled_value[int(X.shape[0] * 0.8):].ravel()

    # Set up the classifiers. We use 7 different classifiers in this case.
    classifiers = [
        linear_model.SGDClassifier(),
        linear_model.LogisticRegression(),
        svm.SVC(),
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.GradientBoostingClassifier()
    ]
    for item in classifiers:
        print(item)
        clf = item
        clf.fit(X_train, y_train)
        print("Training accuracy:", clf.score(X_train, y_train), "Base: 0.33")
        print("Testing accuracy:", clf.score(X_test, y_test), "Base: 0.33")

    # Use an SVC with probability estimates as the final classifier and store
    # its test-set predictions.
    insert_data = pd.DataFrame()
    model = svm.SVC(probability=True)
    model.fit(X_train, y_train)
    print("Final Classifier", model)
    pred = model.predict_proba(X_test)
    pred_label = model.predict(X_test)
    print("Accuracy", model.score(X_test, y_test))
    insert_data["LOW_PROB"] = pred[:, 0]
    insert_data["MID_PROB"] = pred[:, 1]
    insert_data["HIGH_PROB"] = pred[:, 2]
    insert_data["PRED_LABEL"] = pd.DataFrame(pred_label).replace(
        0.0, "LOW").replace(1.0, "MID").replace(2.0, "HIGH")
    insert_data["TRUE_LABEL"] = pd.DataFrame(y_test).replace(
        0.0, "LOW").replace(1.0, "MID").replace(2.0, "HIGH")
    insert_data["TRUE_VALUE"] = y_test_value
    print(insert_data)
    repo['liweixi_mogujzhu.prediction_weather_incident'].insert_many(
        insert_data.to_dict('records'))
    repo['liweixi_mogujzhu.prediction_weather_incident'].metadata({'complete': True})
    print(repo['liweixi_mogujzhu.prediction_weather_incident'].metadata())
    repo.logout()

    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
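# A small sketch, with invented numbers, of the scale-then-discretize step used
# above: MinMaxScaler maps the target to [0, 1], then KBinsDiscretizer with a
# quantile strategy splits it into the three ordinal bins behind the
# LOW/MID/HIGH labels.
def _discretize_target_sketch():
    import numpy as np
    from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler

    y = np.array([3.0, 7.0, 1.0, 12.0, 5.0, 9.0]).reshape(-1, 1)
    y_scaled = MinMaxScaler().fit_transform(y)
    kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
    print(kbd.fit_transform(y_scaled).ravel())  # class labels 0.0 / 1.0 / 2.0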
import datetime
import json

import dml
import pandas as pd
import statsmodels.api as sm


def execute(trial=False):
    '''Retrieves our data sets from Boston Open Data using specific URLs.
    Creates the necessary pymongo collections within our repo database.'''
    startTime = datetime.datetime.now()

    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('esaracin', 'esaracin')

    # Support for trial mode: skip all but about 1 percent of each collection.
    if trial:
        dataset = repo['esaracin.crime_incidents'].find().skip(
            repo['esaracin.crime_incidents'].count() - 2393)
        datasetFIO = repo['esaracin.fio_data'].find().skip(
            repo['esaracin.crime_incidents'].count() - 1522)
    else:
        dataset = repo['esaracin.crime_incidents'].find()
        datasetFIO = repo['esaracin.fio_data'].find()

    df_crime = pd.DataFrame(list(dataset))
    df_fios = pd.DataFrame(list(datasetFIO))
    url = 'http://datamechanics.io/data/district_racial_composition.csv'
    df_race = pd.read_csv(url)

    # Now we need to find the number of crimes/FIOs for each policing district.
    crime_by_district = {dist: 0 for dist in df_crime['DISTRICT'].unique()}
    del crime_by_district['A15']
    del crime_by_district['A1']
    crime_by_district['A1/A15'] = 0
    for index, row in df_crime.iterrows():
        if row['DISTRICT'] == 'A1' or row['DISTRICT'] == 'A15':
            crime_by_district['A1/A15'] += 1
        else:
            crime_by_district[row['DISTRICT']] += 1

    # Replace the former crime DataFrame with this new, filtered data.
    # Use it to join df_race on the district field.
    df_crime = pd.DataFrame.from_dict(crime_by_district, orient='index')
    df_race = df_race.join(df_crime, on='dist')
    new_columns = df_race.columns.values
    new_columns[-1] = 'Crime Count'
    df_race.columns = new_columns

    # Similarly, compute the number of FIOs in each district and join it with
    # our growing df_race table as well.
    fios_by_district = {dist: 0 for dist in df_race['dist']}
    for index, row in df_fios.iterrows():
        if row['DIST'] in fios_by_district:
            fios_by_district[row['DIST']] += 1
        elif row['DIST'] == 'A1' or row['DIST'] == 'A15':
            fios_by_district['A1/A15'] += 1

    df_fios = pd.DataFrame.from_dict(fios_by_district, orient='index')
    df_race = df_race.join(df_fios, on='dist')
    new_columns = df_race.columns.values
    new_columns[-1] = 'FIO Count'
    df_race.columns = new_columns

    # Normalize crime count and FIO count by the population of each district.
    for index, row in df_race.iterrows():
        df_race.loc[index, 'Crime Count'] /= row['population']
        df_race.loc[index, 'FIO Count'] /= row['population']

    # Now drop the categorical data before the regression.
    to_insert = df_race.to_json(orient='records')  # Save to insert later.
    districts = df_race['dist']
    df_race = df_race.drop('dist', axis=1).drop('dist_name', axis=1)
    df_race = df_race.drop('population', axis=1)

    # Run the regression with the number of FIOs in each district as the
    # output attribute.
    y_train = df_race['FIO Count']
    X_train = df_race.drop('FIO Count', axis=1)
    model = sm.OLS(y_train, X_train)
    results = model.fit()
    with open('Linear_Reg_Results.txt', 'w') as outfile:
        print(results.summary(), file=outfile)

    # Insert our race dataset.
    r = json.loads(to_insert)
    repo.dropCollection("race_data")
    repo.createCollection("race_data")
    repo['esaracin.race_data'].insert_many(r)
    repo['esaracin.race_data'].metadata({'complete': True})
    print(repo['esaracin.race_data'].metadata())
    repo.logout()

    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
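# A brief sketch, with made-up district numbers, of the same per-capita
# normalization done with vectorized pandas operations instead of iterrows();
# the column names match those used in execute() above, and the behaviour is
# equivalent for the columns involved.
def _normalize_by_population_sketch():
    import pandas as pd

    df = pd.DataFrame({'dist': ['B2', 'C11'],
                       'population': [45000, 60000],
                       'Crime Count': [900.0, 1200.0],
                       'FIO Count': [300.0, 480.0]})
    df[['Crime Count', 'FIO Count']] = df[['Crime Count', 'FIO Count']].div(
        df['population'], axis=0)
    print(df)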