def populateData(radius, actualValue): #select all zipcodes from table query = "select distinct(zip_code) FROM dddm.plant_locations order by zip_code asc" data = pd.read_sql(query, md.connect()) allzipList = data['zip_code'].tolist() print(len(allzipList)) #select inserted zipcodes query2 = "SELECT distinct(zip) FROM dddm.model_data" q2data = pd.read_sql(query2, md.connect()) storedZip = q2data['zip'].tolist() print(len(storedZip)) for zipcode in allzipList: if not zipcode in storedZip: updatedZip = str(int(zipcode)) if len(str(int(zipcode))) == 3: updatedZip = str(0) + str(0) + updatedZip elif len(str(int(zipcode))) == 4: updatedZip = str(0) + updatedZip model.addToTable(updatedZip, radius, actualValue) print("Processed " + updatedZip) #populateData(50, 'Y')
def test(): #fetch all the zipcodes for project query = "SELECT Zip FROM dddm.test_zips" zip_df = pd.read_sql(query, md.connect()) zip_list = zip_df['Zip'].tolist() #build a dataframe and push to table df = pd.DataFrame(columns=['zip','seaport','landprice','oilreserve', 'existingplants','disasters','railroad', 'populationdensity', 'elevation', 'water', 'weather', 'rules','earthquake','prediction']) for zipcode in zip_list: result = app.app(zipcode, 50) listData = pd.DataFrame([[zipcode, result.prediction_df['seaport'], result.prediction_df['landprice'], result.prediction_df['oilreserve'], result.prediction_df['existingplants'], result.prediction_df['disasters'], result.prediction_df['railroad'], result.prediction_df['populationdensity'], result.elevation_data, result.water_data, result.weather_data, result.rules, result.earthquake_data, result.prediction]], columns=['zip','seaport','landprice','oilreserve', 'existingplants','disasters','railroad', 'populationdensity', 'elevation', 'water', 'weather', 'rules','earthquake','prediction']) df = df.append(listData, ignore_index=True) df.to_sql(name='test_data', con=md.connect(), if_exists='append', index=False)
def import_water_data(file_name): data = pd.read_csv('resources/' + file_name) data = data[[ 'MonitoringLocationTypeName', 'LatitudeMeasure', 'LongitudeMeasure' ]] #data.to_sql(name='water_locations', con=dbEngine, if_exists = 'replace') md.create_table(md.connect(), data, 'water_locations')
def import_weather(file_name): df_weather = pd.read_csv("resources/" + file_name, low_memory=False) df_weather = df_weather[[ 'StationName', 'Date', 'ObsType', 'Value', 'S-Flag', 'City', 'State' ]] md.create_table(md.connect(), df_weather, 'weather_observations')
def populateData(radius, actualValue): #select all zipcodes from table query = "select Zip FROM dddm.test_zips" data = pd.read_sql(query, md.connect()) allzipList = data['Zip'].tolist() print(len(allzipList)) #select inserted zipcodes #query2 = "SELECT distinct(zip) FROM dddm.test_zip_data" #q2data = pd.read_sql(query2, md.connect()) #storedZip = q2data['zip'].tolist() #print(len(storedZip)) for zipcode in allzipList: updatedZip = str(int(zipcode)) if len(str(int(zipcode))) == 3: updatedZip = str(0) + str(0) + updatedZip elif len(str(int(zipcode))) == 4: updatedZip = str(0) + updatedZip model.addToTestTable(updatedZip, radius, actualValue) print("Processed " + updatedZip) #populateData(50, 'None')
def main(file_name): initial = pd.read_csv('resources/' + file_name) initial['city'] = initial['city'].str.replace(' ', '').str.upper() seperatedZips = (initial['zip'].str.strip()).str.split(expand=True) fullData = pd.concat([initial, seperatedZips], axis=1) fullData = fullData.drop(['zip'], axis=1) idvars = [ 'city', 'state_id', 'state_name', 'county_name', 'lat', 'lng', 'population' ] allZips = pd.melt(fullData, id_vars=idvars, value_name='zip') allZips = allZips.drop(['variable'], axis=1) # Drops columns with missing zip code values allZips = allZips[pd.notnull(allZips.zip)] #engine = create_engine('mysql+pymysql://pythonUser:abc@localhost:3306/dddm?charset=utf8', encoding='utf-8') #allZips.to_sql(name='zip_lookup', con=engine, if_exists = 'replace') md.create_table(md.connect(), allZips, 'zip_lookup') """ Zip Code lookup table complete, Ready to be joined """ """
def import_oil_reserve(name): page = requests.get(name) soup = BeautifulSoup(page.content, 'lxml') table = soup.find('table', attrs={'class': 'data1'}) rows = table.findAll('tr', attrs={'class': 'DataRow'}) values = [] for tr in rows: state = tr.find('td', attrs={'class': 'DataStub1'}).get_text() otherYear = tr.findAll('td', attrs={'class': 'DataB'}) y11 = otherYear[0].get_text() y12 = otherYear[1].get_text() y13 = otherYear[2].get_text() y14 = otherYear[3].get_text() y15 = otherYear[4].get_text() current = tr.find('td', attrs={'class': 'Current2'}).get_text() values.append(OilReserveData(state, y11, y12, y13, y14, y15, current)) df = pd.DataFrame.from_records([s.to_dict() for s in values]) md.create_table(md.connect(), df, 'oil_reserve')
def import_land_prices(file_name): df_landprices = pd.read_excel('resources/' + file_name, skiprows=[0], parse_cols="A,B,C,D,E,H,I") df_landprices = df_landprices.loc[df_landprices['Date'] == '2015Q4'] df_landprices['MSA'] = df_landprices.MSA.str.replace(' ', '') #df_landprices.to_sql(name='land_prices', con=dbEngine, index=False, if_exists = 'replace') md.create_table(md.connect(), df_landprices, 'land_prices')
def import_seaports(file_name): df_ports = pd.read_csv("resources/" + file_name, low_memory=False) df_ports = df_ports[[ 'LATITUDE1', 'LONGITUDE1', 'CITY_OR_TO', 'STATE_POST', 'ZIPCODE', 'PORT_NAME' ]] #df_ports.to_sql(name='seaports', con=dbEngine, if_exists = 'replace') md.create_table(md.connect(), df_ports, 'seaports')
def build_adaboost_model(): datafull = pd.read_sql_table('model_data', md.connect()) data = datafull[[ 'seaport', 'landprice', 'oilreserve', 'existingplants', 'disasters', 'railroad', 'populationdensity', 'elevation' ]] target = datafull[['actual']] x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=3) cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=3) AB_clf = AdaBoostClassifier(n_estimators=500, random_state=3) AB_clf.fit(x_train, y_train['actual']) AB_predicted = cross_val_predict(AB_clf, data, target['actual']) AB_scores = cross_val_score(AB_clf, data, target.values.ravel(), cv=cv) AB_confusion = confusion_matrix(target, AB_predicted) model_names.append('AdaBoost') train_accuracy = accuracy_score(y_train, AB_clf.predict(x_train)) train_accuracies.append(train_accuracy) test_accuracy = accuracy_score(y_test, AB_clf.predict(x_test)) test_accuracies.append(test_accuracy) cv_score = accuracy_score(target, AB_predicted) cv_scores.append(cv_score) print("AdaBoost Classifier Model: ") print(" Score of {} for training set: {:.4f}.".format( AB_clf.__class__.__name__, train_accuracy)) print(" Score of {} for test set: {:.4f}.".format( AB_clf.__class__.__name__, test_accuracy)) print(" Cross validation score: %0.2f (+/- %0.2f)" % (AB_scores.mean(), AB_scores.std() * 2)) print(" Predicted values accuracy: %0.2f" % (accuracy_score(target, AB_predicted))) plot_confusion_matrix(AB_confusion, class_names) plt.show() print(classification_report(target, AB_predicted)) print() importances = AB_clf.feature_importances_ indices = np.argsort(importances)[::-1] print('Feature Ranking: ') for i in range(0, 6): print("{} feature no.{} ({})".format(i + 1, indices[i], importances[indices[i]])) return AB_clf
def import_existing_plants(file_name): plant_locations = pd.read_csv('resources/' + file_name) plant_locations = plant_locations[[ 'Facility Name', 'Deregistered (Yes/No)', 'City', 'State', 'Zip Code', 'Parent Company', 'Latitude', 'Longitude', 'Number of RMP Submissions' ]] plant_locations = plant_locations[plant_locations.State.notnull()] #plant_locations.to_sql(name='plant_locations', con=dbEngine, if_exists = 'replace') md.create_table(md.connect(), plant_locations, 'plant_locations')
def build_gaussian_model(): #class_names = ['No', 'Yes'] datafull = pd.read_sql_table('model_data', md.connect()) data = datafull[[ 'seaport', 'landprice', 'oilreserve', 'existingplants', 'disasters', 'railroad', 'populationdensity', 'elevation' ]] target = datafull[['actual']] x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=3) cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=3) GNB_clf = GaussianNB() GNB_clf.fit(x_train, y_train['actual']) GNB_predicted = cross_val_predict(GNB_clf, data, target['actual']) GNB_scores = cross_val_score(GNB_clf, data, target.values.ravel(), cv=cv) GNB_confusion = confusion_matrix(target, GNB_predicted) model_names.append('GaussianNB') train_accuracy = accuracy_score(y_train, GNB_clf.predict(x_train)) train_accuracies.append(train_accuracy) test_accuracy = accuracy_score(y_test, GNB_clf.predict(x_test)) test_accuracies.append(test_accuracy) cv_score = accuracy_score(target, GNB_predicted) cv_scores.append(cv_score) print("GaussianNB Model: ") print(" Score of {} for training set: {:.4f}.".format( GNB_clf.__class__.__name__, train_accuracy)) print(" Score of {} for test set: {:.4f}.".format( GNB_clf.__class__.__name__, test_accuracy)) print(" Cross validation score: %0.2f (+/- %0.2f)" % (GNB_scores.mean(), GNB_scores.std() * 2)) print(" Predicted values accuracy: %0.2f" % (cv_score)) #use in slideshow, not in demo plot_confusion_matrix(GNB_confusion, class_names) plt.show() print(classification_report(target, GNB_predicted)) print() return GNB_clf
def fetch_weather_data(zipcode): engine = md.connect() zipcode = str(int(zipcode)) #print(zipcode) query1 = "SELECT * FROM dddm.zip_lookup where zip = '" + zipcode + "' OR zip='0" + zipcode +"' OR zip = '00" + zipcode +"'" zip_data = pd.read_sql(query1, engine) state = str(zip_data['state_id']) query2 = "SELECT * FROM dddm.weather_observations where State = '" + state + "'" weather_data = pd.read_sql(query2, engine) return weather_data
def import_earthquakes(): df_earthquakes = pd.read_csv("resources/USEarthquakes.csv", low_memory=False) df_earthquakes = df_earthquakes[[ 'time', 'latitude', 'longitude', 'mag', 'magType', 'place' ]] df_earthquakes2 = pd.read_csv("resources/AKEarthquakes.csv", low_memory=False) df_earthquakes2 = df_earthquakes2[[ 'time', 'latitude', 'longitude', 'mag', 'magType', 'place' ]] df_earthquakes.append(df_earthquakes2) md.create_table(md.connect(), df_earthquakes, 'earthquake_data')
def build_random_forest_model(): datafull = pd.read_sql_table('model_data', md.connect()) data = datafull[[ 'seaport', 'landprice', 'oilreserve', 'existingplants', 'disasters', 'railroad', 'populationdensity', 'elevation' ]] target = datafull[['actual']] x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=3) cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=3) RF_clf = RandomForestClassifier(n_estimators=200, random_state=3) RF_clf.fit(x_train, y_train['actual'].values.ravel()) RF_predicted = cross_val_predict(RF_clf, data, target['actual']) RF_scores = cross_val_score(RF_clf, data, target, cv=cv) RF_confusion = confusion_matrix(target, RF_predicted) model_names.append('RandomForest') train_accuracy = accuracy_score(y_train, RF_clf.predict(x_train)) train_accuracies.append(train_accuracy) test_accuracy = accuracy_score(y_test, RF_clf.predict(x_test)) test_accuracies.append(test_accuracy) cv_score = accuracy_score(target, RF_predicted) cv_scores.append(cv_score) print("Random Forest Classifier Model: ") print(" Score of {} for training set: {:.4f}.".format( RF_clf.__class__.__name__, train_accuracy)) print(" Score of {} for test set: {:.4f}.".format( RF_clf.__class__.__name__, test_accuracy)) print(" Cross validation score: %0.2f (+/- %0.2f)" % (RF_scores.mean(), RF_scores.std() * 2)) print(" Predicted values accuracy: %0.2f" % (accuracy_score(target, RF_predicted))) plot_confusion_matrix(RF_confusion, class_names) plt.show() print(classification_report(target, RF_predicted)) print() return RF_clf
def import_rules(file_name): data = open('resources/' + file_name).read() sent = sent_tokenize(data) #print(sent) #clean sentence and filter based on keywords keywords = ['well', 'wells', 'oil', 'drill', 'drilled'] sentence_list = [] for sentence in sent: sentence = sentence.replace("\n", "") for word in keywords: if word in sentence: sentence_list.append(sentence) break d = {'rule': sentence_list} df = pd.DataFrame(data=d) df.to_sql('rules', md.connect(), schema='dddm')
def normalize_all(): engine = md.connect() #normalize land prices df = pd.read_sql_table('land_prices_final', engine) df['home_value_norm'] = (df['Home Value'] - df['Home Value'].min()) / ( df['Home Value'].max() - df['Home Value'].min()) df['structure_cost_norm'] = ( df['Structure Cost'] - df['Structure Cost'].min()) / ( df['Structure Cost'].max() - df['Structure Cost'].min()) md.create_table(engine, df, 'land_prices_final') #normalize oil reserve data df = pd.read_sql_table('oil_reserve_final', engine) df['year16'] = df['year16'].str.replace(',', '').astype(float) df['year16_norm'] = (df['year16'] - df['year16'].min()) / ( df['year16'].max() - df['year16'].min()) md.create_table(engine, df, 'oil_reserve_final') #normalize disaster data df = pd.read_sql_table('disaster_data_final', engine) df['NumFireReferences_norm'] = ( df['NumFireReferences'] - df['NumFireReferences'].min()) / ( df['NumFireReferences'].max() - df['NumFireReferences'].min()) df['NumFloodReferences_norm'] = ( df['NumFloodReferences'] - df['NumFloodReferences'].min()) / ( df['NumFloodReferences'].max() - df['NumFloodReferences'].min()) df['NumHurricaneReferences_norm'] = ( df['NumHurricaneReferences'] - df['NumHurricaneReferences'].min()) / ( df['NumHurricaneReferences'].max() - df['NumHurricaneReferences'].min()) md.create_table(engine, df, 'disaster_data_final') #normalize railroad data df = pd.read_sql_table('railroad_data_final', engine) df['Tons_norm'] = (df['Tons'] - df['Tons'].min()) / (df['Tons'].max() - df['Tons'].min()) md.create_table(engine, df, 'railroad_data_final') #normalize population density data df = pd.read_sql_table('population_density', engine) df['density_norm'] = (df['density'] - df['density'].min()) / ( df['density'].max() - df['density'].min()) md.create_table(engine, df, 'population_density_final')
def fetch_earthquake_data(zipcode): engine = md.connect() #fetch latitude and longitude for zipcode zipcode = str(int(zipcode)) #print(zipcode) query1 = "SELECT * FROM dddm.zip_lookup where zip = '" + zipcode + "' OR zip='0" + zipcode +"' OR zip = '00" + zipcode +"' LIMIT 0,1" zip_data = pd.read_sql(query1, engine) coord = 1.0 lat_range1 = str(int(zip_data['lat']) + coord) lat_range2 = str(int(zip_data['lat']) - coord) lng_range1 = str(int(zip_data['lng']) + coord) lng_range2 = str(int(zip_data['lng']) - coord) query2 = "SELECT * from dddm.earthquake_data where latitude BETWEEN '" + lat_range2 + "' and '" + lat_range1 + "' AND longitude BETWEEN '" \ + lng_range2 + "' and '" + lng_range1 + "'" earthquake_data = pd.read_sql(query2, engine) return earthquake_data
def fetch_water_data(zipcode): engine = md.connect() zipcode = str(int(zipcode)) #print(zipcode) query1 = "SELECT * FROM dddm.zip_lookup where zip = '" + zipcode + "' OR zip='0" + zipcode +"' OR zip = '00" + zipcode +"' LIMIT 0,1" zip_data = pd.read_sql(query1, engine) coord = 0.5 lat_range1 = str(int(zip_data['lat']) + coord) lat_range2 = str(int(zip_data['lat']) - coord) lng_range1 = str(int(zip_data['lng']) + coord) lng_range2 = str(int(zip_data['lng']) - coord) query2 = "SELECT * FROM dddm.water_locations where LatitudeMeasure BETWEEN '" \ + str(lat_range2) + "' and '" + str(lat_range1) + "' AND LongitudeMeasure BETWEEN '" \ + str(lng_range2) + "' and '" + str(lng_range1) + "'" water_data = pd.read_sql(query2, engine) return water_data
def fetch_all_zip(): query = "SELECT zip FROM dddm.test_zip_data_all" zip_df = pd.read_sql(query, md.connect()) zip_list = zip_df['zip'].tolist() return zip_list
# -*- coding: utf-8 -*- """ Created on Sun Apr 8 16:21:48 2018 @author: Cameron """ import pandas as pd import mysqlConnection as md df = pd.read_csv('resources/population_density.csv', sep=",") md.create_table(md.connect(), df, 'population_density')
def read_state_codes(file_name): state_code = pd.read_csv('resources/' + file_name, sep='|') print(state_code.head(5)) md.create_table(md.connect(), state_code, 'state_codes')
# -*- coding: utf-8 -*- """ Created on Thu Apr 12 17:29:35 2018 @author: Cameron """ import mysqlConnection as md import pandas as pd def add_all_data(engine): df = pd.read_csv('exported_data/model_data.csv') df.to_sql(name='model_data', con=engine, if_exists='append', index=False) def add_test_data(engine): df = pd.read_csv('exported_data/test_data.csv') df.to_sql(name='test_zip_data', con=engine, if_exists='append', index=False) """ Comment out the data already imported """ engine = md.connect() add_all_data(engine) add_test_data(engine)
southdakota = len(lines) hursd = len(hurlines) floodsd = len(floodlines) states = [ 'California', 'Louisiana', 'New York', 'Alaska', 'Texas', 'North Carolina', 'Ohio', 'Massachusetts', 'Utah', 'South Dakota' ] firelengths = [ california, louisiana, newyork, alaska, texas, northcarolina, ohio, massachusetts, utah, southdakota ] hurlengths = [ hurca, hurla, hurny, hurak, hurtx, hurnc, huroh, hurma, hurut, hursd ] floodlengths = [ floodca, floodla, floodny, floodak, floodtx, floodnc, floodoh, floodma, floodut, floodsd ] statecodes = ['CA', 'LA', 'NY', 'AK', 'TX', 'NC', 'OH', 'MA', 'UT', 'SD'] df_disaster = pd.DataFrame({ 'State': states, 'StateCode': statecodes, 'NumFireReferences': firelengths, 'NumHurricaneReferences': hurlengths, 'NumFloodReferences': floodlengths }) md.create_table(md.connect(), df_disaster, 'disaster_data')
def fetch_rules(): engine = md.connect() query = "SELECT * FROM dddm.rules where rule like '%%General%%'" rules_data = pd.read_sql(query, engine) return rules_data['rule'].iloc[0]
def import_smaller_zips(file_name): data = pd.read_csv('resources/' + file_name) data = data[['City', 'State', 'Zip']] #data.to_sql(name='water_locations', con=dbEngine, if_exists = 'replace') md.create_table(md.connect(), data, 'test_zips')
def connect(): return md.connect()
def import_land_prices(file_name): df_landprices = pd.read_excel('resources/' + file_name) md.create_table(md.connect(), df_landprices, 'railroad_data')
# -*- coding: utf-8 -*- """ Created on Thu Apr 12 13:29:55 2018 @author: Beth """ import pandas as pd import mysqlConnection as md df = pd.read_csv('resources/BadZipCodes.csv', sep=",") md.create_table(md.connect(), df, 'unfavorable_zipcodes')