def save(self, create_or_append):
    '''Saves scraped data to the database and to a DataFrame pickle.

    The user can choose whether to append to or overwrite the database
    table, but the DataFrame pickle ('my_df.pickle') is always
    overwritten.

    Args:
        create_or_append: 'create' or 'append'. Applies only to the
            database table; the DataFrame pickle is always overwritten
            (string).

    Raises:
        ValueError: if create_or_append is neither 'create' nor 'append'.
    '''
    # Pickle DataFrame as extra backup.
    # BUG FIX: pickle is a binary protocol, so the file must be opened in
    # 'wb' mode (text 'w' corrupts the stream / fails on Python 3), and a
    # context manager guarantees the handle is closed.
    with open('my_df.pickle', 'wb') as pickle_file:
        pickle.dump(self.df, pickle_file)

    # Write to database
    if create_or_append == 'create':
        ut.drop_if_exists(self.conn, self.table_name)
        ut.prepare_table_w_textcols(self.df, self.table_name, self.conn,
                                    ['body', 'title'])
    elif create_or_append == 'append':
        pass  # table is assumed to already exist; rows are appended below
    else:
        raise ValueError("Please provide 'create' or 'append'")
    sql.to_sql(self.df, self.table_name, self.conn, flavor="mysql",
               if_exists="append")

    # Remove duplicates via a unique index on (miles, price, year).
    cur = self.conn.cursor()
    cmd = "ALTER TABLE " + self.table_name + " ADD UNIQUE INDEX(miles,price,year)"
    print(cmd)
    # cur.execute(cmd)  # NOTE(review): left disabled, as in the original
    print(self.df)
def save(self, create_or_append):
    '''Saves scraped data to the database and to a DataFrame pickle.

    The user can choose whether to append to or overwrite the database
    table, but the DataFrame pickle ('my_df.pickle') is always
    overwritten.

    Args:
        create_or_append: 'create' or 'append'. Applies only to the
            database table; the DataFrame pickle is always overwritten
            (string).

    Raises:
        ValueError: if create_or_append is neither 'create' nor 'append'.
    '''
    # Pickle DataFrame as extra backup.
    # BUG FIX: pickle is a binary protocol, so the file must be opened in
    # 'wb' mode (text 'w' corrupts the stream / fails on Python 3), and a
    # context manager guarantees the handle is closed.
    with open('my_df.pickle', 'wb') as pickle_file:
        pickle.dump(self.df, pickle_file)

    # Write to database
    if create_or_append == 'create':
        ut.drop_if_exists(self.conn, self.table_name)
        ut.prepare_table_w_textcols(self.df, self.table_name, self.conn,
                                    ['body', 'title'])
    elif create_or_append == 'append':
        pass  # table is assumed to already exist; rows are appended below
    else:
        raise ValueError("Please provide 'create' or 'append'")
    sql.write_frame(self.df, self.table_name, self.conn, flavor="mysql",
                    if_exists="append")

    # Remove duplicates: ALTER IGNORE drops rows that violate the new
    # unique index on (miles, price, year). NOTE(review): ALTER IGNORE is
    # MySQL-specific and was removed in MySQL 5.7 -- confirm server version.
    cur = self.conn.cursor()
    cmd = ("ALTER IGNORE TABLE " + self.table_name +
           " ADD UNIQUE INDEX(miles,price,year)")
    print(cmd)
    cur.execute(cmd)
    print(self.df)
def main():
    """Price used cars with a per-model random forest and store the results.

    Reads scraped sfbay listings for the selected models from the
    'scraped' MySQL table, cleans and filters them, and fits a
    RandomForestRegressor per model on listings NOT shown on DealSpotter.
    Depending on the command-line mode it either cross-validates on the
    training data ('xval') or predicts savings ('delta') for the listings
    shown on the site ('real') and writes the merged result to the
    'priced' table.

    Command line:
        sys.argv[1] -- 'xval' or 'real'

    Raises:
        ValueError: if the mode argument is missing or unrecognized.
    """
    # Fail fast on a missing/bad argument instead of raising IndexError
    # (or ValueError only after expensive DB + model work) inside the loop.
    if len(sys.argv) < 2 or sys.argv[1] not in ('xval', 'real'):
        raise ValueError("Please provide 'xval' or 'real'")
    mode = sys.argv[1]

    # Select all cars of desired models from database
    models = {'accord', 'camry', 'civic', 'corolla'}
    conn = MySQLdb.connect(
        user="******", passwd="", db="carsdb",
        cursorclass=MySQLdb.cursors.DictCursor)
    read_table_name = "scraped"
    cmd = "SELECT model, year, miles, price, url, body, title, date FROM " + \
        read_table_name + \
        " WHERE area='sfbay' AND model in " + str(tuple(models))
    full = pd.io.sql.read_frame(cmd, conn)  # legacy pandas sql API

    # UTF-8 encoding
    full['body'] = full['body'].apply(make_unicode)
    full['title'] = full['title'].apply(make_unicode)

    # Exclude outliers (hard-coded plausibility bounds on year/miles/price)
    full = exclude_uni_outliers(full, 'year', 1996, 2013)
    full = exclude_uni_outliers(full, 'miles', 1000, 210000)
    full = exclude_uni_outliers(full, 'price', 1000, 50000)
    full = exclude_biv_outliers(full, 'year', 'miles')

    # Only show most recent posts on DealSpotter: sort newest-first, then
    # mark the first num_on_web rows. (legacy DataFrame.sort API)
    full = full.sort('date', ascending=False)
    num_on_web = 150
    # A comparison already yields a bool; no need for `True if ... else False`.
    full['on_web'] = [i < num_on_web for i in range(len(full))]

    # Initialize DataFrame to keep track of savings
    delta_frame = DataFrame(columns=['url', 'delta'])

    # Loop through models (accord, camry, etc) and grow delta_frame
    for model in models:
        print(model)
        # This model's subset of the full dataframe
        df = full[full['model'] == model]
        df = df[['price', 'year', 'miles', 'url', 'date', 'on_web']]
        on_web = df['on_web']        # boolean mask: rows shown on the site
        not_on_web = ~df['on_web']   # boolean mask: training rows

        # All training should be on cars not shown on DealSpotter
        feature_names = ['year', 'miles']
        features = df.ix[not_on_web, feature_names].values
        target = df.ix[not_on_web, 'price'].values
        predictor = RandomForestRegressor(n_estimators=100,
                                          min_samples_split=20)

        if mode == 'xval':
            # Cross-validation on training data only; true test cases
            # (on_web rows) are already excluded from `features`.
            (train_idcs, test_idcs) = ut.get_xval_indcs(len(features), .8)
            predictor.fit(features[train_idcs, :], target[train_idcs])
            predictions = np.array(predictor.predict(features[test_idcs, :]))
            print('MAE = ' + str(get_mae(predictions, target[test_idcs])))
        else:  # mode == 'real': predict for cars shown on DealSpotter
            # Extract true test data for DealSpotter
            web_features = df.ix[on_web, feature_names].values
            web_target = df.ix[on_web, 'price'].values
            # Fit model, make predictions; delta > 0 means predicted price
            # exceeds asking price (a potential deal).
            predictor.fit(features, target)
            predictions = np.array(predictor.predict(web_features))
            delta = predictions - web_target
            model_delta_frame = DataFrame(
                {'url': df.ix[on_web, 'url'].values, 'delta': delta})
            delta_frame = delta_frame.append(model_delta_frame)

    # Persist predictions only in 'real' mode
    if mode == 'real':
        print(delta_frame)
        # Merge savings information with original data frame
        full = full.merge(delta_frame, on='url', how='inner')
        # Write to database
        write_table_name = 'priced'
        ut.drop_if_exists(conn, write_table_name)
        ut.prepare_table_w_textcols(full, write_table_name, conn,
                                    ['body', 'title'])
        pd.io.sql.write_frame(full, write_table_name, conn,
                              flavor="mysql", if_exists="append")
def main():
    """Price used cars with a per-model random forest (debug variant).

    Same pipeline as the other `main`: read scraped sfbay listings from
    the 'scraped' MySQL table, mark the newest rows as shown on the web,
    fit a RandomForestRegressor per model, then either cross-validate
    ('xval') or predict savings and write them to the 'priced' table
    ('real'). In this variant the outlier filtering and unicode cleanup
    are commented out and diagnostic prints are enabled.

    Command line:
        sys.argv[1] -- 'xval' or 'real'

    Raises:
        ValueError: if the mode argument is missing or unrecognized.
    """
    # Fail fast on a missing/bad argument instead of raising IndexError
    # deep inside the per-model loop.
    if len(sys.argv) < 2 or sys.argv[1] not in ('xval', 'real'):
        raise ValueError("Please provide 'xval' or 'real'")
    mode = sys.argv[1]

    # Select all cars of desired models from database
    models = {'accord', 'camry', 'civic', 'silverado'}
    conn = MySQLdb.connect(user="******", passwd="", db="carsdb",
                           cursorclass=MySQLdb.cursors.DictCursor)
    read_table_name = "scraped"
    cmd = "SELECT model, year, miles, price, url, body, title, date FROM " + \
        read_table_name + \
        " WHERE area='sfbay' AND model in " + str(tuple(models))
    # BUG FIX: the original passed an undefined name `engine` here (and in
    # the write path below), which raises NameError -- only `conn` is ever
    # created in this function, so use it consistently.
    full = pd.io.sql.read_sql(cmd, con=conn)
    print(full)

    # UTF-8 encoding -- disabled during debugging
    # full['body'] = full['body'].apply(make_unicode)
    # full['title'] = full['title'].apply(make_unicode)

    # Exclude outliers -- disabled during debugging
    # full = exclude_uni_outliers(full, 'year', 1996, 2013)
    # full = exclude_uni_outliers(full, 'miles', 1000, 210000)
    # full = exclude_uni_outliers(full, 'price', 1000, 50000)
    # full = exclude_biv_outliers(full, 'year', 'miles')
    print("we are fine till removing outliers")

    # Only show most recent posts on DealSpotter: sort newest-first, then
    # mark the first num_on_web rows. (legacy DataFrame.sort API)
    full = full.sort('date', ascending=False)
    num_on_web = 150
    # A comparison already yields a bool; no need for `True if ... else False`.
    full['on_web'] = [i < num_on_web for i in range(len(full))]
    print(full)

    # Initialize DataFrame to keep track of savings
    delta_frame = DataFrame(columns=['url', 'delta'])

    # Loop through models (accord, camry, etc) and grow delta_frame
    for model in models:
        print(model)
        # This model's subset of the full dataframe
        df = full[full['model'] == model]
        df = df[['price', 'year', 'miles', 'url', 'date', 'on_web']]
        on_web = df['on_web']        # boolean mask: rows shown on the site
        not_on_web = ~df['on_web']   # boolean mask: rows NOT on the site

        # NOTE(review): the comment in the sibling `main` says training
        # should use the not_on_web rows, but this debug variant trains on
        # the on_web rows (the not_on_web version is commented out) and in
        # 'real' mode therefore predicts on its own training set -- confirm
        # this is intentional before shipping.
        # feature_names = ['year', 'miles']
        # features = df.ix[not_on_web, feature_names].values
        # target = df.ix[not_on_web, 'price'].values
        feature_names = ['year', 'miles']
        features = df.ix[on_web, feature_names].values
        target = df.ix[on_web, 'price'].values
        print("These are the features::::::::::::::::::::")
        print(features)
        predictor = RandomForestRegressor(n_estimators=100,
                                          min_samples_split=20)

        if mode == 'xval':
            # Cross-validation: hold out 20% of the (debug) training rows
            (train_idcs, test_idcs) = ut.get_xval_indcs(len(features), .8)
            predictor.fit(features[train_idcs, :], target[train_idcs])
            predictions = np.array(predictor.predict(features[test_idcs, :]))
            print('MAE = ' + str(get_mae(predictions, target[test_idcs])))
        else:  # mode == 'real': predict for cars shown on DealSpotter
            # Extract true test data for DealSpotter
            web_features = df.ix[on_web, feature_names].values
            web_target = df.ix[on_web, 'price'].values
            # Fit model, make predictions; delta > 0 means predicted price
            # exceeds asking price (a potential deal).
            predictor.fit(features, target)
            predictions = np.array(predictor.predict(web_features))
            delta = predictions - web_target
            model_delta_frame = DataFrame({
                'url': df.ix[on_web, 'url'].values,
                'delta': delta
            })
            delta_frame = delta_frame.append(model_delta_frame)

    # Persist predictions only in 'real' mode
    if mode == 'real':
        print(delta_frame)
        print("inside the secoind real arguemtn")
        # Merge savings information with original data frame
        full = full.merge(delta_frame, on='url', how='inner')
        # Write to database (BUG FIX: `engine` was undefined; use `conn`)
        write_table_name = 'priced'
        ut.drop_if_exists(conn, write_table_name)
        ut.prepare_table_w_textcols(full, write_table_name, conn,
                                    ['body', 'title'])
        pd.io.sql.to_sql(full, write_table_name, conn, index=False,
                         if_exists="append")