Example #1
    def save(self, create_or_append):
        '''
        Saves scraped data to the database and to a DataFrame pickle. The user
        can choose whether to append to or overwrite the database table, but
        the DataFrame pickle is always overwritten.

        Args:
            create_or_append (string): 'create' or 'append'. Applies only to
            the database table; the DataFrame pickle is always overwritten.
        '''

        # Pickle the DataFrame as an extra backup (pickle requires binary mode)
        with open('my_df.pickle', 'wb') as f:
            pickle.dump(self.df, f)

        # Write to database
        if create_or_append == 'create':
            ut.drop_if_exists(self.conn, self.table_name)
            ut.prepare_table_w_textcols(self.df, self.table_name, self.conn,
                                        ['body', 'title'])
        elif create_or_append == 'append':
            pass
        else:
            raise ValueError("Please provide 'create' or 'append'")
        sql.to_sql(self.df,
                   self.table_name,
                   self.conn,
                   flavor="mysql",
                   if_exists="append")

        # Remove duplicates by adding a unique index on (miles, price, year).
        # The execute is left commented out, so the statement is only printed;
        # without IGNORE, this ALTER TABLE fails if duplicates already exist.
        cur = self.conn.cursor()
        cmd = "ALTER TABLE " + self.table_name + " ADD UNIQUE INDEX(miles,price,year)"
        print(cmd)
        #cur.execute(cmd)
        print(self.df)
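
A minimal usage sketch for the method above. The scraper class name, its
constructor, and the scrape() call are assumptions for illustration; only
save() and its 'create'/'append' argument come from the example itself.

scraper = CraigslistScraper()  # hypothetical class exposing df, conn, table_name
scraper.scrape()               # hypothetical method that populates scraper.df
scraper.save('create')         # drop and recreate the table, then insert rows
scraper.save('append')         # later runs add new rows to the same table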
Example #2
    def save(self, create_or_append):
        '''
        Saves scraped data to the database and to a DataFrame pickle. The user
        can choose whether to append to or overwrite the database table, but
        the DataFrame pickle is always overwritten.

        Args:
            create_or_append (string): 'create' or 'append'. Applies only to
            the database table; the DataFrame pickle is always overwritten.
        '''

        # Pickle the DataFrame as an extra backup (pickle requires binary mode)
        with open('my_df.pickle', 'wb') as f:
            pickle.dump(self.df, f)

        # Write to database
        if create_or_append == 'create':
            ut.drop_if_exists(self.conn, self.table_name)
            ut.prepare_table_w_textcols(self.df, self.table_name, self.conn,
                                        ['body', 'title'])
        elif create_or_append == 'append':
            pass
        else:
            raise ValueError("Please provide 'create' or 'append'")
        sql.write_frame(
            self.df,
            self.table_name,
            self.conn,
            flavor="mysql",
            if_exists="append")


        # Remove duplicate
        cur = self.conn.cursor()
        cmd = "ALTER IGNORE TABLE " + self.table_name + " ADD UNIQUE INDEX(miles,price,year)"
        print(cmd)
        cur.execute(cmd)
        print(self.df)
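
This variant actually executes the statement and uses ALTER IGNORE, which
silently drops rows that collide on the new unique index. MySQL 5.7 removed
IGNORE from ALTER TABLE, so on newer servers the same cleanup needs a
self-join delete; a sketch, assuming the table is named 'scraped' and has an
auto-increment id column (neither is guaranteed by the example above):

cur = conn.cursor()
cur.execute(
    "DELETE t1 FROM scraped t1 JOIN scraped t2 "
    "ON t1.miles = t2.miles AND t1.price = t2.price AND t1.year = t2.year "
    "WHERE t1.id > t2.id")  # keep the earliest row of each duplicate group
conn.commit()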
Example #3
def main():

    # Select all cars of desired models from database
    models = {'accord', 'camry', 'civic', 'corolla'}
    conn = MySQLdb.connect(
        user="******",
        passwd="",
        db="carsdb",
        cursorclass=MySQLdb.cursors.DictCursor)
    read_table_name = "scraped"
    cmd = "SELECT model, year, miles, price, url, body, title, date FROM " + \
        read_table_name + \
        " WHERE area='sfbay' AND model in " + str(tuple(models))
    full = pd.io.sql.read_frame(cmd, conn)

    # UTF-8 encoding
    full['body'] = full['body'].apply(make_unicode)
    full['title'] = full['title'].apply(make_unicode)

    # Exclude outliers
    full = exclude_uni_outliers(full, 'year', 1996, 2013)
    full = exclude_uni_outliers(full, 'miles', 1000, 210000)
    full = exclude_uni_outliers(full, 'price', 1000, 50000)
    full = exclude_biv_outliers(full, 'year', 'miles')

    # Only show most recent posts on DealSpotter
    full = full.sort('date', ascending=False)
    num_on_web = 150
    full['on_web'] = [i < num_on_web for i in range(len(full))]

    # Initialize DataFrame to keep track of savings
    delta_frame = DataFrame(columns=['url', 'delta'])

    # Loop through models (accord, camry, etc) and grow delta_frame
    for i, model in enumerate(models):
        print(model)

        # This model's subset of full dataframe
        df = full[full['model'] == model]
        df = df[['price', 'year', 'miles', 'url', 'date', 'on_web']]
        on_web = df['on_web']  # boolean mask: rows shown on the site
        not_on_web = ~df['on_web']  # boolean mask: rows held out for training

        # All training should be on cars not shown on DealSpotter
        feature_names = ['year', 'miles']
        features = df.ix[not_on_web, feature_names].values
        target = df.ix[not_on_web, 'price'].values

        predictor = RandomForestRegressor(
            n_estimators=100,
            min_samples_split=20)

        # If user just wants to do cross-validation on training data
        if sys.argv[1] == 'xval':
            # Exclude true test cases from cross-validation
            (train_idcs, test_idcs) = ut.get_xval_indcs(len(features), .8)
            predictor.fit(features[train_idcs, :], target[train_idcs])
            predictions = np.array(predictor.predict(features[test_idcs, :]))
            print('MAE = ' + str(get_mae(predictions, target[test_idcs])))

        # If user wants to make predictions for real test cars, shown on DealSpotter
        elif sys.argv[1] == 'real':

            # Extract true test data for DealSpotter
            web_features = df.ix[on_web, feature_names].values
            web_target = df.ix[on_web, 'price'].values

            # Fit model, make predictions
            predictor.fit(features, target)
            predictions = np.array(predictor.predict(web_features))
            delta = predictions - web_target
            model_delta_frame = DataFrame(
                {'url': df.ix[on_web, 'url'].values, 'delta': delta})
            delta_frame = delta_frame.append(model_delta_frame)

        else:
            raise ValueError("Please provide 'xval' or 'real'")

    # If user wants to make predictions for real test cars, shown on DealSpotter
    if sys.argv[1] == 'real':
        print(delta_frame)

        # Merge savings information with original data frame
        full = full.merge(delta_frame, on='url', how='inner')

        # Write to database
        write_table_name = 'priced'
        ut.drop_if_exists(conn, write_table_name)
        ut.prepare_table_w_textcols(
            full, write_table_name, conn, ['body', 'title'])
        pd.io.sql.write_frame(
            full,
            write_table_name,
            conn,
            flavor="mysql",
            if_exists="append")
Example #4
def main():

    # Select all cars of desired models from database
    models = {'accord', 'camry', 'civic', 'silverado'}
    conn = MySQLdb.connect(user="******",
                           passwd="",
                           db="carsdb",
                           cursorclass=MySQLdb.cursors.DictCursor)
    read_table_name = "scraped"
    cmd = "SELECT model, year, miles, price, url, body, title, date FROM " + \
        read_table_name + \
        " WHERE area='sfbay' AND model in " + str(tuple(models))
    full = pd.io.sql.read_sql(cmd, con=engine)
    print(full)

    # UTF-8 encoding
    #full['body'] = full['body'].apply(make_unicode)
    #full['title'] = full['title'].apply(make_unicode)

    # Exclude outliers
    #full = exclude_uni_outliers(full, 'year', 1996, 2013)
    #full = exclude_uni_outliers(full, 'miles', 1000, 210000)
    #full = exclude_uni_outliers(full, 'price', 1000, 50000)
    #full = exclude_biv_outliers(full, 'year', 'miles')
    print "we are fine till removing outliers"

    # Only show most recent posts on DealSpotter
    full = full.sort('date', ascending=False)
    num_on_web = 150
    full['on_web'] = [i < num_on_web for i in range(len(full))]
    print(full)

    # Initialize DataFrame to keep track of savings
    delta_frame = DataFrame(columns=['url', 'delta'])

    # Loop through models (accord, camry, etc) and grow delta_frame
    for i, model in enumerate(models):
        print(model)

        # This model's subset of full dataframe
        df = full[full['model'] == model]
        df = df[['price', 'year', 'miles', 'url', 'date', 'on_web']]
        on_web = df['on_web']  # boolean mask: rows shown on the site
        not_on_web = ~df['on_web']  # boolean mask: rows held out for training

        # All training should be on cars not shown on DealSpotter; training on
        # the on_web rows (as this version originally did) leaks the test set
        feature_names = ['year', 'miles']
        features = df.ix[not_on_web, feature_names].values
        target = df.ix[not_on_web, 'price'].values
        print("features:")
        print(features)

        predictor = RandomForestRegressor(n_estimators=100,
                                          min_samples_split=20)

        # If user just wants to do cross-validation on training data
        if sys.argv[1] == 'xval':
            # Exclude true test cases from cross-validation
            (train_idcs, test_idcs) = ut.get_xval_indcs(len(features), .8)
            predictor.fit(features[train_idcs, :], target[train_idcs])
            predictions = np.array(predictor.predict(features[test_idcs, :]))
            print('MAE = ' + str(get_mae(predictions, target[test_idcs])))

        # If user wants to make predictions for real test cars, shown on DealSpotter
        elif sys.argv[1] == 'real':

            # Extract true test data for DealSpotter
            web_features = df.ix[on_web, feature_names].values
            web_target = df.ix[on_web, 'price'].values

            # Fit model, make predictions
            predictor.fit(features, target)
            predictions = np.array(predictor.predict(web_features))
            delta = predictions - web_target
            model_delta_frame = DataFrame({
                'url': df.ix[on_web, 'url'].values,
                'delta': delta
            })
            delta_frame = delta_frame.append(model_delta_frame)

        else:
            raise ValueError("Please provide 'xval' or 'real'")

    # If user wants to make predictions for real test cars, shown on DealSpotter
    if sys.argv[1] == 'real':
        print(delta_frame)
        print "inside the secoind real arguemtn"
        # Merge savings information with original data frame
        full = full.merge(delta_frame, on='url', how='inner')

        # Write to database
        write_table_name = 'priced'
        ut.drop_if_exists(engine, write_table_name)
        ut.prepare_table_w_textcols(full, write_table_name, engine,
                                    ['body', 'title'])
        pd.io.sql.to_sql(full,
                         write_table_name,
                         engine,
                         index=False,
                         if_exists="append")