def main(argv):
    """Main entry-point for scraper application.

    Arguments:
        argv: list of command line arguments. Must contain 4 elements:
            [0]: script name
            [1]: city
            [2]: state
            [3]: run option (search, scrape, or both)

    Raises:
        ValueError: if city, state, and run option are not all provided.
    """
    if len(argv) != 4:
        # Fixed typo in the user-facing message ("ruun" -> "run").
        raise ValueError('Must provide city, state, and run option')
    city = argv[1]
    state = argv[2]
    run_option = argv[3]
    zipcodes = get_zips(city, state)
    mongoclient = get_mongoclient()
    # Dispatch on the requested run option; the options are mutually
    # exclusive, so elif replaces the original chain of independent ifs.
    if run_option == 'search':
        run_search(city, zipcodes, mongoclient)
    elif run_option == 'scrape':
        scrape_content(zipcodes, mongoclient)
    elif run_option == 'both':
        run_both(city, zipcodes, mongoclient)
def main(argv):
    """Main entry-point for batchclean."""
    # 'remove' mode additionally deletes listings that have disappeared.
    remove = len(argv) > 1 and argv[1] == 'remove'
    logger = get_configured_logger('DEBUG', __name__)
    client = get_mongoclient()
    logger.info('Retrieved mongoclient.')
    listings = queryforlistings(client)
    logger.info('Found {} listings.'.format(listings.count()))
    if remove:
        removed = findremoved(listings)
        logger.info(
            '{} listings have been removed, deleting from DB.'.format(
                len(removed)))
        removelistings(client, removed)
        # Re-query so the cleaning pass only sees surviving listings.
        listings = queryforlistings(client)
    attrs, units = cleanlistings(listings)
    logger.info(
        'Observed {} unique attributes while cleaning.'.format(len(attrs)))
    logger.info('Processed {} units'.format(len(units)))
    writeunitstomongo(client, units)
    writeattrstomongo(client, attrs)
def main():
    """Mark every listing as unparsed and drop all derived unit documents."""
    client = get_mongoclient()
    listings = client.scraper.listing
    # Flag each listing so the parser will re-process its content.
    for doc in listings.find():
        listings.update_one({'_id': doc['_id']},
                            {'$set': {'content_parsed': False}})
    # Units are derived from parsed content, so discard them wholesale.
    client.scraper.unit.drop()
def main():
    """Train and evaluate a random-forest price model on preprocessed units."""
    client = get_mongoclient()
    prep = Preprocessor(client)
    features = prep.getfeatures()
    labels = prep.getlabels()
    # NOTE(review): n_jobs=36 assumes a large multi-core host — confirm.
    regressor = RandomForestRegressor(n_estimators=1000,
                                      criterion='mae',
                                      n_jobs=36,
                                      verbose=2)
    evaluator = Evaluator(regressor, features, labels)
    evaluator.evaluate()
    print('Ran on {} units...'.format(features.shape[0]))
    print(evaluator.results())
def gettrainingdata():
    """Load unit documents from Mongo and return (X, y) for model training.

    Returns:
        X: DataFrame of feature columns (every column except metadata/label).
        y: Series of prices (the regression target).
    """
    client = get_mongoclient()
    frame = pd.DataFrame(list(client.scraper.unit.find()))
    # A missing attribute means the listing lacked that feature.
    frame.fillna(False, inplace=True)
    # Keep only rows with plausible prices and square footage.
    plausible = ((frame['price'] < 15000) & (frame['price'] > 0)
                 & (frame['sqft'] != 0) & (frame['sqft'] < 7500))
    frame = frame[plausible]
    non_features = {
        '_id', 'description', 'listing_id', 'price', 'title', 'zipcode'
    }
    feature_cols = list(set(frame.columns) - non_features)
    X = frame[feature_cols]
    y = frame['price']
    return X, y
def getlabels(self): y = self.df['price'] return y def _applyfilters(self): self.df = self.df[(self.df['price'] < self.maxprice) & (self.df['price'] > 0) & (self.df['sqft'] != 0) & (self.df['sqft'] < self.maxsqft)] if self.zipcode: self.df = self.df[self.df['zipcode'] == self.zipcode] def _makedataframe(self): df = pd.DataFrame(list(self.units)) df.fillna(False, inplace=True) self.df = df def _makefeatures(self): transforms = [textstate.flesch_kincaid_grade] # apply transforms transformsdf = self.df.transform(transforms) # join the results self.df.join(transformsdf) if __name__ == '__main__': mongoclient = get_mongoclient() preprocessor = Preprocessor(mongoclient) preprocessor.makedataframe()