Example #1
0
def main(argv):
    """Main entry-point for scraper application.

    Arguments:
        argv: list of command line arguments

    argv must contain 4 entries:
        [0]: standard, name of the file
        [1]: city
        [2]: state
        [3]: run option (search, scrape, or both)

    Raises:
        ValueError: if argv does not hold exactly 4 entries, or if the run
            option is not one of 'search', 'scrape', or 'both'.
    """
    if len(argv) != 4:
        raise ValueError('Must provide city, state, and run option')

    _, city, state, run_option = argv

    # Reject an unknown run option before doing any lookup or opening a
    # client; otherwise the call would do that work and then silently run
    # nothing.
    if run_option not in ('search', 'scrape', 'both'):
        raise ValueError(
            "Run option must be 'search', 'scrape', or 'both', got "
            '{!r}'.format(run_option))

    zipcodes = get_zips(city, state)
    mongoclient = get_mongoclient()

    # The three options are mutually exclusive, so dispatch with one chain.
    if run_option == 'search':
        run_search(city, zipcodes, mongoclient)
    elif run_option == 'scrape':
        scrape_content(zipcodes, mongoclient)
    else:
        run_both(city, zipcodes, mongoclient)
Example #2
0
def main(argv):
    """Main entry-point for batchclean.

    Arguments:
        argv: list of command line arguments. When argv[1] is 'remove',
            the listings reported by findremoved are deleted from the DB
            before cleaning.

    The listings are then queried again, cleaned, and the resulting units
    and attributes are written back through the mongo client.
    """
    prune = len(argv) > 1 and argv[1] == 'remove'

    log = get_configured_logger('DEBUG', __name__)

    client = get_mongoclient()
    log.info('Retrieved mongoclient.')

    found = queryforlistings(client)
    log.info('Found {} listings.'.format(found.count()))

    if prune:
        gone = findremoved(found)
        message = '{} listings have been removed, deleting from DB.'
        log.info(message.format(len(gone)))
        removelistings(client, gone)

    # Query again so cleaning works from a fresh result (after any
    # deletions performed above).
    attrs, units = cleanlistings(queryforlistings(client))
    attr_count = len(attrs)
    unit_count = len(units)
    log.info('Observed {} unique attributes while cleaning.'.format(
        attr_count))
    log.info('Processed {} units'.format(unit_count))

    writeunitstomongo(client, units)
    writeattrstomongo(client, attrs)
Example #3
0
def main():
    """Reset parsing state in the scraper database.

    Sets 'content_parsed' to False on every document in the listing
    collection, then drops the unit collection.
    """
    mongoclient = get_mongoclient()
    scraper_db = mongoclient.scraper

    # An empty filter matches every listing, so a single update_many call
    # replaces the former find() loop that issued one update_one per
    # document.
    scraper_db.listing.update_many({}, {'$set': {'content_parsed': False}})

    scraper_db.unit.drop()
def main():
    """Build features and labels, evaluate a random forest, print results.

    Features and labels come from a Preprocessor built on the mongo
    client. A RandomForestRegressor is handed to an Evaluator together
    with them, and the evaluator's results are printed after the row
    count of the feature matrix.
    """
    prep = Preprocessor(get_mongoclient())

    features = prep.getfeatures()
    labels = prep.getlabels()

    forest_params = {
        'n_estimators': 1000,
        'criterion': 'mae',
        'n_jobs': 36,
        'verbose': 2,
    }
    forest = RandomForestRegressor(**forest_params)

    scorer = Evaluator(forest, features, labels)
    scorer.evaluate()

    row_count = features.shape[0]
    print('Ran on {} units...'.format(row_count))
    print(scorer.results())
Example #5
0
def gettrainingdata(maxprice=15000, maxsqft=7500):
    """Load unit documents and split them into features and labels.

    Reads every document from the scraper.unit collection into a
    DataFrame, replaces missing values with False, and keeps only rows
    with 0 < price < maxprice and sqft != 0 and sqft < maxsqft.

    Arguments:
        maxprice: exclusive upper bound on the 'price' column
        maxsqft: exclusive upper bound on the 'sqft' column

    Returns:
        (X, y): X holds every column not in the exclusion list, in the
        DataFrame's own column order; y is the 'price' column.
    """
    mongoclient = get_mongoclient()
    df = pd.DataFrame(list(mongoclient.scraper.unit.find()))
    df = df.fillna(False)
    df = df[(df['price'] < maxprice) & (df['price'] > 0)
            & (df['sqft'] != 0) & (df['sqft'] < maxsqft)]

    # Identifier, free-text, and label columns that are not used as features.
    excluded = {
        '_id',
        'description',
        'listing_id',
        'price',
        'title',
        'zipcode',
    }
    # Filter in the frame's own column order. Building the list from a set
    # difference yields an arbitrary order that can change between runs,
    # which would shuffle the feature columns.
    features = [column for column in df.columns if column not in excluded]
    X = df[features]
    y = df['price']
    return X, y
    def getlabels(self):
        """Return the labels: the 'price' column of self.df."""
        return self.df['price']

    def _applyfilters(self):
        """Narrow self.df to rows that pass the price, sqft and zipcode filters.

        Keeps rows with 0 < price < self.maxprice and with sqft that is
        non-zero and below self.maxsqft. When self.zipcode is truthy, the
        result is further restricted to rows whose 'zipcode' equals it.
        """
        prices = self.df['price']
        areas = self.df['sqft']
        in_range = ((prices < self.maxprice) & (prices > 0)
                    & (areas != 0) & (areas < self.maxsqft))
        self.df = self.df[in_range]

        if not self.zipcode:
            return

        same_zip = self.df['zipcode'] == self.zipcode
        self.df = self.df[same_zip]

    def _makedataframe(self):
        """Build self.df from self.units, replacing missing values with False."""
        records = list(self.units)
        self.df = pd.DataFrame(records).fillna(False)

    def _makefeatures(self):
        """Compute transformed columns and attach them to self.df.

        Applies each function in the transforms list to self.df via
        DataFrame.transform and joins the resulting frame onto self.df.
        """
        # NOTE(review): `textstate` is not defined in the visible part of
        # the file; it may be a misspelling of the `textstat` package.
        # Confirm against the file's imports.
        transforms = [textstate.flesch_kincaid_grade]

        # apply transforms
        transformsdf = self.df.transform(transforms)

        # DataFrame.join returns a new frame rather than modifying the
        # caller in place, so the result must be assigned back or the
        # transformed columns are lost.
        self.df = self.df.join(transformsdf)


if __name__ == '__main__':
    # Script entry point: obtain a client from get_mongoclient(), wrap it
    # in a Preprocessor, and ask the preprocessor to build its DataFrame.
    mongoclient = get_mongoclient()
    preprocessor = Preprocessor(mongoclient)
    # NOTE(review): only `_makedataframe` (with a leading underscore) is
    # visible in this part of the file; confirm that a public
    # `makedataframe` method exists on Preprocessor.
    preprocessor.makedataframe()