def fetch_estate():
    """Crawl the iwjw estate page of every community referenced by `house`.

    Selects the distinct ``communityId`` values from the ``house`` table of
    the MySQL database described by ``dbutil.get_mysql_config()``, turns each
    one into ``http://www.iwjw.com/estate/<communityId>/`` and passes the
    resulting URL list to a ``Master`` created with
    ``result_dir='../iwjw/estate'``.
    """
    database = torndb.Connection(**dbutil.get_mysql_config())
    try:
        rows = database.query('select distinct communityId from house')
        urls = ['http://www.iwjw.com/estate/%s/' % row.communityId
                for row in rows]
    finally:
        # The connection is only needed to build the URL list; release it
        # before the crawl starts and even when the query fails.
        database.close()

    master = Master(rest_period=5, result_model='html',
                    result_dir='../iwjw/estate')
    master.add_fetchers(Fetcher(processor=ps.Processor_hn()))
    master.start(urls)
def fetch_house_from_db():
    """Crawl the sale pages of houses listed in the database but not on disk.

    File names in ``../iwjw/sale`` (with ``.html`` removed) are treated as
    the ids that were already fetched. Every ``houseId`` with ``type=1`` in
    the ``house`` table that is not among them is turned into
    ``http://www.iwjw.com/sale/<houseId>/`` and passed to a ``Master``
    created with ``result_dir='../iwjw/sale'``.
    """
    print('sales')
    existed = {name.replace('.html', '') for name in os.listdir('../iwjw/sale')}

    master = Master(rest_period=5, result_model='html',
                    result_dir='../iwjw/sale')
    master.add_fetchers(Fetcher(processor=ps.Processor_hn()))

    database = torndb.Connection(**dbutil.get_mysql_config())
    try:
        rows = database.query('select houseId from house where type=1;')
    finally:
        # Release the connection as soon as the ids are in memory, and also
        # when the query raises; the crawl below does not touch the database.
        database.close()

    # `existed` holds text taken from file names. Format each id the same way
    # the URL is formatted so the membership test also matches when the
    # driver returns integers.
    # NOTE(review): the column type of houseId is not visible here - confirm.
    house_ids = ['%s' % row.houseId for row in rows]
    sale_list = ['http://www.iwjw.com/sale/%s/' % hid
                 for hid in house_ids if hid not in existed]
    master.start(sale_list)
def fetch_house():
    """Crawl the rent house pages produced from the rent list directory.

    The URLs come from ``get_houses('../iwjw/rent_list', 'chuzu')`` and are
    passed to a ``Master`` created with ``result_dir='../iwjw/rent'``.
    """
    # NOTE: the matching crawl for sale pages ('../iwjw/sale_list' ->
    # '../iwjw/sale', kind 'sale') is disabled in this function.
    print('rent')
    rent_master = Master(result_dir='../iwjw/rent', result_model='html',
                         rest_period=5)
    rent_master.add_fetchers(Fetcher(processor=ps.Processor_hn()))
    rent_urls = list(get_houses('../iwjw/rent_list', 'chuzu'))
    rent_master.start(rent_urls)
def _read_district_ids(path='../district.id'):
    """Return the non-empty district ids listed in *path*, one per line."""
    with codecs.open(path) as handle:
        # Everything after '#' on a line is ignored.
        candidates = [line.split('#')[0].strip() for line in handle]
    # Blank and comment-only lines yield an empty id, which would otherwise
    # produce a URL with no district in it.
    return [district for district in candidates if district]


def _crawl_list(label, url_template, result_dir, district_ids):
    """Print *label* and crawl *url_template* filled with each district id."""
    print(label)
    master = Master(rest_period=5, result_model='html', result_dir=result_dir)
    master.add_fetchers(Fetcher(processor=ps.ProcessorIwjw()))
    master.start([url_template % district for district in district_ids])


def fetch_list():
    """Crawl page 1 of the Shanghai sale and rent listings of every district.

    District ids are read from ``../district.id``. The sale listing URLs
    (``.../sale/shanghai/<id>p1/``) are crawled first with
    ``result_dir='../iwjw/sale_list'``, then the rent listing URLs
    (``.../chuzu/shanghai/<id>p1/``) with ``result_dir='../iwjw/rent_list'``.
    """
    # Read the file once and close it; both crawls use the same ids.
    district_ids = _read_district_ids()
    _crawl_list('sale_list', 'http://www.iwjw.com/sale/shanghai/%sp1/',
                '../iwjw/sale_list', district_ids)
    _crawl_list('rent_list', 'http://www.iwjw.com/chuzu/shanghai/%sp1/',
                '../iwjw/rent_list', district_ids)