def get_houses(dir, type='sale'):
    """Yield house detail URLs scraped from the saved pages under *dir*.

    For each file in ``dir``: the district id is taken from the file name
    with the module-level ``district`` object (presumably a compiled regex;
    only its ``findall`` is used here), the district name and the house ids
    are extracted from the file content, the district is stored through
    ``dbutil.update_district`` and one URL per distinct house id is yielded.

    :param dir: directory containing the saved listing pages.
    :param type: listing kind embedded in the URLs. ``'sale'`` is stored as
        1 and ``'chuzu'`` as 0; any other value is stored as ``None``.
    :return: generator of ``'http://www.iwjw.com/<type>/<house id>/'``.
    """
    house = re.compile(r'http://www.iwjw.com/%s/([^/]+)/\?from' % type)
    district_re = re.compile(r'searchName="(.+)"')
    type_mapping = {
        'sale': 1,
        'chuzu': 0,
    }
    type_code = type_mapping.get(type)

    for f in os.listdir(dir):
        # No match in the file name -> empty list -> IndexError: skip file.
        try:
            district_id = district.findall(f)[0]
        except IndexError:
            logger.error('district id failure#%s' % f)
            continue

        with codecs.open(os.path.join(dir, f), encoding='utf-8') as page:
            content = page.read()

        # A missing district name is logged but does not stop processing.
        try:
            district_name = district_re.findall(content)[0].strip()
        except IndexError:
            district_name = None
            logger.error('district name failure#%s' % district_id)

        house_ids = set(house.findall(content))

        database = torndb.Connection(**dbutil.get_mysql_config())
        try:
            dbutil.update_district(database, district_id, district_name,
                                   type_code, house_ids)
        finally:
            # Release the connection even when the update fails.
            database.close()

        for house_id in house_ids:
            yield 'http://www.iwjw.com/%s/%s/' % (type, house_id)
def process_estates():
    """Run ``process1estate`` for every distinct ``communityId`` in ``house``.

    Each estate is processed against the path template
    ``'../iwjw/estate/%s.html'``. A failure on one estate is logged as
    ``'<communityId>#<error>'`` and the loop moves on to the next one.
    """
    database = torndb.Connection(**dbutil.get_mysql_config())
    path_template = '../iwjw/estate/%s.html'
    try:
        rows = database.query('select distinct communityId from house;')
        eids = [row.communityId for row in rows]
        for eid in eids:
            try:
                process1estate(database, eid, path_template)
            except Exception as e:
                # Best effort: one bad estate must not abort the whole run.
                logger.error('%s#%s' % (eid, e))
    finally:
        # The connection is shared by every process1estate call, so it is
        # only released once the loop is done (or has failed).
        database.close()
def fetch_estate():
    """Start a fetch of the estate page of every distinct ``communityId``.

    Builds ``'http://www.iwjw.com/estate/<communityId>/'`` for each distinct
    ``communityId`` in the ``house`` table and passes the list to a
    ``Master`` configured with ``result_dir='../iwjw/estate'`` and a single
    ``Fetcher``.
    """
    database = torndb.Connection(**dbutil.get_mysql_config())
    try:
        rows = database.query('select distinct communityId from house')
    finally:
        # The rows are already in memory; do not keep the connection open
        # while the fetch runs.
        database.close()

    urls = ['http://www.iwjw.com/estate/%s/' % row.communityId
            for row in rows]

    master = Master(rest_period=5, result_model='html',
                    result_dir='../iwjw/estate')
    fetcher = Fetcher(processor=ps.Processor_hn())
    master.add_fetchers(fetcher)
    master.start(urls)
def fetch_house_from_db():
    """Start a fetch of the sale pages that are not yet in '../iwjw/sale'.

    Reads the ``houseId`` of every ``house`` row with ``type=1``, drops the
    ids for which ``'../iwjw/sale'`` already holds a file named after the id
    (``'.html'`` removed), and passes
    ``'http://www.iwjw.com/sale/<houseId>/'`` for the rest to a ``Master``.
    """
    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print('sales')

    existed = set(f.replace('.html', '') for f in os.listdir('../iwjw/sale'))

    master = Master(rest_period=5, result_model='html',
                    result_dir='../iwjw/sale')
    fetcher = Fetcher(processor=ps.Processor_hn())
    master.add_fetchers(fetcher)

    database = torndb.Connection(**dbutil.get_mysql_config())
    try:
        rows = database.query('select houseId from house where type=1;')
    finally:
        # Release the connection before the fetch starts, and on error too.
        database.close()

    # NOTE(review): the membership test assumes houseId comes back as a
    # string comparable to the file names -- confirm against the schema.
    pending = [row.houseId for row in rows if row.houseId not in existed]
    sale_list = ['http://www.iwjw.com/sale/%s/' % hid for hid in pending]
    master.start(sale_list)
def process_houses():
    """Run ``process1house`` for every row of the ``house`` table.

    The row's ``type`` selects the path template: 0 maps to
    ``'../iwjw/rent/%s.html'`` and 1 to ``'../iwjw/sale/%s.html'``; any
    other value passes ``None`` as the template. Success is logged as
    ``'processed#<houseId>'``. A failure on one house is logged as
    ``'<houseId>#<error>'`` and the loop moves on to the next one.
    """
    database = torndb.Connection(**dbutil.get_mysql_config())
    template = {
        0: '../iwjw/rent/%s.html',
        1: '../iwjw/sale/%s.html',
    }
    try:
        for result in database.query('select type, houseId from house;'):
            house_type, house_id = int(result.type), result.houseId
            try:
                process1house(database, house_id, template.get(house_type))
                logger.info('processed#%s' % house_id)
            except Exception as e:
                # Best effort, but leave a trace instead of dropping the
                # failure silently.
                logger.error('%s#%s' % (house_id, e))
    finally:
        # The connection is shared by every process1house call, so it is
        # only released once the loop is done (or has failed).
        database.close()