        # --- tail of the crawl loop (enclosing def/while header is above this
        # chunk; indentation reconstructed — the paste lost all line breaks) ---

        # Throttle between page fetches to avoid hammering the site / proxy bans.
        time.sleep(2)
        detail_page_obj = get(detail_address_url, use_proxy=False)
        if not detail_page_obj:
            # Fetch failed: log, clear the cursor URL and move on.
            logging.warning('%s: Cannot get page. url: %s' % (middleman_type, detail_address_url))
            detail_address_url = None
            continue
        # parse_page returns the listings found on this page plus candidate
        # next-page link(s); an empty/None next_page_url ends the pagination.
        page_res_list, next_page_url = parse_page(
            city_url, detail_page_obj)
        if next_page_url:
            detail_address_url = next_page_url[0]
        else:
            detail_address_url = None
        #print 'next', detail_address_url
        res = record_res(page_res_list, middleman_type)
        if not res:
            # NOTE(review): detail_address_url was already advanced to the NEXT
            # page (or None) above, so this logs the wrong URL for the failed
            # record — confirm intent with the enclosing loop.
            logging.error("%s: Cannot record res, url: %s" % (middleman_type, detail_address_url))


# Script entry point: crawl a single hard-coded middleman source, logging
# warnings+ to a per-source log file (overwritten each run via filemode='w').
if __name__ == '__main__':
    middleman_type = 'woaiwojia'
    logging.basicConfig(
        level=logging.WARNING,
        format=
        '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename=get_log_path(middleman_type),
        filemode='w')
    crawl(middleman_type)
# Per-source counters for the supported middleman sites.
# (No reads/writes visible in this chunk — presumably updated elsewhere.)
middleman_count_dict = {
    'woaiwojia': 0,
    'maitian': 0,
    'sohujiaodian': 0,
    'anjuke': 0,
    'tuitui99': 0,
    'fangtianxia': 0
}

# Module-level logging setup: INFO+ appended to a shared 'middleman' log.
# NOTE(review): another basicConfig call exists in this file's __main__ block;
# only the first basicConfig to run takes effect — confirm which one is wanted.
logging.basicConfig(
    level=logging.INFO,
    format=
    '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename=get_log_path('middleman'),
    filemode='a')


def update_db():
    """Reload every middleman result file into the database.

    Walks the result directory, maps each file name to its source, wipes that
    source's existing rows, then re-inserts line by line.
    (Definition continues past this chunk — body is incomplete here.)
    """
    conn = connect_to_db()
    for basedir, subdirs, filenames in os.walk(get_result_path()):
        for filename in filenames:
            # BUG(review): lstrip('u_') strips any run of leading 'u' OR '_'
            # characters — it is a character set, not a prefix. A middleman
            # name starting with 'u' would be mangled. A literal prefix strip
            # was presumably intended — confirm against the file-naming code.
            middleman = os.path.splitext(filename)[0].lstrip('u_')
            source = middleman_to_source_dict[middleman]
            # Full refresh: drop all existing rows for this source first.
            delete_all(source)
            tag = '房产中介'
            # Opened 'rb' yet str-typed lstrip args follow — consistent with
            # Python 2 (see the `#print` comment elsewhere in this file); under
            # Python 3 this would raise TypeError on bytes.lstrip(str).
            with open(os.path.join(basedir, filename), 'rb') as f_in:
                for line in f_in:
                    # BUG(review): lstrip removes only LEADING newline chars;
                    # each line read from the file keeps its trailing '\n', so
                    # the emptiness check below only catches the first blank
                    # line(s) — strip()/rstrip('\n\r') was likely intended.
                    line = line.lstrip('\n\r')
                    if not line: