def _crawl_dig(last_dig):
    """Crawl dig records in a window around the last seen timestamp.

    Posts a date-range query to the ``InfoAllList.asp`` listing that spans
    from ``latest_timestamp - 86400`` to ``latest_timestamp + 86400``, hands
    every record returned by ``_parse_dig`` to ``_put_to_db`` and stores the
    newest timestamp reported by ``_parse_dig`` in the
    ``cron_new_taipei_city_latest_dig`` cache entry.

    Args:
        last_dig: mapping that may hold ``latest_timestamp``; 10000 is used
            when the key is absent.
    """
    the_url = 'http://61.60.124.185/tpctempdig/InfoAllList.asp'

    last_timestamp = last_dig.get('latest_timestamp', 10000)
    # 86400 is one day if timestamps are in seconds -- TODO confirm the unit.
    start_timestamp = last_timestamp - 86400
    end_timestamp = last_timestamp + 86400

    # Here util.timestamp_to_datetime is unpacked as a (year, month, day) triple.
    (start_year, start_month, start_day) = util.timestamp_to_datetime(start_timestamp)
    (end_year, end_month, end_day) = util.timestamp_to_datetime(end_timestamp)

    params = {
        'sortflag': '',
        'sorttype': '',
        'TargetLB': '',
        'qry2': 1,
        'startyear': start_year,
        'startmonth': start_month,
        # The lower bound needs its day component just like the upper bound,
        # otherwise the start of the window is left to the server's default.
        'startday': start_day,
        'endyear': end_year,
        'endmonth': end_month,
        'endday': end_day,
    }

    http_data = util.http_multipost({the_url: params})
    #cfg.logger.debug('http_data: %s', http_data)

    (latest_timestamp, dig_data) = _parse_dig(http_data[the_url])

    # Plain loop: the calls are made for their side effects only.
    for each_data in dig_data:
        _put_to_db(each_data)

    util.save_cache('cron_new_taipei_city_latest_dig', {'latest_timestamp': latest_timestamp})
def _crawl_dig():
    """Crawl the full dig listing and persist every parsed record.

    Queries ``InfoAllList.asp`` for a date range starting at timestamp
    946684800 and ending 366 * 86400 past the current timestamp, passes each
    record from ``_parse_dig`` to ``_put_to_db`` and saves the latest
    timestamp under the ``cron_new_taipei_city_latest_dig`` cache key.
    """
    the_url = 'http://61.60.124.185/tpctempdig/InfoAllList.asp'

    # Fixed lower bound; the upper bound reaches 366 * 86400 beyond "now".
    range_begin_ts = 946684800
    range_finish_ts = util.get_timestamp() + 86400 * 366

    range_begin = util.timestamp_to_datetime(range_begin_ts)
    range_finish = util.timestamp_to_datetime(range_finish_ts)

    # Static form fields first, then the date window.
    params = {
        'sortflag': '',
        'sorttype': '',
        'TargetLB': '',
        'qry2': 1,
    }
    params.update({
        'startyear': range_begin.year,
        'startmonth': range_begin.month,
        'startday': range_begin.day,
        'endyear': range_finish.year,
        'endmonth': range_finish.month,
        'endday': range_finish.day,
    })

    responses = util.http_multipost({the_url: params})
    #cfg.logger.debug('http_data: %s', http_data)

    (latest_timestamp, dig_data) = _parse_dig(responses[the_url])

    for record in dig_data:
        _put_to_db(record)

    util.save_cache('cron_new_taipei_city_latest_dig', {'latest_timestamp': latest_timestamp})
def _save_img(data, postfix, content_type):
    """Write an image and its thumbnail to disk and record them in ``bee_img``.

    The image goes to ``/data/img/bee/<YYYY-MM-DD>/`` and the thumbnail
    produced by ``_make_thumbnail`` goes to
    ``/data/thumbnail/bee/<YYYY-MM-DD>/``. Both share an id built from the
    current timestamp and ``util.uuid()``.

    Args:
        data: raw image payload written to disk as-is.
        postfix: file extension (without the dot) for the image file.
        content_type: value stored under ``content_type`` in the DB record.

    Returns:
        The dict passed to ``util.db_insert`` (keys ``filename``,
        ``thumbnail_filename``, ``the_id``, ``content_type``, ``save_time``),
        with any ``_id`` key removed.
    """
    the_timestamp = util.get_timestamp()
    the_datetime = util.timestamp_to_datetime(the_timestamp)
    the_id = str(the_timestamp) + "_" + util.uuid()
    date_dir = the_datetime.strftime('%Y-%m-%d')

    filename = the_id + '.' + postfix
    the_dir = '/data/img/bee/' + date_dir
    util.makedirs(the_dir)
    # Binary mode: image payloads must be written without text encoding or
    # newline translation.
    with open(the_dir + '/' + filename, 'wb') as f:
        f.write(data)

    (the_thumbnail, thumbnail_postfix) = _make_thumbnail(data, postfix)

    the_dir = '/data/thumbnail/bee/' + date_dir
    util.makedirs(the_dir)
    thumbnail_filename = the_id + '.' + thumbnail_postfix
    with open(the_dir + '/' + thumbnail_filename, 'wb') as f:
        f.write(the_thumbnail)

    db_data = {
        "filename": date_dir + '/' + filename,
        "thumbnail_filename": date_dir + '/' + thumbnail_filename,
        "the_id": the_id,
        'content_type': content_type,
        'save_time': the_timestamp,
    }
    util.db_insert('bee_img', [db_data])

    # Presumably the insert may add an '_id' to the dict; strip it so the
    # returned record holds only the fields built above.
    db_data.pop('_id', None)

    return db_data
def _crawl_dig_point(next_dig_point):
    """Run up to ``N_ITER_CRAWL_DIG_POINT`` crawl iterations for dig points.

    Each iteration calls ``_iter_crawl_dig_point`` and merges its results
    into one dict, so a later iteration overwrites an earlier entry with the
    same key. The loop stops early once the offset belongs to a year other
    than the current one and its in-year part has reached 30000. Otherwise it
    sleeps ``cfg.config['time_sleep']`` (default 30) before the next
    iteration.

    Args:
        next_dig_point: dig-point number at which to start crawling.

    Returns:
        Tuple ``(next_dig_point, results_list)``: the next dig point reported
        by the last iteration and a list of the merged result values.
    """
    results = {}
    offset_dig_point = next_dig_point

    current_timestamp = util.get_timestamp()
    the_datetime = util.timestamp_to_datetime(current_timestamp)
    current_year = the_datetime.year
    cfg.logger.debug('current_year: %s', current_year)

    for _ in range(N_ITER_CRAWL_DIG_POINT):
        # The error code is returned by the helper but not acted on here.
        (_error_code, next_dig_point, offset_dig_point, iter_results) = \
            _iter_crawl_dig_point(next_dig_point, offset_dig_point)
        results.update(iter_results)

        # The digits above the low five are a year offset from 1911
        # (presumably an ROC/Minguo year -- TODO confirm); the low five digits
        # are the in-year part.
        offset_dig_point_year = offset_dig_point // 100000 + 1911
        offset_dig_point_mod_100000 = offset_dig_point % 100000
        cfg.logger.debug(
            'offset_dig_point_year: %s offset_dig_point_mod_100000: %s',
            offset_dig_point_year, offset_dig_point_mod_100000)

        if offset_dig_point_year != current_year and offset_dig_point_mod_100000 >= 30000:
            break

        sleep_time = cfg.config.get('time_sleep', 30)
        cfg.logger.debug('to sleep %s', sleep_time)
        time.sleep(sleep_time)

    # Materialize the values so callers always receive a real list, not a
    # live dict view tied to ``results``.
    results_list = list(results.values())

    return (next_dig_point, results_list)
def g_new_taipei_city_dig_point_next_year_handler():
    """Return the year of the largest stored ``end_timestamp`` for dig points.

    Looks up ``roadDB`` entries whose ``the_category`` is
    ``new_taipei_city_dig_point``, takes the one with the greatest
    ``end_timestamp`` and returns the year of that timestamp. Falls back to
    ``START_NEW_TAIPEI_CITY_DIG_POINT_YEAR`` whenever the lookup, the sorted
    cursor or the resulting list is empty/falsy.
    """
    query = {'the_category': 'new_taipei_city_dig_point'}
    projection = {'_id': False, 'end_timestamp': True}

    cursor = util.db_find_it('roadDB', query, projection)
    if not cursor:
        return START_NEW_TAIPEI_CITY_DIG_POINT_YEAR

    # Descending sort plus limit(1) leaves only the newest entry.
    newest_cursor = cursor.sort('end_timestamp', pymongo.DESCENDING).limit(1)
    if not newest_cursor:
        return START_NEW_TAIPEI_CITY_DIG_POINT_YEAR

    rows = list(newest_cursor)
    if not rows:
        return START_NEW_TAIPEI_CITY_DIG_POINT_YEAR

    # An entry without the field is treated as ending at MAX_TIMESTAMP.
    # NOTE(review): the original carried a trailing '#1000.0' remark here; its meaning is unclear.
    latest_end = rows[0].get('end_timestamp', MAX_TIMESTAMP)
    return util.timestamp_to_datetime(latest_end).year
def _crawl_dig_point(next_dig_point):
    """Crawl dig points in repeated batches, pausing between batches.

    Calls ``_iter_crawl_dig_point`` at most ``N_ITER_CRAWL_DIG_POINT`` times,
    folding each batch into one dict keyed as the helper returns them. The
    loop ends early when the offset's year differs from the current year and
    its in-year part is at least 30000. After every batch that does not end
    the loop, it sleeps for ``cfg.config['time_sleep']`` (30 when not
    configured).

    Args:
        next_dig_point: dig-point number to start from; also the initial
            offset passed to the helper.

    Returns:
        ``(next_dig_point, results_list)`` where ``next_dig_point`` is the
        value from the last batch and ``results_list`` is a list of all
        collected values.
    """
    results = {}
    offset_dig_point = next_dig_point

    current_timestamp = util.get_timestamp()
    the_datetime = util.timestamp_to_datetime(current_timestamp)
    current_year = the_datetime.year
    cfg.logger.debug('current_year: %s', current_year)

    for _ in range(N_ITER_CRAWL_DIG_POINT):
        # The helper's error code is unpacked but not inspected here.
        (_error_code, next_dig_point, offset_dig_point, iter_results) = \
            _iter_crawl_dig_point(next_dig_point, offset_dig_point)
        results.update(iter_results)

        # Split the offset into a year (offset from 1911; presumably an
        # ROC/Minguo year -- TODO confirm) and the low five digits.
        offset_dig_point_year = offset_dig_point // 100000 + 1911
        offset_dig_point_mod_100000 = offset_dig_point % 100000
        cfg.logger.debug('offset_dig_point_year: %s offset_dig_point_mod_100000: %s',
                         offset_dig_point_year, offset_dig_point_mod_100000)

        if offset_dig_point_year != current_year and offset_dig_point_mod_100000 >= 30000:
            break

        sleep_time = cfg.config.get('time_sleep', 30)
        cfg.logger.debug('to sleep %s', sleep_time)
        time.sleep(sleep_time)

    # Return a concrete list; a bare ``values()`` would be a dict view on
    # Python 3 and could not be indexed by callers.
    results_list = list(results.values())

    return (next_dig_point, results_list)
def _parse_deliver_date(x, funnel_dict):
    """Render the ``deliver_time`` of *x* as a formatted date string.

    Args:
        x: mapping that may hold ``deliver_time``; 0 is used when absent.
        funnel_dict: accepted but not used by this function.

    Returns:
        ``deliver_time`` converted through ``util.timestamp_to_datetime`` and
        formatted with ``%Y-%m-%dT%H:%M:%S.%fZ``.
    """
    # NOTE(review): the trailing 'Z' is a literal; whether the datetime is
    # actually UTC depends on util.timestamp_to_datetime -- confirm.
    output_format = '%Y-%m-%dT%H:%M:%S.%fZ'

    moment = util.timestamp_to_datetime(x.get(u'deliver_time', 0))
    #cfg.logger.debug('deliver_time: %s deliver_date: %s', deliver_time, result)
    return moment.strftime(output_format)
def g_new_taipei_city_dig_point_next_year_handler():
    """Give the year of the most recent dig-point ``end_timestamp``.

    Queries ``roadDB`` for entries with ``the_category`` equal to
    ``new_taipei_city_dig_point``, keeps the single entry with the highest
    ``end_timestamp`` and returns the year of that timestamp.
    ``START_NEW_TAIPEI_CITY_DIG_POINT_YEAR`` is returned when no entry can be
    obtained at any step.
    """
    found = util.db_find_it(
        'roadDB',
        {'the_category': 'new_taipei_city_dig_point'},
        {'_id': False, 'end_timestamp': True})

    if found:
        # Newest first, one entry only.
        top = found.sort('end_timestamp', pymongo.DESCENDING).limit(1)
        if top:
            docs = list(top)
            if docs:
                # A missing field falls back to MAX_TIMESTAMP.
                # NOTE(review): the original had a trailing '#1000.0' remark here; intent unclear.
                end_timestamp = docs[0].get('end_timestamp', MAX_TIMESTAMP)
                end_datetime = util.timestamp_to_datetime(end_timestamp)
                return end_datetime.year

    # Every failed step above ends up here.
    return START_NEW_TAIPEI_CITY_DIG_POINT_YEAR