def _crawl_dig(last_dig):
    the_url = 'http://61.60.124.185/tpctempdig/InfoAllList.asp'
    last_timestamp = last_dig.get('latest_timestamp', 10000)
    start_timestamp = last_timestamp - 86400
    end_timestamp = last_timestamp + 86400

    start_datetime = util.timestamp_to_datetime(start_timestamp)
    end_datetime = util.timestamp_to_datetime(end_timestamp)

    params = {
        'sortflag': '',
        'sorttype': '',
        'TargetLB': '',
        'qry2': 1,
        'startyear': start_datetime.year,
        'startmonth': start_datetime.month,
        'startday': start_datetime.day,
        'endyear': end_datetime.year,
        'endmonth': end_datetime.month,
        'endday': end_datetime.day,
    }

    http_data = util.http_multipost({the_url: params})
    #cfg.logger.debug('http_data: %s', http_data)
    (latest_timestamp, dig_data) = _parse_dig(http_data[the_url])

    for each_data in dig_data:
        _put_to_db(each_data)

    util.save_cache('cron_new_taipei_city_latest_dig', {'latest_timestamp': latest_timestamp})
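The util helpers these crawlers rely on are not shown in the listing. Judging from how the examples read .year/.month/.day off the return value, util.timestamp_to_datetime presumably wraps datetime.fromtimestamp; a minimal sketch under that assumption (names and behavior are guesses, not the project's actual code):

import datetime
import time


def timestamp_to_datetime(the_timestamp):
    # Turn a Unix timestamp (seconds) into a datetime so callers can read
    # .year / .month / .day, as the crawlers above do.
    return datetime.datetime.fromtimestamp(the_timestamp)


def get_timestamp():
    # Current Unix timestamp, as an integer number of seconds.
    return int(time.time())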
Example #2
def _crawl_dig():
    the_url = 'http://61.60.124.185/tpctempdig/InfoAllList.asp'
    start_timestamp = 946684800
    end_timestamp = util.get_timestamp() + 86400 * 366

    start_datetime = util.timestamp_to_datetime(start_timestamp)
    end_datetime = util.timestamp_to_datetime(end_timestamp)

    params = {
        'sortflag': '',
        'sorttype': '',
        'TargetLB': '',
        'qry2': 1,
        'startyear': start_datetime.year,
        'startmonth': start_datetime.month,
        'startday': start_datetime.day,
        'endyear': end_datetime.year,
        'endmonth': end_datetime.month,
        'endday': end_datetime.day,
    }

    http_data = util.http_multipost({the_url: params})
    #cfg.logger.debug('http_data: %s', http_data)
    (latest_timestamp, dig_data) = _parse_dig(http_data[the_url])

    for each_data in dig_data:
        _put_to_db(each_data)

    util.save_cache('cron_new_taipei_city_latest_dig',
                    {'latest_timestamp': latest_timestamp})
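Unlike the first variant, this one ignores the cached latest_timestamp and re-crawls the full range, from the start of 2000 up to roughly a year past the current date. A quick, illustrative check of those constants (not part of the original crawler):

import datetime

# 946684800 is 2000-01-01 00:00:00 UTC; 86400 * 366 is just over one year.
print(datetime.datetime.fromtimestamp(946684800, tz=datetime.timezone.utc))
print(datetime.timedelta(seconds=86400 * 366))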
Example #4
def _save_img(data, postfix, content_type):
    the_timestamp = util.get_timestamp()
    the_datetime = util.timestamp_to_datetime(the_timestamp)
    the_id = str(the_timestamp) + "_" + util.uuid()
    filename = the_id + '.' + postfix

    the_dir = '/data/img/bee/' + the_datetime.strftime('%Y-%m-%d')

    util.makedirs(the_dir)

    with open(the_dir + '/' + filename, 'wb') as f:
        f.write(data)

    (the_thumbnail, thumbnail_postfix) = _make_thumbnail(data, postfix)
    
    the_dir = '/data/thumbnail/bee/' + the_datetime.strftime('%Y-%m-%d')

    util.makedirs(the_dir)

    thumbnail_filename = the_id + '.' + thumbnail_postfix

    with open(the_dir + '/' + thumbnail_filename, 'wb') as f:
        f.write(the_thumbnail)

    db_data = {"filename": the_datetime.strftime('%Y-%m-%d/') + filename, "thumbnail_filename": the_datetime.strftime("%Y-%m-%d/") + thumbnail_filename, "the_id": the_id, 'content_type': content_type, 'save_time': the_timestamp}

    util.db_insert('bee_img', [db_data])

    if '_id' in db_data:
        del db_data['_id']

    return db_data
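_make_thumbnail is referenced above but not shown. A hypothetical sketch of what it might do, assuming Pillow is available; the real helper and its resizing policy may differ:

from io import BytesIO

from PIL import Image


def _make_thumbnail(data, postfix, max_size=(200, 200)):
    # Decode the raw image bytes, shrink them in place, and re-encode as PNG.
    img = Image.open(BytesIO(data))
    if img.mode not in ('RGB', 'RGBA', 'L', 'P'):
        img = img.convert('RGB')    # e.g. CMYK JPEGs cannot be saved as PNG directly
    img.thumbnail(max_size)

    out = BytesIO()
    img.save(out, format='PNG')
    return (out.getvalue(), 'png')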
Example #5
def _crawl_dig_point(next_dig_point):
    results = {}

    offset_dig_point = next_dig_point
    current_timestamp = util.get_timestamp()
    the_datetime = util.timestamp_to_datetime(current_timestamp)
    current_year = the_datetime.year

    cfg.logger.debug('current_year: %s', current_year)

    for idx in range(0, N_ITER_CRAWL_DIG_POINT):
        (error_code, next_dig_point, offset_dig_point,
         iter_results) = _iter_crawl_dig_point(next_dig_point,
                                               offset_dig_point)
        results.update(iter_results)

        offset_dig_point_year = offset_dig_point // 100000 + 1911
        offset_dig_point_mod_100000 = offset_dig_point % 100000

        cfg.logger.debug(
            'offset_dig_point_year: %s offset_dig_point_mod_100000: %s',
            offset_dig_point_year, offset_dig_point_mod_100000)
        if offset_dig_point_year != current_year and offset_dig_point_mod_100000 >= 30000:
            break

        sleep_time = cfg.config.get('time_sleep', 30)
        cfg.logger.debug('to sleep %s', sleep_time)
        time.sleep(sleep_time)

    results_list = list(results.values())

    return (next_dig_point, results_list)
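The dig-point identifiers appear to encode a Republic of China (Minguo) year in the digits above 100000, which is why the loop adds 1911 to recover a Gregorian year before comparing against current_year. An illustrative example with a hypothetical identifier (not taken from real data):

offset_dig_point = 11204567                   # hypothetical: ROC year 112, serial 04567
year = offset_dig_point // 100000 + 1911      # 112 + 1911 = 2023
serial = offset_dig_point % 100000            # 4567
print(year, serial)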
def g_new_taipei_city_dig_point_next_year_handler():
    db_results = util.db_find_it('roadDB',
                                 {'the_category': 'new_taipei_city_dig_point'},
                                 {
                                     '_id': False,
                                     'end_timestamp': True
                                 })
    if not db_results:
        return START_NEW_TAIPEI_CITY_DIG_POINT_YEAR

    db_result = db_results.sort('end_timestamp', pymongo.DESCENDING).limit(1)

    if not db_result:
        return START_NEW_TAIPEI_CITY_DIG_POINT_YEAR

    result_list = list(db_result)

    if not result_list:
        return START_NEW_TAIPEI_CITY_DIG_POINT_YEAR

    result = result_list[0]

    end_timestamp = result.get('end_timestamp', MAX_TIMESTAMP)  #1000.0
    end_datetime = util.timestamp_to_datetime(end_timestamp)
    the_year = end_datetime.year

    return the_year
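This handler pulls the most recent end_timestamp stored under the new_taipei_city_dig_point category and uses its year as the next year to crawl, falling back to START_NEW_TAIPEI_CITY_DIG_POINT_YEAR when nothing is stored yet. The same lookup can also be written with find_one plus a sort, shown here as an equivalent sketch assuming direct access to the underlying pymongo collection (function and argument names are hypothetical):

import datetime

import pymongo


def latest_end_year(collection, default_year):
    # Fetch only the document with the largest end_timestamp.
    doc = collection.find_one(
        {'the_category': 'new_taipei_city_dig_point'},
        {'_id': False, 'end_timestamp': True},
        sort=[('end_timestamp', pymongo.DESCENDING)],
    )
    if not doc or 'end_timestamp' not in doc:
        return default_year
    return datetime.datetime.fromtimestamp(doc['end_timestamp']).year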
Example #9
def _parse_deliver_date(x, funnel_dict):
    deliver_time = x.get(u'deliver_time', 0)
    the_datetime = util.timestamp_to_datetime(deliver_time)

    result = the_datetime.strftime('%Y-%m-%dT%H:%M:%S.%fZ')

    #cfg.logger.debug('deliver_time: %s deliver_date: %s', deliver_time, result)

    return result
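An illustrative call, with the input shape assumed from the .get() above and the exact output depending on how util.timestamp_to_datetime handles timezones:

row = {u'deliver_time': 946684800}
print(_parse_deliver_date(row, {}))   # e.g. '2000-01-01T08:00:00.000000Z' in UTC+8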