def getMissingDates(data):
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    missing_dates = {}
    today = DateTime.now(tzinfo=msia_tz)  # date only
    state_by = 'state_code'
    states = list(db['state'].find({}, {'_id': 0, state_by: 1}))
    for rk in data:
        row = data[rk]
        if rk not in missing_dates:
            missing_dates[rk] = []
        dates = groupDates(params={'states': states, 'state_by': state_by}, data=row)
        for date in dates['missing']:
            # last day of the month that 'date' falls in
            end_date_of_month = DateTime.getDaysAgo(
                days_to_crawl=1,
                datefrom=DateTime.getNextMonth(DateTime.convertDateTimeFromString(date)))
            day_diff = DateTime.getDifferenceBetweenDuration([today, end_date_of_month])
            # clamp: use today for the current (unfinished) month, else the month end
            if day_diff >= 0:
                date_str = DateTime.toString(today)
            else:
                date_str = DateTime.toString(end_date_of_month)
            if date_str not in dates['crawled']:
                missing_dates[rk].append(date_str)
        missing_dates[rk] = sorted(set(missing_dates[rk]), reverse=True)
    return missing_dates
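# Illustration only (not called by the crawler): the clamping rule above,
# restated with the standard library. The project's DateTime wrapper is
# assumed to behave equivalently; the helper name is hypothetical.
def _clamp_to_month_end_sketch(d, today):
    from datetime import date, timedelta
    next_month = date(d.year + d.month // 12, d.month % 12 + 1, 1)
    month_end = next_month - timedelta(days=1)  # last day of d's month
    return min(today, month_end)  # the current month resolves to today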
def recordTime(key):
    # keys come in start/end pairs, e.g. 'create_schedule_start_time' and
    # 'create_schedule_end_time'; seed the start timestamp once, then stamp
    # the end timestamp when the end key is passed
    global global_text
    new_key = key.replace('end', 'start')
    if new_key not in global_text:
        global_text[new_key] = DateTime.now(tzinfo=msia_tz)
    if new_key in global_text and key != new_key:
        global_text[key] = DateTime.now(tzinfo=msia_tz)
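# Usage sketch (assumes global_text is the module-level dict used above),
# mirroring the calls made in run() below:
#   recordTime(key='create_schedule_start_time')  # seeds the start timestamp
#   ...                                           # work happens here
#   recordTime(key='create_schedule_end_time')    # stamps the end timestamp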
def stockIntegrity(params, data):
    Debug = DebugManager.DebugManager()
    Debug.start()
    global msia_tz, date_retrieve_limit
    result = []
    today = DateTime.now(tzinfo=msia_tz)
    start_date = DateTime.getDaysAgo(date_retrieve_limit, datefrom=today)
    # offset 24 hours to include today
    durations = DateTime.getBetween([start_date, today], element='date', offset=24)['order']
    state_data = fn.getNestedElement(data, 'state')
    facility_data_by_state = fn.getNestedElement(data, 'state_facility')
    check_data = combinedFacilityList(data=facility_data_by_state)
    result = getIntegrity(params={
        'durations': durations,
    }, data={
        'facility': facility_data_by_state,
        'state': state_data,
        'to_update': result,
        'check_data': check_data,
    })
    updateStateData(result)
    result = sorted(result, key=lambda k: k['name'])
    Debug.end()
    Debug.show('Model.Structure.stockIntegrity')
    return result
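# Window sketch: with date_retrieve_limit = 7 and today = 2024-06-10, the
# durations list above is assumed to resolve to the eight dates
# '2024-06-03' .. '2024-06-10' (the 24-hour offset pulls today into range);
# the exact output depends on the project's DateTime.getBetween implementation.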
def generateTemplate(params):
    result = {}
    report_keys = fn.getNestedElement(params, 'keys.report', ['procurement', 'budget'])
    first_date = fn.getNestedElement(params, 'first_date')
    last_date = fn.getNestedElement(params, 'last_date')
    state_by = fn.getNestedElement(params, 'state_by')
    states = fn.getNestedElement(params, 'states')
    today = DateTime.now(tzinfo=msia_tz)  # date only
    for rk in report_keys:
        if rk not in result:
            result[rk] = {}
        for date in DateTime.getBetween([first_date, last_date], element='date')['order']:
            end_date_of_month = DateTime.getDaysAgo(
                days_to_crawl=1,
                datefrom=DateTime.getNextMonth(DateTime.convertDateTimeFromString(date)))
            day_diff = DateTime.getDifferenceBetweenDuration([today, end_date_of_month])
            # clamp to today for the current month, else to the month end
            if day_diff >= 0:
                date_str = DateTime.toString(today)
            else:
                date_str = DateTime.toString(end_date_of_month)
            if date_str not in result[rk]:
                result[rk][date_str] = {}
            result[rk][date_str].update({
                'date': date_str,
            })
            # zero-filled counter per state, keyed by state code
            for state_row in states:
                result[rk][date_str].update({
                    state_row[state_by]: 0,
                })
    return result
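# Shape sketch of the template returned above, for report_keys=['procurement']
# and a single state row {'state_code': 'JHR'} (dates illustrative):
#   {'procurement': {'2024-05-31': {'date': '2024-05-31', 'JHR': 0},
#                    '2024-06-10': {'date': '2024-06-10', 'JHR': 0}}}
# checkEmpty() later increments these zeros once per (date, state) file found.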
def check(params):
    global msia_tz, date_retrieve_limit, date_count, collection_name
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    today = DateTime.now(tzinfo=msia_tz)
    start_date = DateTime.getDaysAgo(date_retrieve_limit, datefrom=today)
    # offset 24 hours to include today
    durations = DateTime.getBetween([start_date, today], element='date', offset=24)['order']
    Logger.v('durations', durations)
    data = db[collection_name].aggregate([{
        '$match': {
            'state_updated_at': {'$in': durations},
            'facility_updated_at': {'$in': durations},
        }
    }, {
        '$project': {'_id': 0, 'inserted_at': 0, 'updated_at': 0}
    }])
    data = list(data)
    Logger.v('Total stock issue integrity in', date_retrieve_limit, 'days:', len(data))
    state_data = {}
    facility_data_by_state = {}
    for idx in range(len(data)):
        row = data[idx]
        state_code = fn.getNestedElement(row, 'state_code')
        if state_code not in facility_data_by_state:
            facility_data_by_state[state_code] = {}
        state_data = addIntegrityData(data={'row': row, 'to_update': state_data},
                                      category='state')
        facility_data_by_state[state_code] = addIntegrityData(
            data={'row': row, 'to_update': facility_data_by_state[state_code]},
            category='facility')
        if date_count > date_retrieve_limit:  # limit the loop to N days of data
            break
    date_count = 0  # reset to the 0th day
    return {
        'state': state_data,
        'state_facility': facility_data_by_state,
    }
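# Shape sketch of one matched document after the $project stage (values
# illustrative; only state_code is read by the loop above, and the
# *_updated_at fields are the ones matched against 'durations'):
#   {'state_code': 'JHR',
#    'state_updated_at': '2024-06-09',
#    'facility_updated_at': '2024-06-09',
#    ...}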
def updateDropdownOptions(params):
    option_keys = fn.getNestedElement(params, 'keys.option', ['state'])
    today = fn.getNestedElement(
        params, 'schedule_params.today',
        DateTime.toString(DateTime.now(tzinfo=msia_tz)))
    data = {}
    crawled_data = {}
    # crawl from API URL (get options from API)
    # for key in keys:
    #     url = api_links[key]
    #     # url = generateUrl(api_links[key])
    #     response = requests.get(url)
    #     json_response = json.loads(response.text)
    #     Logger.v('json_response', json_response)
    #     crawled_data[key] = json_response
    #     Logger.v('Crawled', url)
    # Logger.v('Done crawling.')
    # save(data)

    # read from file
    for key in option_keys:
        filename = api_files[key]
        crawled_data[key] = File.readJson(filename)
    # convert keys to snake_case, string values to lowercase
    for key in crawled_data:
        if key not in data:
            data[key] = []
        for idx in range(len(crawled_data[key])):
            row = crawled_data[key][idx]
            obj_ = {}
            for row_key in row:
                row_value = row[row_key]
                new_key = fn.camelToSnakecase(str=row_key)
                if type(row_value) == str:
                    new_value = row_value.lower()
                elif row_value is None:
                    new_value = 'null'
                else:
                    new_value = row_value
                obj_[new_key] = new_value
            data[key].append(obj_)
    # write one JSON snapshot per option key, named by date
    for key in data:
        folder_path = '/'.join([crawl_folder, key])
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        filename = '{0}/{1}'.format(folder_path, today)
        Logger.v('Saving', filename)
        fn.writeJSONFile(filename='{0}.json'.format(filename), data=data[key])
    # refresh mongo collections from the latest snapshot when required
    for key in option_keys:
        directory = '/'.join([crawl_folder, key])
        raw = File.readLatestFile(directory=directory)
        refresh_collection = refreshIsRequired(data=raw, collection_name=key)
        if refresh_collection:
            refreshCollection(data=raw, collection_name=key)
            Logger.v('refreshed', key)
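# Normalization sketch for the loop above: a crawled option row such as
#   {'StateCode': 'JHR', 'StateName': 'Johor', 'Remark': None}
# is assumed to become
#   {'state_code': 'jhr', 'state_name': 'johor', 'remark': 'null'}
# (keys snake_cased via fn.camelToSnakecase, strings lowercased,
# None replaced by the string 'null'); the field names are illustrative.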
def checkEmpty(params):
    global global_check_data
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    custom_params = copy.deepcopy(params)
    report_keys = fn.getNestedElement(params, 'keys.report', ['procurement', 'budget'])
    interval = fn.getNestedElement(params, 'interval', 1)
    past_dates = DateTime.getPastDate(count=12, duration=interval)  # check the previous 12 months of data
    year = Crawl.extractYear(data=past_dates[0])
    first_date = past_dates[0][-1][0]
    last_date = past_dates[0][0][1]
    # Logger.v('first_date', first_date, 'last_date', last_date)
    state_by = 'state_code'
    states = list(db['state'].find({}, {'_id': 0, state_by: 1}))
    result = {}
    datetime = DateTime.toString(DateTime.now(tzinfo=msia_tz), date_format='%Y-%m-%d-%H-%M-%S')
    custom_params['first_date'] = first_date
    custom_params['last_date'] = last_date
    custom_params['state_by'] = state_by
    custom_params['states'] = states
    temp_result = generateTemplate(params=custom_params)
    # scan the crawl folders and count every (date, state) file found
    for rk in report_keys:
        if rk not in global_check_data:
            global_check_data[rk] = []
        for y in year:
            root_path = '{0}/{1}/year_{2}'.format(crawl_folder, rk, y)
            openDir(root_path, rk)
        for gcd in global_check_data[rk]:
            date = gcd.split('_')[0]
            state = gcd.split('_')[1]
            if DateTime.inrange(date, [first_date, last_date]):
                try:
                    temp_result[rk][date][state] += 1
                except Exception as e:
                    # Logger.v('Main.checkEmpty:', e)
                    pass  # (date, state) pairs outside the template are skipped
    # flatten the per-date template rows and export one sheet per report key
    for rk in temp_result:
        if rk not in result:
            result[rk] = []
        for date in temp_result[rk]:
            result[rk].append(temp_result[rk][date])
        filename = '{0}/{1}_check_moh_empty'.format(test_folder, rk)
        # filename = 'tests/{0}_{1}_check_moh_empty'.format(rk, datetime)
        fn.writeExcelFile(filename=filename, data=result[rk])
    global_check_data = {}
    return result
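# Flow sketch: generateTemplate() seeds every (date, state) cell with 0, the
# directory scan increments a cell for each crawled file found, so any cell
# still at 0 in the exported sheet marks a (date, state) pair with no data.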
def run(params):
    Mail.send(
        '{0} Crawl - Store starts'.format(DateTime.getReadableDate(DateTime.now())),
        'Start at: {0}'.format(DateTime.now(tzinfo=msia_tz)))
    recordTime(key='create_schedule_start_time')
    Debug = DebugManager.DebugManager()
    Debug.start()
    start_crawl = fn.getNestedElement(params, 'schedule_params.start_crawl', False)
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty', False)
    Logger.v('Creating schedule:')
    updateDropdownOptions(params=params)
    crawl_params = generateCrawlParam(params)
    Debug.trace('Generate Crawl Params')
    createSchedules({'pages': crawl_params})
    Debug.trace('Create Schedule')
    if start_crawl:
        Crawl.start(params)
        Debug.trace('crawling')
    recordTime(key='create_schedule_end_time')
    Debug.show('Run')
#!/usr/bin/python3
from Crawl import Main as Crawl
from lib import fn
from lib import DateTime

try:
    start_time = DateTime.now()
    fn.writeTestFile(
        filename='cron_start_{0}'.format(DateTime.now()),
        data='successful cron tab, cron time:{0}'.format(start_time))
    Crawl.run({
        'process': ['schedule', 'crawl', 'update'],  # 'schedule', 'crawl', 'update'
        'schedule_params': {
            'start_crawl': False,
            'check_empty': True,
        },
        'interval': 1,  # months between start date - end date
        'keys': {
            'option': ['state', 'ptj', 'facility', 'facility_type'],
            'report': ['budget', 'procurement'],  # 'budget', 'procurement'
        },
        'filter': {
            'facility_code': False,
        },
    })
    end_time = DateTime.now()
    print('end_time', end_time)
    fn.writeTestFile(
        filename='cron_end_{0}'.format(end_time),
        # assumed: the data argument mirrors the cron-start write above
        # (the original call is truncated in the source)
        data='successful cron tab, cron time:{0}'.format(end_time))
except Exception as e:
    # assumed handler: the original except clause did not survive; log the
    # failure the same way the start/end markers are written
    fn.writeTestFile(
        filename='cron_error_{0}'.format(DateTime.now()),
        data='cron failed: {0}'.format(e))
def generateCrawlParam(params):
    Debug = DebugManager.DebugManager()
    Debug.start()
    global pass_month_quantity
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    crawl_params = {}
    limit_for_test = 10
    report_keys = fn.getNestedElement(params, 'keys.report', ['budget', 'procurement'])
    interval = fn.getNestedElement(params, 'interval', 1)
    filter_facility_code = fn.getNestedElement(params, 'filter.facility_code', True)
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty', False)
    today = fn.getNestedElement(
        params, 'schedule_params.today',
        DateTime.toString(DateTime.now(tzinfo=msia_tz)))
    if check_empty:
        past_dates = DateTime.getPastDate(
            count=pass_month_quantity, duration=interval,
            end=DateTime.convertDateTimeFromString(today))
    else:
        past_dates = DateTime.getPastDate(count=pass_month_quantity, duration=interval)
    state_codes = retrieveOption(collection_name='state',
                                 show_keys=['state_code'], hide_keys=['_id'])
    state_code = extractListByKey(data=state_codes, key='state_code')
    facility_codes = retrieveOption(collection_name='facility',
                                    show_keys=['facility_code'], hide_keys=['_id'])
    facility_code = extractListByKey(data=facility_codes, key='facility_code')
    for key in report_keys:
        Debug.trace()
        if key not in crawl_params:
            crawl_params[key] = []
        # crawl the full date range when the collection is empty,
        # otherwise only the most recent duration
        mongo_data = list(db[key].find({}, {}))
        if len(mongo_data) == 0:
            dates = past_dates[0][:]
        else:
            dates = past_dates[0][:1]
        year = extractYear(data=dates)
        if key == 'budget':
            # budget is crawled per financial year and state
            if not filter_facility_code:
                for y in year:
                    for sc in state_code:
                        obj_ = {
                            'financial_year': y,
                            'state_code': sc,
                            'page_type': key,
                            'upid': '_'.join([sc, y]),
                            'url': api_links[key].format(sc, y, ''),
                            'start_date': today,
                            'end_date': today,
                        }
                        if obj_ not in crawl_params[key]:
                            crawl_params[key].append(obj_)
            else:
                for y in year:
                    for sc in state_code:
                        for fc in facility_code[:limit_for_test]:
                            obj_ = {
                                'financial_year': y,
                                'state_code': sc,
                                'page_type': key,
                                'upid': '_'.join([sc, y, fc]),
                                'facility_code': fc,
                                'url': api_links[key].format(sc, y, fc),
                                'start_date': today,
                                'end_date': today,
                            }
                            if obj_ not in crawl_params[key]:
                                crawl_params[key].append(obj_)
        elif key == 'procurement':
            # procurement is crawled per month window and state
            if not filter_facility_code:
                for past_duration in dates:
                    start_date = DateTime.toString(
                        DateTime.getDaysAgo(days_to_crawl=-1, datefrom=past_duration[0]))
                    end_date = DateTime.toString(
                        DateTime.getDaysAgo(days_to_crawl=1, datefrom=past_duration[1]))
                    for sc in state_code:
                        obj_ = {
                            'state_code': sc,
                            'start_date': start_date,
                            'end_date': end_date,
                            'page_type': key,
                            'upid': '_'.join([sc, start_date, end_date]),
                            'url': api_links[key].format(
                                sc, start_date.replace('-', ''),
                                end_date.replace('-', ''), ''),
                        }
                        if obj_ not in crawl_params[key]:
                            crawl_params[key].append(obj_)
            else:
                for past_duration in dates:
                    start_date = DateTime.toString(
                        DateTime.getDaysAgo(days_to_crawl=-1, datefrom=past_duration[0]))
                    end_date = DateTime.toString(
                        DateTime.getDaysAgo(days_to_crawl=1, datefrom=past_duration[1]))
                    for sc in state_code:
                        for fc in facility_code[:limit_for_test]:
                            obj_ = {
                                'state_code': sc,
                                'start_date': start_date,
                                'end_date': end_date,
                                'page_type': key,
                                'facility_code': fc,
                                'upid': '_'.join([sc, start_date, end_date, fc]),
                                'url': api_links[key].format(
                                    sc, start_date.replace('-', ''),
                                    end_date.replace('-', ''), fc),
                            }
                            if obj_ not in crawl_params[key]:
                                crawl_params[key].append(obj_)
    # export the generated params per report key for inspection
    for c in crawl_params:
        fn.writeExcelFile(filename='{0}/{1}'.format(test_folder, c), data=crawl_params[c])
    Logger.v('crawl_params', len(crawl_params))
    Debug.show('Generate Crawl Params')
    return crawl_params
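# Shape sketch of one generated 'procurement' crawl param (values illustrative;
# the api_links URL template is defined elsewhere in this module):
#   {'state_code': 'JHR',
#    'start_date': '2024-05-01', 'end_date': '2024-05-31',
#    'page_type': 'procurement',
#    'upid': 'JHR_2024-05-01_2024-05-31',
#    'url': api_links['procurement'].format('JHR', '20240501', '20240531', '')}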