def getMissingDates(data):
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    missing_dates = {}
    today = DateTime.now(tzinfo=msia_tz)  # date only
    state_by = 'state_code'
    states = list(db['state'].find({}, {'_id': 0, state_by: 1}))
    for rk in data:
        row = data[rk]
        if rk not in missing_dates:
            missing_dates[rk] = []
        dates = groupDates(params={'states': states, 'state_by': state_by}, data=row)
        for date in dates['missing']:
            # last day of the month that `date` falls in
            end_date_of_month = DateTime.getDaysAgo(
                days_to_crawl=1,
                datefrom=DateTime.getNextMonth(DateTime.convertDateTimeFromString(date)))
            day_diff = DateTime.getDifferenceBetweenDuration([today, end_date_of_month])
            if day_diff >= 0:  # month not over yet: cap at today
                date_str = DateTime.toString(today)
            else:
                date_str = DateTime.toString(end_date_of_month)
            if date_str not in dates['crawled']:
                missing_dates[rk].append(date_str)
        missing_dates[rk] = sorted(list(set(missing_dates[rk])), reverse=True)
    return missing_dates
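
# Usage sketch (hypothetical input shape; assumes the module-level
# SharedMemoryManager, DateTime and msia_tz are configured as above, and that
# `data` is keyed by report name):
#   missing = getMissingDates(data={'procurement': procurement_rows})
#   # -> {'procurement': ['2020-03-31', '2020-02-29', ...]}  (newest first, deduplicated)
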
def generateTemplate(params):
    result = {}
    report_keys = fn.getNestedElement(params, 'keys.report', ['procurement', 'budget'])
    first_date = fn.getNestedElement(params, 'first_date')
    last_date = fn.getNestedElement(params, 'last_date')
    state_by = fn.getNestedElement(params, 'state_by')
    states = fn.getNestedElement(params, 'states')
    today = DateTime.now(tzinfo=msia_tz)  # date only
    for rk in report_keys:
        if rk not in result:
            result[rk] = {}
        for date in DateTime.getBetween([first_date, last_date], element='date')['order']:
            # last day of the month that `date` falls in
            end_date_of_month = DateTime.getDaysAgo(
                days_to_crawl=1,
                datefrom=DateTime.getNextMonth(DateTime.convertDateTimeFromString(date)))
            day_diff = DateTime.getDifferenceBetweenDuration([today, end_date_of_month])
            if day_diff >= 0:  # month not over yet: cap at today
                date_str = DateTime.toString(today)
            else:
                date_str = DateTime.toString(end_date_of_month)
            if date_str not in result[rk]:
                result[rk][date_str] = {}
            result[rk][date_str]['date'] = date_str
            for state in states:
                result[rk][date_str][state[state_by]] = 0
    return result
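
# Usage sketch (hypothetical values): builds a zero-filled per-state template
# for every month-end between first_date and last_date, keyed by report then date.
#   template = generateTemplate(params={
#       'keys': {'report': ['procurement']},
#       'first_date': '2019-01-01', 'last_date': '2019-12-31',
#       'state_by': 'state_code', 'states': [{'state_code': 'jhr'}],
#   })
#   # -> {'procurement': {'2019-01-31': {'date': '2019-01-31', 'jhr': 0}, ...}}
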
def update(data):
    global msia_tz, column_keymap, collection_name
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    state_facility_code = '_'.join([str(data['state']), str(data['facility_code'])])
    if state_facility_code not in unique_facility:
        state_name = fn.getNestedElement(data, 'state')
        state_code = fn.getNestedElement(data, 'state')
        facility_name = fn.getNestedElement(data, 'facility_name')
        facility_code = fn.getNestedElement(data, 'facility_code')
        date = fn.getNestedElement(data, 'date')
        date_string = DateTime.toString(date)
        values = {
            'state_name': state_name,
            'state_code': state_code,
            'facility_name': facility_name,
            'facility_code': facility_code,
            'state_updated_at': date_string,
            'facility_updated_at': date_string,
            'date': date_string,
        }
        dbManager.addBulkInsert(collection_name, values, batch=True)
        unique_facility.append(state_facility_code)
    dbManager.executeBulkOperations(collection_name)
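
# Usage sketch (hypothetical row): queues one facility row for bulk insert;
# rows already seen in this run are skipped via the module-level
# unique_facility cache.
#   update(data={'state': 'jhr', 'facility_code': 'f001',
#                'facility_name': 'klinik a',
#                'date': DateTime.now(tzinfo=msia_tz)})
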
def getBackdateList(params):
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    dates = []
    for idx in range(0, date_retrieve_limit + 1):  # 7 days backward + 1 (today)
        if idx == 0:
            collection_name = 'stock_latest'
        else:
            collection_name = 'stock_{0}'.format(idx)
        data = list(db[collection_name].find({}, {'_id': 0, 'date': 1}).limit(1))
        if data:
            date = DateTime.toString(data[0]['date'])
            dates.append(date)
    result = {
        'date': sorted(list(set(dates)), reverse=True)
    }
    return result
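
# Usage sketch: collects the dates held by stock_latest plus stock_1..stock_N
# (N = date_retrieve_limit), newest first.
#   backdates = getBackdateList(params={})
#   # -> {'date': ['2020-03-10', '2020-03-09', ...]}
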
def updateDropdownOptions(params):
    option_keys = fn.getNestedElement(params, 'keys.option', ['state'])
    today = fn.getNestedElement(params, 'schedule_params.today',
                                DateTime.toString(DateTime.now(tzinfo=msia_tz)))
    data = {}
    crawled_data = {}
    # crawl from API URL (get options from API):
    # for key in option_keys:
    #     url = api_links[key]
    #     response = requests.get(url)
    #     crawled_data[key] = json.loads(response.text)
    #     Logger.v('Crawled', url)
    # Logger.v('Done crawling.')
    # save(data)
    # read from file
    for key in option_keys:
        filename = api_files[key]
        crawled_data[key] = File.readJson(filename)
    # convert keys to snake_case and string values to lowercase
    for key in crawled_data:
        if key not in data:
            data[key] = []
        for row in crawled_data[key]:
            obj_ = {}
            for row_key in row:
                row_value = row[row_key]
                new_key = fn.camelToSnakecase(str=row_key)
                if isinstance(row_value, str):
                    new_value = row_value.lower()
                elif row_value is None:
                    new_value = 'null'
                else:
                    new_value = row_value
                obj_[new_key] = new_value
            data[key].append(obj_)
    for key in data:
        folder_path = '/'.join([crawl_folder, key])
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        filename = '{0}/{1}'.format(folder_path, today)
        Logger.v('Saving', filename)
        fn.writeJSONFile(filename='{0}.json'.format(filename), data=data[key])
    for key in option_keys:
        directory = '/'.join([crawl_folder, key])
        raw = File.readLatestFile(directory=directory)
        refresh_collection = refreshIsRequired(data=raw, collection_name=key)
        if refresh_collection:
            refreshCollection(data=raw, collection_name=key)
            Logger.v('refreshed', key)
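
# Usage sketch (hypothetical schedule_params): refreshes the 'state' dropdown
# options from the bundled JSON files, snapshots them under crawl_folder, and
# re-seeds the collection only if the data changed.
#   updateDropdownOptions(params={'keys': {'option': ['state']},
#                                 'schedule_params': {'today': '2020-03-10'}})
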
def checkEmpty(params):
    global global_check_data
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    custom_params = copy.deepcopy(params)
    report_keys = fn.getNestedElement(params, 'keys.report', ['procurement', 'budget'])
    interval = fn.getNestedElement(params, 'interval', 1)
    past_dates = DateTime.getPastDate(count=12, duration=interval)  # check previous 12 months of data
    year = Crawl.extractYear(data=past_dates[0])
    first_date = past_dates[0][-1][0]
    last_date = past_dates[0][0][1]
    state_by = 'state_code'
    states = list(db['state'].find({}, {'_id': 0, state_by: 1}))
    result = {}
    datetime = DateTime.toString(DateTime.now(tzinfo=msia_tz), date_format='%Y-%m-%d-%H-%M-%S')
    custom_params['first_date'] = first_date
    custom_params['last_date'] = last_date
    custom_params['state_by'] = state_by
    custom_params['states'] = states
    temp_result = generateTemplate(params=custom_params)
    for rk in report_keys:
        if rk not in global_check_data:
            global_check_data[rk] = []
        for y in year:
            root_path = '{0}/{1}/year_{2}'.format(crawl_folder, rk, y)
            openDir(root_path, rk)
        for gcd in global_check_data[rk]:
            date, state = gcd.split('_')[0], gcd.split('_')[1]
            if DateTime.inrange(date, [first_date, last_date]):
                try:
                    temp_result[rk][date][state] += 1
                except KeyError:  # date/state not in the generated template; skip
                    pass
    for rk in temp_result:
        if rk not in result:
            result[rk] = []
        for date in temp_result[rk]:
            result[rk].append(temp_result[rk][date])
        filename = '{0}/{1}_check_moh_empty'.format(test_folder, rk)
        # filename = 'tests/{0}_{1}_check_moh_empty'.format(rk, datetime)
        fn.writeExcelFile(filename=filename, data=result[rk])
    global_check_data = {}
    return result
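
# Usage sketch: scans the last 12 crawl months on disk (via openDir filling
# global_check_data) and writes one "<report>_check_moh_empty" Excel file per
# report key under test_folder.
#   summary = checkEmpty(params={'keys': {'report': ['procurement']}, 'interval': 1})
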
def getCollectionName(params):
    global latest_collection_name
    latest_collection_name = 'stock_latest'  # set default
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    data = list(db[latest_collection_name].find({}, {'_id': 0, 'date': 1}).limit(1))
    if data:
        latest_date_string = DateTime.toString(data[0]['date'])
        latest_date = DateTime.convertDateTimeFromString(latest_date_string)
        date_string = fn.getNestedElement(params, 'date', None)
        if date_string:
            date = DateTime.convertDateTimeFromString(date_string)
            difference = latest_date - date
            day_diff = math.floor(difference.total_seconds() / float(86400))
            if day_diff > 0:
                latest_collection_name = 'stock_{0}'.format(day_diff)
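
# Usage sketch: resolves which stock_* collection backs a requested date; the
# answer is left in the module-level latest_collection_name.
#   getCollectionName(params={'date': '2020-03-08'})
#   # latest_collection_name == 'stock_2' if stock_latest holds 2020-03-10
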
def getIntegrity(params, data):
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    check_data = fn.getNestedElement(data, 'check_data')
    facility = ModelFacility.getActiveFacility()
    filter_key = fn.getNestedElement(params, 'filter_key')
    durations = fn.getNestedElement(params, 'durations')
    result = fn.getNestedElement(data, 'to_update')
    state_data = fn.getNestedElement(data, 'state')
    facility_data_by_state = fn.getNestedElement(data, 'facility')
    data_list = getFacilityByState(params=params, data=check_data)
    for key in data_list:
        row = fn.getNestedElement(data_list, key)
        count = getTotalCount(params={'filter_key': filter_key, 'key': key},
                              data={'row': row, 'facility': facility})
        obj_ = {
            'id': fn.convertToSnakecase(fn.getNestedElement(row, 'id')),
            'name': fn.getNestedElement(row, 'name'),
            'code': fn.getNestedElement(row, 'code'),
            'data': [],
        }
        for idx in range(len(durations) - 1, -1, -1):
            date = durations[idx]
            previous_date = DateTime.toString(DateTime.getDaysAgo(1, datefrom=date))
            if filter_key:
                date_count = fn.getNestedElement(
                    facility_data_by_state,
                    '{0}.{1}.{2}'.format(filter_key, key, date), 0)
                if not date_count:
                    date_count = 0
            else:
                # do not include positives; count missing facility quantity only
                # date_count = fn.getNestedElement(state_data, '{0}.{1}'.format(key, date), 0)
                date_count = 0
            if filter_key:
                val = date_count - count
            else:
                val = 0
            obj_['data'].append({
                # negative = missing, 0 = complete, positive = not found in user-uploaded facility data
                previous_date: val,
            })
        if not filter_key:  # top level: recurse once per state to fill 'facility'
            obj_['facility'] = []
            obj_['facility'] = getIntegrity(params={
                'filter_key': key,
                'durations': durations,
            }, data={
                'state': state_data,
                'facility': facility_data_by_state,
                'to_update': obj_['facility'],
                'check_data': check_data,
            })
        result.append(obj_)
    return result
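
# Usage sketch (hypothetical inputs): the top-level call passes
# filter_key=None, which walks the states and recurses once per state with
# filter_key=<state key> to fill each 'facility' list.
#   report = getIntegrity(params={'filter_key': None, 'durations': durations},
#                         data={'check_data': check_data, 'state': state_data,
#                               'facility': facility_data, 'to_update': []})
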
def generateCrawlParam(params):
    Debug = DebugManager.DebugManager()
    Debug.start()
    global pass_month_quantity
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    crawl_params = {}
    limit_for_test = 10
    report_keys = fn.getNestedElement(params, 'keys.report', ['budget', 'procurement'])
    interval = fn.getNestedElement(params, 'interval', 1)
    filter_facility_code = fn.getNestedElement(params, 'filter.facility_code', True)
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty', False)
    today = fn.getNestedElement(params, 'schedule_params.today',
                                DateTime.toString(DateTime.now(tzinfo=msia_tz)))
    if check_empty:
        past_dates = DateTime.getPastDate(count=pass_month_quantity, duration=interval,
                                          end=DateTime.convertDateTimeFromString(today))
    else:
        past_dates = DateTime.getPastDate(count=pass_month_quantity, duration=interval)
    state_codes = retrieveOption(collection_name='state',
                                 show_keys=['state_code'], hide_keys=['_id'])
    state_code = extractListByKey(data=state_codes, key='state_code')
    facility_codes = retrieveOption(collection_name='facility',
                                    show_keys=['facility_code'], hide_keys=['_id'])
    facility_code = extractListByKey(data=facility_codes, key='facility_code')
    for key in report_keys:
        Debug.trace()
        if key not in crawl_params:
            crawl_params[key] = []
        mongo_data = list(db[key].find({}, {}))
        if len(mongo_data) == 0:
            dates = past_dates[0][:]  # no data yet: crawl the full backlog
        else:
            dates = past_dates[0][:1]  # data exists: crawl the latest duration only
        year = extractYear(data=dates)
        if key == 'budget':
            if not filter_facility_code:
                for y in year:
                    for sc in state_code:
                        obj_ = {
                            'financial_year': y,
                            'state_code': sc,
                            'page_type': key,
                            'upid': '_'.join([sc, y]),
                            'url': api_links[key].format(sc, y, ''),
                            'start_date': today,
                            'end_date': today,
                        }
                        if obj_ not in crawl_params[key]:
                            crawl_params[key].append(obj_)
            else:
                for y in year:
                    for sc in state_code:
                        for fc in facility_code[:limit_for_test]:
                            obj_ = {
                                'financial_year': y,
                                'state_code': sc,
                                'page_type': key,
                                'upid': '_'.join([sc, y, fc]),
                                'facility_code': fc,
                                'url': api_links[key].format(sc, y, fc),
                                'start_date': today,
                                'end_date': today,
                            }
                            if obj_ not in crawl_params[key]:
                                crawl_params[key].append(obj_)
        elif key == 'procurement':
            if not filter_facility_code:
                for past_duration in dates:
                    start_date = DateTime.toString(
                        DateTime.getDaysAgo(days_to_crawl=-1, datefrom=past_duration[0]))
                    end_date = DateTime.toString(
                        DateTime.getDaysAgo(days_to_crawl=1, datefrom=past_duration[1]))
                    for sc in state_code:
                        obj_ = {
                            'state_code': sc,
                            'start_date': start_date,
                            'end_date': end_date,
                            'page_type': key,
                            'upid': '_'.join([sc, start_date, end_date]),
                            'url': api_links[key].format(sc, start_date.replace('-', ''),
                                                         end_date.replace('-', ''), ''),
                        }
                        if obj_ not in crawl_params[key]:
                            crawl_params[key].append(obj_)
            else:
                for past_duration in dates:
                    start_date = DateTime.toString(
                        DateTime.getDaysAgo(days_to_crawl=-1, datefrom=past_duration[0]))
                    end_date = DateTime.toString(
                        DateTime.getDaysAgo(days_to_crawl=1, datefrom=past_duration[1]))
                    for sc in state_code:
                        for fc in facility_code[:limit_for_test]:
                            obj_ = {
                                'state_code': sc,
                                'start_date': start_date,
                                'end_date': end_date,
                                'page_type': key,
                                'facility_code': fc,
                                'upid': '_'.join([sc, start_date, end_date, fc]),
                                'url': api_links[key].format(sc, start_date.replace('-', ''),
                                                             end_date.replace('-', ''), fc),
                            }
                            if obj_ not in crawl_params[key]:
                                crawl_params[key].append(obj_)
    for c in crawl_params:
        fn.writeExcelFile(filename='{0}/{1}'.format(test_folder, c), data=crawl_params[c])
    Logger.v('crawl_params', len(crawl_params))
    Debug.show('Generate Crawl Params')
    return crawl_params
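
# Usage sketch: builds crawl URLs for both report types; budget iterates
# year x state (x facility when filter.facility_code is on), procurement
# iterates duration x state (x facility).
#   crawl_params = generateCrawlParam(params={
#       'keys': {'report': ['budget', 'procurement']},
#       'interval': 1,
#       'filter': {'facility_code': True},
#       'schedule_params': {'check_empty': False, 'today': '2020-03-10'},
#   })
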