Example #1
0
def getMissingDates(data):
	"""Collect, per report key, the dates that still have to be crawled.

	Each value of ``data`` is handed to ``groupDates`` together with the
	state codes read from the ``state`` collection. Every date reported
	under ``'missing'`` is turned into a target date string and kept unless
	that string is already listed under ``'crawled'``.

	Args:
		data: Mapping of report key to the row passed on to ``groupDates``.

	Returns:
		dict: report key -> list of unique date strings, sorted in
		descending order.
	"""
	db = SharedMemoryManager.getInstance().query()
	today = DateTime.now(tzinfo=msia_tz)  # date only
	state_by = 'state_code'
	states = list(db['state'].find({}, {'_id': 0, state_by: 1}))
	missing_dates = {}
	for rk in data:
		# A fresh params dict per call, so groupDates can never leak changes
		# from one report key into the next.
		dates = groupDates(params={'states': states, 'state_by': state_by}, data=data[rk])
		pending = set()
		for date in dates['missing']:
			# presumably the last day of the month `date` falls in (one day
			# before the result of getNextMonth) -- TODO confirm
			end_date_of_month = DateTime.getDaysAgo(
				days_to_crawl=1,
				datefrom=DateTime.getNextMonth(DateTime.convertDateTimeFromString(date)))
			day_diff = DateTime.getDifferenceBetweenDuration([today, end_date_of_month])

			# A non-negative difference selects today instead of the month end.
			target = today if day_diff >= 0 else end_date_of_month
			date_str = DateTime.toString(target)

			if date_str not in dates['crawled']:
				pending.add(date_str)
		missing_dates[rk] = sorted(pending, reverse=True)
	return missing_dates
Example #2
0
def recordTime(key):
    """Store the current time (``msia_tz``) for ``key`` in ``global_text``.

    The matching "start" key is derived by replacing every ``'end'`` in
    ``key`` with ``'start'``. If that start key has no entry yet it is
    stamped first; when ``key`` itself differs from the start key (i.e. it
    contained ``'end'``), ``key`` is stamped as well.
    """
    global global_text
    start_key = key.replace('end', 'start')
    if start_key not in global_text:
        global_text[start_key] = DateTime.now(tzinfo=msia_tz)

    is_end_key = key != start_key
    if is_end_key and start_key in global_text:
        global_text[key] = DateTime.now(tzinfo=msia_tz)
Example #3
0
def stockIntegrity(params, data):
    """Build the integrity result for the last ``date_retrieve_limit`` days.

    Args:
        params: Not used by this function.
        data: Nested structure providing the ``'state'`` and
            ``'state_facility'`` elements (read via ``fn.getNestedElement``).

    Returns:
        list: The output of ``getIntegrity`` after ``updateStateData`` has
        been applied to it, sorted ascending by each entry's ``'name'``.
    """
    debug = DebugManager.DebugManager()
    debug.start()

    now = DateTime.now(tzinfo=msia_tz)
    window_start = DateTime.getDaysAgo(date_retrieve_limit, datefrom=now)
    # offset 24 hour to include today
    date_range = DateTime.getBetween([window_start, now],
                                     element='date',
                                     offset=24)
    durations = date_range['order']

    state_data = fn.getNestedElement(data, 'state')
    facility_data_by_state = fn.getNestedElement(data, 'state_facility')
    check_data = combinedFacilityList(data=facility_data_by_state)

    integrity_params = {'durations': durations}
    integrity_data = {
        'facility': facility_data_by_state,
        'state': state_data,
        'to_update': [],  # start from an empty list
        'check_data': check_data,
    }
    result = getIntegrity(params=integrity_params, data=integrity_data)
    updateStateData(result)
    result = sorted(result, key=lambda entry: entry['name'])

    debug.end()
    debug.show('Model.Structure.stockIntegrity')
    return result
Example #4
0
def generateTemplate(params):
	"""Create a zero-filled counter table per report key, date and state.

	Args:
		params: Nested structure read via ``fn.getNestedElement``:
			``keys.report`` (default ``['procurement', 'budget']``),
			``first_date``, ``last_date``, ``state_by`` (key naming the
			state field) and ``states`` (sequence of state rows).

	Returns:
		dict: ``{report_key: {date_str: {'date': date_str, <state>: 0, ...}}}``
		where ``date_str`` is derived from every date between ``first_date``
		and ``last_date``.
	"""
	report_keys = fn.getNestedElement(params, 'keys.report', ['procurement', 'budget'])
	first_date = fn.getNestedElement(params, 'first_date')
	last_date = fn.getNestedElement(params, 'last_date')
	state_by = fn.getNestedElement(params, 'state_by')
	states = fn.getNestedElement(params, 'states')
	today = DateTime.now(tzinfo=msia_tz)  # date only
	result = {}
	for rk in report_keys:
		rows_by_date = result.setdefault(rk, {})

		for date in DateTime.getBetween([first_date, last_date], element='date')['order']:
			# presumably the last day of the month `date` falls in (one day
			# before the result of getNextMonth) -- TODO confirm
			end_date_of_month = DateTime.getDaysAgo(
				days_to_crawl=1,
				datefrom=DateTime.getNextMonth(DateTime.convertDateTimeFromString(date)))
			day_diff = DateTime.getDifferenceBetweenDuration([today, end_date_of_month])

			# A non-negative difference selects today instead of the month end.
			target = today if day_diff >= 0 else end_date_of_month
			date_str = DateTime.toString(target)

			# Several dates can map to the same date_str; the row is reused
			# and simply reset to the same zero values.
			row = rows_by_date.setdefault(date_str, {})
			row['date'] = date_str
			for state_row in states:
				row[state_row[state_by]] = 0
	return result
Example #5
0
def check(params):
    """Load recent integrity rows and group them by state and by facility.

    Rows are taken from ``db[collection_name]`` where both
    ``state_updated_at`` and ``facility_updated_at`` are among the dates of
    the last ``date_retrieve_limit`` days; ``_id``, ``inserted_at`` and
    ``updated_at`` are excluded from the projection.

    Args:
        params: Not used by this function.

    Returns:
        dict: ``{'state': ..., 'state_facility': {state_code: ...}}`` as
        accumulated through ``addIntegrityData``.
    """
    global date_count
    db = SharedMemoryManager.getInstance().query()

    now = DateTime.now(tzinfo=msia_tz)
    window_start = DateTime.getDaysAgo(date_retrieve_limit, datefrom=now)
    # offset 24 to include today
    durations = DateTime.getBetween([window_start, now],
                                    element='date',
                                    offset=24)['order']
    Logger.v('durations', durations)

    pipeline = [
        {
            '$match': {
                'state_updated_at': {'$in': durations},
                'facility_updated_at': {'$in': durations},
            }
        },
        {
            '$project': {'_id': 0, 'inserted_at': 0, 'updated_at': 0}
        },
    ]
    data = list(db[collection_name].aggregate(pipeline))
    Logger.v('Total stock issue integrity in', date_retrieve_limit, 'days:',
             len(data))

    state_data = {}
    facility_data_by_state = {}
    for row in data:
        state_code = fn.getNestedElement(row, 'state_code')
        facility_bucket = facility_data_by_state.setdefault(state_code, {})

        state_data = addIntegrityData(
            data={'row': row, 'to_update': state_data},
            category='state')
        facility_data_by_state[state_code] = addIntegrityData(
            data={'row': row, 'to_update': facility_bucket},
            category='facility')

        # limit loop data/ show data in N days
        # NOTE(review): date_count is never incremented in this function;
        # unless addIntegrityData changes the global, this break cannot
        # trigger after the first reset -- confirm.
        if date_count > date_retrieve_limit:
            break
        # reset to 0th day
        date_count = 0

    return {
        'state': state_data,
        'state_facility': facility_data_by_state,
    }
Example #6
0
def updateDropdownOptions(params):
    """Normalise the dropdown option files and refresh their collections.

    For every option key the JSON file named in ``api_files`` is read, each
    row's keys are converted with ``fn.camelToSnakecase`` and its values
    normalised (strings lower-cased, ``None`` replaced by the string
    ``'null'``, everything else untouched). The result is written to
    ``<crawl_folder>/<key>/<today>.json``. Afterwards the latest file of
    each key's folder is loaded and, when ``refreshIsRequired`` says so,
    handed to ``refreshCollection``.

    Args:
        params: Nested structure read via ``fn.getNestedElement``:
            ``keys.option`` (default ``['state']``) and
            ``schedule_params.today`` (default: the current ``msia_tz``
            time formatted by ``DateTime.toString``).

    Returns:
        None.
    """
    option_keys = fn.getNestedElement(params, 'keys.option', ['state'])
    today = fn.getNestedElement(
        params, 'schedule_params.today',
        DateTime.toString(DateTime.now(tzinfo=msia_tz)))

    # Options are read from local files; an earlier variant that fetched
    # them over HTTP from api_links was already disabled.
    crawled_data = {key: File.readJson(api_files[key]) for key in option_keys}

    # convert key to snakecase, value to lower
    data = {}
    for key, rows in crawled_data.items():
        normalised_rows = []
        for row in rows:
            obj_ = {}
            for row_key, row_value in row.items():
                new_key = fn.camelToSnakecase(str=row_key)
                if isinstance(row_value, str):
                    new_value = row_value.lower()
                elif row_value is None:
                    new_value = 'null'
                else:
                    new_value = row_value
                obj_[new_key] = new_value
            normalised_rows.append(obj_)
        data[key] = normalised_rows

    for key, rows in data.items():
        folder_path = '/'.join([crawl_folder, key])
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(folder_path, exist_ok=True)
        filename = '{0}/{1}'.format(folder_path, today)
        Logger.v('Saving', filename)
        fn.writeJSONFile(filename='{0}.json'.format(filename), data=rows)

    for key in option_keys:
        directory = '/'.join([crawl_folder, key])
        raw = File.readLatestFile(directory=directory)
        if refreshIsRequired(data=raw, collection_name=key):
            refreshCollection(data=raw, collection_name=key)
            Logger.v('refreshed', key)
Example #7
0
def checkEmpty(params):
	"""Count crawled entries per report key, date and state; export to Excel.

	A zero-filled template covering the previous 12 periods is created with
	``generateTemplate``. For every report key and year ``openDir`` is
	called on ``<crawl_folder>/<rk>/year_<y>``, then each entry of
	``global_check_data[rk]`` (strings of the form ``<date>_<state>...``)
	that lies inside the date range increments the matching template cell.
	One Excel file per report key is written to ``test_folder`` and
	``global_check_data`` is reset to an empty dict.

	Args:
		params: Nested structure read via ``fn.getNestedElement``:
			``keys.report`` (default ``['procurement', 'budget']``) and
			``interval`` (default ``1``). A deep copy, extended with the
			date range and state list, is passed to ``generateTemplate``.

	Returns:
		dict: report key -> list of the per-date template rows.
	"""
	global global_check_data
	db = SharedMemoryManager.getInstance().query()
	custom_params = copy.deepcopy(params)
	report_keys = fn.getNestedElement(params, 'keys.report', ['procurement', 'budget'])
	interval = fn.getNestedElement(params, 'interval', 1)
	past_dates = DateTime.getPastDate(count=12, duration=interval)  # check previous 12 month data
	year = Crawl.extractYear(data=past_dates[0])
	# The last duration supplies the range start and the first supplies the
	# range end -- presumably past_dates[0] is ordered newest first; confirm.
	first_date = past_dates[0][-1][0]
	last_date = past_dates[0][0][1]
	state_by = 'state_code'
	states = list(db['state'].find({}, {'_id': 0, state_by: 1}))

	custom_params['first_date'] = first_date
	custom_params['last_date'] = last_date
	custom_params['state_by'] = state_by
	custom_params['states'] = states
	temp_result = generateTemplate(params=custom_params)

	for rk in report_keys:
		if rk not in global_check_data:
			global_check_data[rk] = []

		for y in year:
			root_path = '{0}/{1}/year_{2}'.format(crawl_folder, rk, y)
			openDir(root_path, rk)
			# NOTE(review): the whole list is re-scanned for every year. If
			# openDir appends to global_check_data[rk] without clearing it,
			# entries from earlier years are counted again -- confirm.
			for gcd in global_check_data[rk]:
				parts = gcd.split('_')
				date = parts[0]
				state = parts[1]
				if not DateTime.inrange(date, [first_date, last_date]):
					continue
				try:
					temp_result[rk][date][state] += 1
				except (KeyError, TypeError):
					# Best effort: entries whose date/state has no numeric
					# cell in the template are skipped on purpose.
					continue

	result = {}
	for rk in temp_result:
		result[rk] = list(temp_result[rk].values())
		filename = '{0}/{1}_check_moh_empty'.format(test_folder, rk)
		fn.writeExcelFile(filename=filename, data=result[rk])
	global_check_data = {}
	return result
Example #8
0
def run(params):
    """Announce the start by mail, rebuild the schedules, optionally crawl.

    Steps, in order: send the start mail, record the start time, refresh the
    dropdown options, generate the crawl parameters, create the schedules
    and -- when ``schedule_params.start_crawl`` is truthy -- call
    ``Crawl.start``. Finally the end time is recorded and the debug trace
    is shown.

    Args:
        params: Nested structure passed on to the helpers; this function
            itself reads ``schedule_params.start_crawl`` and
            ``schedule_params.check_empty`` (both default ``False``).
    """
    mail_subject = '{0} Crawl - Store starts'.format(
        DateTime.getReadableDate(DateTime.now()))
    mail_body = 'Start at: {0}'.format(DateTime.now(tzinfo=msia_tz))
    Mail.send(mail_subject, mail_body)

    recordTime(key='create_schedule_start_time')
    debug = DebugManager.DebugManager()
    debug.start()

    start_crawl = fn.getNestedElement(params, 'schedule_params.start_crawl',
                                      False)
    # NOTE(review): read here but never used in this function -- confirm
    # whether a check step was meant to be triggered by it.
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty',
                                      False)

    Logger.v('Creating schedule:')
    updateDropdownOptions(params=params)

    crawl_params = generateCrawlParam(params)
    debug.trace('Generate Crawl Params')

    createSchedules({'pages': crawl_params})
    debug.trace('Create Schedule')

    if start_crawl:
        Crawl.start(params)
        debug.trace('crawling')

    recordTime(key='create_schedule_end_time')
    debug.show('Run')
Example #9
0
#!/usr/bin/python3

from Crawl import Main as Crawl
from lib import fn
from lib import DateTime

try:
    start_time = DateTime.now()
    fn.writeTestFile(
        filename='cron_start_{0}'.format(DateTime.now()),
        data='successful cron tab, cron time:{0}'.format(start_time))
    Crawl.run({
        'process': ['schedule', 'crawl',
                    'update'],  # 'schedule', 'crawl', 'update'
        'schedule_params': {
            'start_crawl': False,
            'check_empty': True,
        },
        'interval': 1,  # months between start date - end date 
        'keys': {
            'option': ['state', 'ptj', 'facility', 'facility_type'],
            'report': ['budget', 'procurement'],  # 'budget', 'procurement'
        },
        'filter': {
            'facility_code': False,
        },
    })
    end_time = DateTime.now()
    print('end_time', end_time)
    fn.writeTestFile(
        filename='cron_end_{0}'.format(end_time),
Example #10
0
def _appendUnique(entries, entry):
    """Append ``entry`` to ``entries`` unless an equal item is already there."""
    if entry not in entries:
        entries.append(entry)


def _appendBudgetParams(entries, page_type, years, state_codes,
                        facility_codes, today):
    """Add one budget crawl entry per year/state (and facility, if given).

    ``facility_codes`` is ``None`` when entries are not split by facility.
    """
    for y in years:
        for sc in state_codes:
            if facility_codes is None:
                _appendUnique(entries, {
                    'financial_year': y,
                    'state_code': sc,
                    'page_type': page_type,
                    'upid': '_'.join([sc, y]),
                    'url': api_links[page_type].format(sc, y, ''),
                    'start_date': today,
                    'end_date': today,
                })
                continue
            for fc in facility_codes:
                _appendUnique(entries, {
                    'financial_year': y,
                    'state_code': sc,
                    'page_type': page_type,
                    'upid': '_'.join([sc, y, fc]),
                    'facility_code': fc,
                    'url': api_links[page_type].format(sc, y, fc),
                    'start_date': today,
                    'end_date': today,
                })


def _appendProcurementParams(entries, page_type, durations, state_codes,
                             facility_codes):
    """Add one procurement crawl entry per duration/state (and facility).

    ``facility_codes`` is ``None`` when entries are not split by facility.
    """
    for past_duration in durations:
        # NOTE(review): presumably moves the start one day forward and the
        # end one day back -- confirm against DateTime.getDaysAgo.
        start_date = DateTime.toString(
            DateTime.getDaysAgo(days_to_crawl=-1, datefrom=past_duration[0]))
        end_date = DateTime.toString(
            DateTime.getDaysAgo(days_to_crawl=1, datefrom=past_duration[1]))
        # The URL takes the dates without dashes.
        url_start = start_date.replace('-', '')
        url_end = end_date.replace('-', '')
        for sc in state_codes:
            if facility_codes is None:
                _appendUnique(entries, {
                    'state_code': sc,
                    'start_date': start_date,
                    'end_date': end_date,
                    'page_type': page_type,
                    'upid': '_'.join([sc, start_date, end_date]),
                    'url': api_links[page_type].format(sc, url_start,
                                                       url_end, ''),
                })
                continue
            for fc in facility_codes:
                _appendUnique(entries, {
                    'state_code': sc,
                    'start_date': start_date,
                    'end_date': end_date,
                    'page_type': page_type,
                    'facility_code': fc,
                    'upid': '_'.join([sc, start_date, end_date, fc]),
                    'url': api_links[page_type].format(sc, url_start,
                                                       url_end, fc),
                })


def generateCrawlParam(params):
    """Build the crawl entries for every report key and dump them to Excel.

    For each report key the matching collection is queried: when it holds no
    documents all past durations are used, otherwise only the first one.
    ``'budget'`` entries are generated per year and state, ``'procurement'``
    entries per duration and state; with ``filter.facility_code`` enabled
    both are additionally split by facility code. Any other report key gets
    an empty list. Each list is written to ``<test_folder>/<key>`` via
    ``fn.writeExcelFile``.

    Args:
        params: Nested structure read via ``fn.getNestedElement``:
            ``keys.report`` (default ``['budget', 'procurement']``),
            ``interval`` (default ``1``), ``filter.facility_code`` (default
            ``True``), ``schedule_params.check_empty`` (default ``False``)
            and ``schedule_params.today`` (default: the current ``msia_tz``
            time formatted by ``DateTime.toString``).

    Returns:
        dict: report key -> list of crawl entry dicts (no duplicates).
    """
    debug = DebugManager.DebugManager()
    debug.start()
    db = SharedMemoryManager.getInstance().query()
    # NOTE(review): looks like a test-only cap on the facility codes that is
    # still active -- confirm before relying on full facility coverage.
    limit_for_test = 10

    report_keys = fn.getNestedElement(params, 'keys.report',
                                      ['budget', 'procurement'])
    interval = fn.getNestedElement(params, 'interval', 1)
    filter_facility_code = fn.getNestedElement(params, 'filter.facility_code',
                                               True)
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty',
                                      False)
    today = fn.getNestedElement(
        params, 'schedule_params.today',
        DateTime.toString(DateTime.now(tzinfo=msia_tz)))

    if check_empty:
        # Anchor the past durations at the supplied/derived `today`.
        past_dates = DateTime.getPastDate(
            count=pass_month_quantity,
            duration=interval,
            end=DateTime.convertDateTimeFromString(today))
    else:
        past_dates = DateTime.getPastDate(count=pass_month_quantity,
                                          duration=interval)

    state_codes = retrieveOption(collection_name='state',
                                 show_keys=['state_code'],
                                 hide_keys=['_id'])
    state_code = extractListByKey(data=state_codes, key='state_code')
    facility_codes = retrieveOption(collection_name='facility',
                                    show_keys=['facility_code'],
                                    hide_keys=['_id'])
    facility_code = extractListByKey(data=facility_codes, key='facility_code')
    # None means "do not split by facility".
    facility_subset = (facility_code[:limit_for_test]
                       if filter_facility_code else None)

    crawl_params = {}
    for key in report_keys:
        debug.trace()
        entries = crawl_params.setdefault(key, [])
        mongo_data = list(db[key].find({}, {}))

        # Empty collection: use every past duration; otherwise only the first.
        dates = past_dates[0][:1] if mongo_data else past_dates[0][:]
        year = extractYear(data=dates)

        if key == 'budget':
            _appendBudgetParams(entries, key, year, state_code,
                                facility_subset, today)
        elif key == 'procurement':
            _appendProcurementParams(entries, key, dates, state_code,
                                     facility_subset)

    for c in crawl_params:
        fn.writeExcelFile(filename='{0}/{1}'.format(test_folder, c),
                          data=crawl_params[c])
    Logger.v('crawl_params', len(crawl_params))
    debug.show('Generate Crawl Params')
    return crawl_params