Example #1
0
def save(params, chunk, chunks_info):
    """Insert one chunk of stock rows into the latest-stock collection.

    The rows of ``chunk`` are read through ``File.readChunkData``. The queue
    counter ``chunks_info['queue']['current']`` is incremented and a progress
    bar is printed. Then, for every row, the stock-integrity model is
    updated, a bulk insert into ``latest_collection_name`` is queued and the
    item record is saved. The queued inserts are flushed once at the end.

    Args:
        params: Request parameters; ``date`` is read from it and converted
            with ``DateTime.convertDateTimeFromString``.
        chunk: Chunk reference handed to ``File.readChunkData``.
        chunks_info: Progress bookkeeping. Must contain ``queue`` with the
            keys ``current`` and ``total``.

    Returns:
        The ``chunks_info`` object that was passed in.
    """
    global latest_collection_name, history_collection_name

    # Rows are assumed to form a list-like sequence — TODO confirm against
    # File.readChunkData.
    rows = File.readChunkData(chunk)
    dbManager = SharedMemoryManager.getInstance()
    # NOTE(review): the returned handle is not used here; the call is kept in
    # case query() is what establishes the connection — confirm and drop.
    dbManager.query()

    record_date = DateTime.convertDateTimeFromString(
        fn.getNestedElement(params, 'date'))

    queue_info = chunks_info['queue']
    # Deliberately re-reads chunks_info['queue'] instead of using queue_info,
    # exactly as before, so behaviour is unchanged if chunks_info is a proxy
    # object that returns copies.
    chunks_info['queue']['current'] += 1
    fn.printProgressBar(queue_info['current'], queue_info['total'],
                        'Processing Chunk Insertion')

    for row in rows:
        # Latest-stock record.
        obj_ = transformToLowercase(data=row, datetime=record_date)
        ModelStockIntegrity.update(data=obj_)
        dbManager.addBulkInsert(latest_collection_name, obj_, batch=True)
        # NOTE: inserts into history_collection_name are temporarily off
        # (only 7 days of data are needed).

        # Item record.
        ModelItem.saveItem(row)

    # Flush the batched inserts so that all data is saved.
    dbManager.executeBulkOperations(latest_collection_name)
    return chunks_info
Example #2
0
def getMonthRange(params):
    """Return a list of consecutive months beginning at ``start_month``.

    Args:
        params: Nested parameters. ``start_month`` becomes the first entry
            verbatim (presumably a ``YYYY-MM`` string — confirm against
            callers); ``number_of_month`` is the total number of entries
            wanted and defaults to 1.

    Returns:
        A list whose first element is ``start_month``, followed by the
        ``year_month_digit`` name of each following month as produced by
        ``DateTime.getDateCategoryName``.
    """
    first_month = fn.getNestedElement(params, 'start_month')
    month_total = fn.getNestedElement(params, 'number_of_month', 1)

    months = [first_month]
    cursor = '{0}-01'.format(first_month)
    # The start month is already in the list, so one fewer step is needed.
    for _ in range(month_total - 1):
        cursor = DateTime.getNextMonth(
            DateTime.convertDateTimeFromString(cursor))
        months.append(
            DateTime.getDateCategoryName(cursor, element='year_month_digit'))
    return months
Example #3
0
def getCollectionName(params):
    """Set the module-level ``latest_collection_name`` for the requested date.

    The name is first reset to ``'stock_latest'``. One document is then read
    from that collection. If it exists and ``params`` carries a ``date``, the
    whole-day difference between the document's date and the requested date
    is computed. A positive difference switches the name to
    ``'stock_<days>'``.

    Args:
        params: Nested parameters; ``date`` is optional.

    Returns:
        None. The result is published through the global
        ``latest_collection_name``.
    """
    global latest_collection_name
    latest_collection_name = 'stock_latest'  # default

    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    projection = {'_id': 0, 'date': 1}
    sample = list(db[latest_collection_name].find({}, projection).limit(1))
    if not sample:
        return

    # Round-trip through the string form, as the stored value is converted
    # with DateTime.toString before being parsed again.
    latest_date = DateTime.convertDateTimeFromString(
        DateTime.toString(sample[0]['date']))

    requested = fn.getNestedElement(params, 'date', None)
    if not requested:
        return

    elapsed = latest_date - DateTime.convertDateTimeFromString(requested)
    day_diff = math.floor(elapsed.total_seconds() / 86400.0)
    if day_diff > 0:
        latest_collection_name = 'stock_{0}'.format(day_diff)
Example #4
0
def getMissingDates(data):
    """Work out, per report key, which dates still have to be crawled.

    For every key of ``data`` the rows are passed to ``groupDates`` together
    with the state list read from the ``state`` collection. Each date listed
    under ``'missing'`` is mapped to the day before the start of the next
    month. If ``DateTime.getDifferenceBetweenDuration([today, that_day])`` is
    non-negative, today's date is used instead (presumably because the month
    has not ended yet — confirm against that helper). The resulting date
    string is kept unless it already appears under ``'crawled'``.

    Args:
        data: Mapping of report key to the rows handed to ``groupDates``.

    Returns:
        A dict mapping each report key to its de-duplicated date strings,
        sorted in descending order.
    """
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    today = DateTime.now(tzinfo=msia_tz)  # date only
    state_by = 'state_code'
    states = list(db['state'].find({}, {'_id': 0, state_by: 1}))

    missing_dates = {}
    for rk in data:
        dates = groupDates(
            params={'states': states, 'state_by': state_by}, data=data[rk])

        # A set removes duplicates as they are produced; several missing
        # dates of the same month map to the same target date.
        pending = set()
        for date in dates['missing']:
            end_date_of_month = DateTime.getDaysAgo(
                days_to_crawl=1,
                datefrom=DateTime.getNextMonth(
                    DateTime.convertDateTimeFromString(date)))
            day_diff = DateTime.getDifferenceBetweenDuration(
                [today, end_date_of_month])

            target = today if day_diff >= 0 else end_date_of_month
            date_str = DateTime.toString(target)
            if date_str not in dates['crawled']:
                pending.add(date_str)

        missing_dates[rk] = sorted(pending, reverse=True)
    return missing_dates
Example #5
0
def generateTemplate(params):
    """Build a zero-filled per-report, per-date, per-state template.

    For each report key and each date returned by ``DateTime.getBetween``
    between ``first_date`` and ``last_date``, the date is mapped to the day
    before the start of the next month. If
    ``DateTime.getDifferenceBetweenDuration([today, that_day])`` is
    non-negative, today's date is used instead. The entry for the resulting
    date string holds ``'date'`` plus one ``0`` counter per state.

    Args:
        params: Nested parameters. Reads ``keys.report`` (defaults to
            ``['procurement', 'budget']``), ``first_date``, ``last_date``,
            ``state_by`` (the key that identifies a state inside each state
            row) and ``states`` (an iterable of state rows).

    Returns:
        ``{report_key: {date_str: {'date': date_str, <state>: 0, ...}}}``.
    """
    result = {}
    report_keys = fn.getNestedElement(
        params, 'keys.report', ['procurement', 'budget'])
    first_date = fn.getNestedElement(params, 'first_date')
    last_date = fn.getNestedElement(params, 'last_date')
    state_by = fn.getNestedElement(params, 'state_by')
    states = fn.getNestedElement(params, 'states')
    today = DateTime.now(tzinfo=msia_tz)  # date only

    for rk in report_keys:
        per_report = result.setdefault(rk, {})

        date_order = DateTime.getBetween(
            [first_date, last_date], element='date')['order']
        for date in date_order:
            end_date_of_month = DateTime.getDaysAgo(
                days_to_crawl=1,
                datefrom=DateTime.getNextMonth(
                    DateTime.convertDateTimeFromString(date)))
            day_diff = DateTime.getDifferenceBetweenDuration(
                [today, end_date_of_month])

            target = today if day_diff >= 0 else end_date_of_month
            date_str = DateTime.toString(target)

            # Several dates can map to the same date_str; the entry is
            # reused and its counters are simply reset to 0 again.
            entry = per_report.setdefault(date_str, {})
            entry['date'] = date_str
            for state_row in states:
                entry[state_row[state_by]] = 0
    return result
Example #6
0
def generateCrawlParam(params):
    """Build the crawl requests for every report type.

    For each key in ``keys.report`` (default ``['budget', 'procurement']``)
    the collection ``db[key]`` is probed. If it is empty, every duration in
    ``past_dates[0]`` is used; otherwise only the first one is used.

    * ``budget``: one request per (year, state code), where the years come
      from ``extractYear``; ``start_date`` and ``end_date`` are both
      ``today``.
    * ``procurement``: one request per (duration, state code), with the
      duration bounds shifted through ``DateTime.getDaysAgo``.

    When ``filter.facility_code`` is truthy (the default), each request is
    further split per facility code, capped at ``limit_for_test`` codes.

    Each per-key list is also written out with ``fn.writeExcelFile`` under
    ``test_folder``.

    Args:
        params: Nested parameters. Reads ``keys.report``, ``interval``
            (default 1), ``filter.facility_code`` (default True),
            ``schedule_params.check_empty`` (default False) and
            ``schedule_params.today`` (default: the current date string).

    Returns:
        A dict mapping each report key to its list of unique request dicts.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()
    global pass_month_quantity
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    crawl_params = {}
    # NOTE(review): caps the facility codes at 10; the name suggests a
    # leftover test limit — confirm before relying on full coverage.
    limit_for_test = 10
    report_keys = fn.getNestedElement(params, 'keys.report',
                                      ['budget', 'procurement'])
    interval = fn.getNestedElement(params, 'interval', 1)
    filter_facility_code = fn.getNestedElement(params, 'filter.facility_code',
                                               True)
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty',
                                      False)
    today = fn.getNestedElement(
        params, 'schedule_params.today',
        DateTime.toString(DateTime.now(tzinfo=msia_tz)))

    if check_empty:
        # Anchor the look-back window at the scheduled "today".
        past_dates = DateTime.getPastDate(
            count=pass_month_quantity,
            duration=interval,
            end=DateTime.convertDateTimeFromString(today))
    else:
        past_dates = DateTime.getPastDate(count=pass_month_quantity,
                                          duration=interval)

    state_codes = retrieveOption(collection_name='state',
                                 show_keys=['state_code'],
                                 hide_keys=['_id'])
    state_code = extractListByKey(data=state_codes, key='state_code')
    facility_codes = retrieveOption(collection_name='facility',
                                    show_keys=['facility_code'],
                                    hide_keys=['_id'])
    facility_code = extractListByKey(data=facility_codes, key='facility_code')

    # None tells the builders not to split requests by facility.
    if filter_facility_code:
        facility_subset = facility_code[:limit_for_test]
    else:
        facility_subset = None

    for key in report_keys:
        Debug.trace()
        if key not in crawl_params:
            crawl_params[key] = []

        # Only emptiness matters, so fetch at most one document instead of
        # materialising the whole collection.
        has_existing = bool(list(db[key].find({}, {}).limit(1)))
        if has_existing:
            dates = past_dates[0][:1]
        else:
            dates = past_dates[0][:]

        if key == 'budget':
            year = extractYear(data=dates)
            candidates = _buildBudgetCrawlParams(
                key, year, state_code, today, facility_subset)
        elif key == 'procurement':
            candidates = _buildProcurementCrawlParams(
                key, dates, state_code, facility_subset)
        else:
            candidates = []

        for obj_ in candidates:
            if obj_ not in crawl_params[key]:
                crawl_params[key].append(obj_)

    for c in crawl_params:
        fn.writeExcelFile(filename='{0}/{1}'.format(test_folder, c),
                          data=crawl_params[c])
    Logger.v('crawl_params', len(crawl_params))
    Debug.show('Generate Crawl Params')
    return crawl_params


def _buildBudgetCrawlParams(key, years, state_codes, today,
                            facility_codes=None):
    """Return budget request dicts per (year, state[, facility])."""
    built = []
    for y in years:
        for sc in state_codes:
            if facility_codes is None:
                built.append({
                    'financial_year': y,
                    'state_code': sc,
                    'page_type': key,
                    'upid': '_'.join([sc, y]),
                    'url': api_links[key].format(sc, y, ''),
                    'start_date': today,
                    'end_date': today,
                })
                continue
            for fc in facility_codes:
                built.append({
                    'financial_year': y,
                    'state_code': sc,
                    'page_type': key,
                    'upid': '_'.join([sc, y, fc]),
                    'facility_code': fc,
                    'url': api_links[key].format(sc, y, fc),
                    'start_date': today,
                    'end_date': today,
                })
    return built


def _buildProcurementCrawlParams(key, durations, state_codes,
                                 facility_codes=None):
    """Return procurement request dicts per (duration, state[, facility])."""
    built = []
    for past_duration in durations:
        start_date = DateTime.toString(
            DateTime.getDaysAgo(days_to_crawl=-1, datefrom=past_duration[0]))
        end_date = DateTime.toString(
            DateTime.getDaysAgo(days_to_crawl=1, datefrom=past_duration[1]))
        # The URL takes the dates without dashes.
        url_start = start_date.replace('-', '')
        url_end = end_date.replace('-', '')
        for sc in state_codes:
            if facility_codes is None:
                built.append({
                    'state_code': sc,
                    'start_date': start_date,
                    'end_date': end_date,
                    'page_type': key,
                    'upid': '_'.join([sc, start_date, end_date]),
                    'url': api_links[key].format(sc, url_start, url_end, ''),
                })
                continue
            for fc in facility_codes:
                built.append({
                    'state_code': sc,
                    'start_date': start_date,
                    'end_date': end_date,
                    'page_type': key,
                    'facility_code': fc,
                    'upid': '_'.join([sc, start_date, end_date, fc]),
                    'url': api_links[key].format(sc, url_start, url_end, fc),
                })
    return built